27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
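/* First row: byte-shuffle pattern for the 8-pixel-wide cases (sliding
 * pairs of neighbouring bytes from one source vector); second row: the
 * 4-pixel-wide cases, where indices 16 and up select from the second
 * source operand of VSHF.B so two 4-pixel rows can share one register. */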
32 #define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
35 v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \
38 ILVRL_H2_SW(zero, in0_h, in0_r_m, in0_l_m); \
39 ILVRL_H2_SW(zero, in1_h, in1_r_m, in1_l_m); \
40 MUL4(in0_r_m, wgt_w, in0_l_m, wgt_w, in1_r_m, wgt_w, in1_l_m, wgt_w, \
41 in0_r_m, in0_l_m, in1_r_m, in1_l_m); \
42 SRAR_W4_SW(in0_r_m, in0_l_m, in1_r_m, in1_l_m, rnd_w); \
43 ADD4(in0_r_m, offset_h, in0_l_m, offset_h, in1_r_m, offset_h, in1_l_m, \
44 offset_h, in0_r_m, in0_l_m, in1_r_m, in1_l_m); \
45 CLIP_SW4_0_255(in0_r_m, in0_l_m, in1_r_m, in1_l_m); \
46 PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \
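/* Scalar sketch of one lane of the macro above (illustrative only; the
 * helper name is hypothetical and the semantics are read off the MSA
 * macros): widen to 32 bits, weight, round-shift, add offset, clamp. */
static inline int32_t hevc_uniwgt_rnd_clip_px(int32_t in, int32_t weight,
                                              int32_t offset, int32_t rnd)
{
    int32_t val = (in * weight + (1 << (rnd - 1))) >> rnd; /* MUL4 + SRAR_W4_SW */

    val += offset;                                /* ADD4           */
    return val < 0 ? 0 : (val > 255 ? 255 : val); /* CLIP_SW4_0_255 */
}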
49 #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \
50 offset_h, rnd_w, out0_h, out1_h, \
53 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
55 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w, \
59 #define HEVC_FILT_8TAP_4W_SH(in0, in1, in2, in3, filt0, filt1, \
60 filt2, filt3, dst0, dst1) \
62 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7; \
63 ILVRL_B2_SH(zero, in0, tmp0, tmp4); \
64 ILVRL_B2_SH(zero, in1, tmp1, tmp5); \
65 ILVRL_B2_SH(zero, in2, tmp2, tmp6); \
66 ILVRL_B2_SH(zero, in3, tmp3, tmp7); \
67 dst0 = __msa_dotp_s_w((v8i16) tmp0, (v8i16) filt0); \
68 dst1 = __msa_dotp_s_w((v8i16) tmp4, (v8i16) filt0); \
69 DPADD_SH2_SW(tmp1, tmp5, filt1, filt1, dst0, dst1); \
70 DPADD_SH2_SW(tmp2, tmp6, filt2, filt2, dst0, dst1); \
71 DPADD_SH2_SW(tmp3, tmp7, filt3, filt3, dst0, dst1); \
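/* The macro above evaluates an 8-tap FIR as one dotp_s.w plus three
 * dpadd_s.w steps; filt0..filt3 each carry one pair of adjacent taps.
 * Conceptually, per output sample (hedged scalar sketch, hypothetical
 * helper name): */
static inline int32_t hevc_8tap_ref_px(const uint8_t *in, const int8_t *filt)
{
    int32_t k, sum = 0;

    for (k = 0; k < 8; k++)        /* accumulate in 32 bits; rounding and */
        sum += in[k] * filt[k];    /* clipping happen in a later stage    */
    return sum;
}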
83 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
87 v8i16 dst0, dst1, dst2, dst3, offset_vec;
88 v4i32 weight_vec, rnd_vec;
90 weight_vec = __msa_fill_w(weight);
91 offset_vec = __msa_fill_w(offset);
92 rnd_vec = __msa_fill_w(rnd_val);
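/* weight, offset and rnd_val are invariant for the whole call, so they are
 * splatted into vector registers once up front; every function below
 * repeats this pattern. */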
97 LW2(src, src_stride, tp0, tp1);
99 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
103 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
105 dst0_r += offset_vec;
106 dst0_l += offset_vec;
108 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
109 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
110 ST_W2(out0, 0, 1, dst, dst_stride);
112 LW4(src, src_stride, tp0, tp1, tp2, tp3);
117 rnd_vec, dst0, dst1);
118 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
119 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
120 } else if (0 == (height % 8)) {
121 for (loop_cnt = (height >> 3); loop_cnt--;) {
122 LW4(src, src_stride, tp0, tp1, tp2, tp3);
123 src += 4 * src_stride;
125 LW4(src, src_stride, tp0, tp1, tp2, tp3);
126 src += 4 * src_stride;
130 SLLI_4V(dst0, dst1, dst2, dst3, 6);
132 offset_vec, rnd_vec, dst0, dst1,
135 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
136 dst += 8 * dst_stride;
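/* The 6-pixel-wide path below has no native store width: each row goes out
 * as a 4-byte ST_W plus a 2-byte ST_H at dst + 4. */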
152 uint64_t tp0, tp1, tp2, tp3;
154 v16u8 out0, out1, out2, out3;
156 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
157 v4i32 weight_vec, rnd_vec;
159 weight_vec = __msa_fill_w(weight);
160 offset_vec = __msa_fill_w(offset);
161 rnd_vec = __msa_fill_w(rnd_val);
163 for (loop_cnt = (height >> 3); loop_cnt--;) {
164 LD4(src, src_stride, tp0, tp1, tp2, tp3);
165 src += (4 * src_stride);
168 LD4(src, src_stride, tp0, tp1, tp2, tp3);
169 src += (4 * src_stride);
178 SLLI_4V(dst0, dst1, dst2, dst3, 6);
179 SLLI_4V(dst4, dst5, dst6, dst7, 6);
182 offset_vec, rnd_vec, dst0, dst1, dst2,
185 offset_vec, rnd_vec, dst4, dst5, dst6,
190 ST_W2(out0, 0, 2, dst, dst_stride);
191 ST_H2(out0, 2, 6, dst + 4, dst_stride);
192 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
193 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
194 dst += (4 * dst_stride);
195 ST_W2(out2, 0, 2, dst, dst_stride);
196 ST_H2(out2, 2, 6, dst + 4, dst_stride);
197 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
198 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
199 dst += (4 * dst_stride);
202 LD4(src, src_stride, tp0, tp1, tp2, tp3);
203 src += (4 * src_stride);
206 LD4(src, src_stride, tp0, tp1, tp2, tp3);
207 src += (4 * src_stride);
216 SLLI_4V(dst0, dst1, dst2, dst3, 6);
217 SLLI_4V(dst4, dst5, dst6, dst7, 6);
220 offset_vec, rnd_vec, dst0, dst1, dst2,
223 offset_vec, rnd_vec, dst4, dst5, dst6,
229 ST_W2(out0, 0, 2, dst, dst_stride);
230 ST_H2(out0, 2, 6, dst + 4, dst_stride);
231 } else if (res == 4) {
232 ST_W2(out0, 0, 2, dst, dst_stride);
233 ST_H2(out0, 2, 6, dst + 4, dst_stride);
234 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
235 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
237 ST_W2(out0, 0, 2, dst, dst_stride);
238 ST_H2(out0, 2, 6, dst + 4, dst_stride);
239 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
240 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
241 dst += (4 * dst_stride);
242 ST_W2(out2, 0, 2, dst, dst_stride);
243 ST_H2(out2, 2, 6, dst + 4, dst_stride);
258 uint64_t tp0, tp1, tp2, tp3;
259 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
261 v16u8 out0, out1, out2, out3;
262 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
263 v4i32 weight_vec, rnd_vec;
265 weight_vec = __msa_fill_w(weight);
266 offset_vec = __msa_fill_w(offset);
267 rnd_vec = __msa_fill_w(rnd_val);
270 LD2(src, src_stride, tp0, tp1);
275 rnd_vec, dst0, dst1);
276 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
277 ST_D2(out0, 0, 1, dst, dst_stride);
279 LD4(src, src_stride, tp0, tp1, tp2, tp3);
284 SLLI_4V(dst0, dst1, dst2, dst3, 6);
286 offset_vec, rnd_vec, dst0, dst1, dst2,
289 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
291 LD4(src, src_stride, tp0, tp1, tp2, tp3);
292 src += 4 * src_stride;
295 LD2(src, src_stride, tp0, tp1);
300 SLLI_4V(dst0, dst1, dst2, dst3, 6);
303 offset_vec, rnd_vec, dst0, dst1, dst2,
306 rnd_vec, dst4, dst5);
307 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
308 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
309 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
310 } else if (0 == height % 8) {
311 for (loop_cnt = (height >> 3); loop_cnt--;) {
312 LD4(src, src_stride, tp0, tp1, tp2, tp3);
313 src += 4 * src_stride;
316 LD4(src, src_stride, tp0, tp1, tp2, tp3);
317 src += 4 * src_stride;
325 SLLI_4V(dst0, dst1, dst2, dst3, 6);
326 SLLI_4V(dst4, dst5, dst6, dst7, 6);
328 offset_vec, rnd_vec, dst0, dst1,
331 offset_vec, rnd_vec, dst4, dst5,
335 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
337 dst += (8 * dst_stride);
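/* 12-wide rows are likewise split for the stores: the left 8 bytes with
 * ST_D4 and the remaining 4 with ST_W4 at dst + 8. */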
352 v16u8 out0, out1, out2;
354 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
357 v4i32 weight_vec, rnd_vec;
359 weight_vec = __msa_fill_w(weight);
360 offset_vec = __msa_fill_w(offset);
361 rnd_vec = __msa_fill_w(rnd_val);
363 for (loop_cnt = 4; loop_cnt--;) {
365 src += (4 * src_stride);
367 dst0, dst1, dst2, dst3);
371 SLLI_4V(dst0, dst1, dst2, dst3, 6);
374 offset_vec, rnd_vec, dst0, dst1, dst2,
377 rnd_vec, dst4, dst5);
379 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
380 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
381 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
382 dst += (4 * dst_stride);
396 v16u8 out0, out1, out2, out3;
399 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
400 v4i32 weight_vec, rnd_vec;
402 weight_vec = __msa_fill_w(weight);
403 offset_vec = __msa_fill_w(offset);
404 rnd_vec = __msa_fill_w(rnd_val);
406 for (loop_cnt = height >> 2; loop_cnt--;) {
408 src += (4 * src_stride);
413 SLLI_4V(dst0, dst1, dst2, dst3, 6);
414 SLLI_4V(dst4, dst5, dst6, dst7, 6);
416 offset_vec, rnd_vec, dst0, dst1, dst2,
419 offset_vec, rnd_vec, dst4, dst5, dst6,
423 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
424 dst += (4 * dst_stride);
438 v16u8 out0, out1, out2, out3, out4, out5;
441 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
442 v8i16 dst8, dst9, dst10, dst11;
443 v4i32 weight_vec, rnd_vec;
445 weight_vec = __msa_fill_w(weight);
446 offset_vec = __msa_fill_w(offset);
447 rnd_vec = __msa_fill_w(rnd_val);
449 for (loop_cnt = (height >> 2); loop_cnt--;) {
452 src += (4 * src_stride);
460 SLLI_4V(dst0, dst1, dst2, dst3, 6);
461 SLLI_4V(dst4, dst5, dst6, dst7, 6);
462 SLLI_4V(dst8, dst9, dst10, dst11, 6);
464 offset_vec, rnd_vec, dst0, dst1, dst2,
467 offset_vec, rnd_vec, dst4, dst5, dst6,
470 offset_vec, rnd_vec, dst8, dst9, dst10,
472 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
473 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
474 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
475 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
476 dst += (4 * dst_stride);
490 v16u8 out0, out1, out2, out3;
493 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
494 v4i32 weight_vec, rnd_vec;
496 weight_vec = __msa_fill_w(weight);
497 offset_vec = __msa_fill_w(offset);
498 rnd_vec = __msa_fill_w(rnd_val);
500 for (loop_cnt = (height >> 1); loop_cnt--;) {
503 src += (2 * src_stride);
509 SLLI_4V(dst0, dst1, dst2, dst3, 6);
510 SLLI_4V(dst4, dst5, dst6, dst7, 6);
512 offset_vec, rnd_vec, dst0, dst1, dst2,
515 offset_vec, rnd_vec, dst4, dst5, dst6,
519 ST_UB2(out0, out1, dst, dst_stride);
520 ST_UB2(out2, out3, dst + 16, dst_stride);
521 dst += (2 * dst_stride);
535 v16u8 out0, out1, out2, out3, out4, out5;
538 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
539 v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
540 v4i32 weight_vec, rnd_vec;
542 weight_vec = __msa_fill_w(weight);
543 offset_vec = __msa_fill_w(offset);
544 rnd_vec = __msa_fill_w(rnd_val);
546 for (loop_cnt = (height >> 1); loop_cnt--;) {
558 SLLI_4V(dst0, dst1, dst2, dst3, 6);
559 SLLI_4V(dst4, dst5, dst6, dst7, 6);
560 SLLI_4V(dst8, dst9, dst10, dst11, 6);
562 offset_vec, rnd_vec, dst0, dst1, dst2,
565 offset_vec, rnd_vec, dst4, dst5, dst6,
568 offset_vec, rnd_vec, dst8, dst9, dst10,
570 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
571 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
572 ST_UB2(out0, out1, dst, 16);
573 ST_UB(out2, dst + 32);
575 ST_UB2(out3, out4, dst, 16);
576 ST_UB(out5, dst + 32);
591 v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
594 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
595 v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
596 v4i32 weight_vec, rnd_vec;
598 weight_vec = __msa_fill_w(weight);
599 offset_vec = __msa_fill_w(offset);
600 rnd_vec = __msa_fill_w(rnd_val);
602 for (loop_cnt = (height >> 1); loop_cnt--;) {
616 SLLI_4V(dst0, dst1, dst2, dst3, 6);
617 SLLI_4V(dst4, dst5, dst6, dst7, 6);
618 SLLI_4V(dst8, dst9, dst10, dst11, 6);
619 SLLI_4V(dst12, dst13, dst14, dst15, 6);
621 offset_vec, rnd_vec, dst0, dst1, dst2,
624 offset_vec, rnd_vec, dst4, dst5, dst6,
627 offset_vec, rnd_vec, dst8, dst9, dst10,
630 offset_vec, rnd_vec, dst12, dst13, dst14,
635 PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
636 ST_UB4(out0, out1, out2, out3, dst, 16);
638 ST_UB4(out4, out5, out6, out7, dst, 16);
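/* The functions below apply the 8-tap filter horizontally: VSHF_B4 gathers
 * the sliding byte windows according to the mask array at the top of the
 * file, the taps are applied as dot products, and the weight/round/offset/
 * clip stage mirrors the copy paths above. */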
654 uint32_t res = height & 0x07;
657 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
658 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
659 v16i8 mask0, mask1, mask2, mask3;
660 v8i16 filter_vec, filt0, filt1, filt2, filt3;
661 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
662 v4i32 weight_vec, rnd_vec, offset_vec;
667 weight_vec = __msa_fill_w(weight);
668 rnd_vec = __msa_fill_w(rnd_val);
669 offset_vec = __msa_fill_w(offset);
680 for (loop_cnt = (height >> 3); loop_cnt--;) {
682 src += (8 * src_stride);
684 vec0, vec1, vec2, vec3);
686 vec4, vec5, vec6, vec7);
687 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
688 vec8, vec9, vec10, vec11);
689 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
690 vec12, vec13, vec14, vec15);
701 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
702 weight_vec, dst0, dst1, dst2, dst3)
703 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
704 weight_vec, dst4, dst5, dst6, dst7);
707 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
708 offset_vec, dst0, dst1, dst2, dst3);
709 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
710 offset_vec, dst4, dst5, dst6, dst7);
712 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
713 vec0, vec1, vec2, vec3);
716 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
717 dst += (8 * dst_stride);
722 vec0, vec1, vec2, vec3);
724 vec4, vec5, vec6, vec7);
725 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
726 vec8, vec9, vec10, vec11);
727 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
728 vec12, vec13, vec14, vec15);
739 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
740 weight_vec, dst0, dst1, dst2, dst3)
741 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
742 weight_vec, dst4, dst5, dst6, dst7);
745 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
746 offset_vec, dst0, dst1, dst2, dst3);
747 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
748 offset_vec, dst4, dst5, dst6, dst7);
750 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
751 vec0, vec1, vec2, vec3);
755 ST_W2(out0, 0, 1, dst, dst_stride);
756 } else if (res == 4) {
757 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
759 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
760 ST_W2(out1, 0, 1, dst + 4 * dst_stride, dst_stride);
776 uint32_t res = height & 0x03;
779 v8i16 filt0, filt1, filt2, filt3;
780 v16i8 mask0, mask1, mask2, mask3;
782 v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
783 v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
784 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
785 v4i32 weight_vec, rnd_vec, offset_vec;
790 weight_vec = __msa_fill_w(weight);
791 rnd_vec = __msa_fill_w(rnd_val);
792 offset_vec = __msa_fill_w(offset);
803 for (loop_cnt = (height >> 2); loop_cnt--;) {
805 src += (4 * src_stride);
808 vec0, vec1, vec2, vec3);
810 vec4, vec5, vec6, vec7);
812 vec8, vec9, vec10, vec11);
813 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
814 vec12, vec13, vec14, vec15);
824 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
825 weight_vec, dst0, dst1, dst2, dst3)
826 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
827 weight_vec, dst4, dst5, dst6, dst7);
830 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
831 offset_vec, dst0, dst1, dst2, dst3);
832 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
833 offset_vec, dst4, dst5, dst6, dst7);
835 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
836 vec0, vec1, vec2, vec3);
838 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
839 dst += (4 * dst_stride);
845 vec0, vec1, vec2, vec3);
847 vec4, vec5, vec6, vec7);
853 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
854 weight_vec, dst0, dst1, dst2, dst3)
856 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
857 offset_vec, dst0, dst1, dst2, dst3);
860 out0 = __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
861 ST_D2(out0, 0, 1, dst, dst_stride);
877 v8i16 filt0, filt1, filt2, filt3;
879 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
880 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
882 v4i32 dst0, dst1, dst2, dst3;
884 v4i32 weight_vec, rnd_vec, offset_vec;
889 weight_vec = __msa_fill_w(weight);
890 rnd_vec = __msa_fill_w(rnd_val);
891 offset_vec = __msa_fill_w(offset);
906 for (loop_cnt = (height >> 1); loop_cnt--;) {
909 src += (2 * src_stride);
912 vec0, vec1, vec2, vec3);
914 vec4, vec5, vec6, vec7);
920 vec0, vec1, vec2, vec3);
922 filt3, dst00, dst01);
924 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
925 weight_vec, dst0, dst1, dst2, dst3)
926 MUL2(dst00, weight_vec, dst01, weight_vec, dst00, dst01);
929 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
930 offset_vec, dst0, dst1, dst2, dst3);
931 ADD2(dst00, offset_vec, dst01, offset_vec, dst00, dst01);
935 vec2 = __msa_pckev_h((v8i16) dst01, (v8i16) dst00);
938 ST_D2(out0, 0, 1, dst, dst_stride);
939 ST_W2(out1, 0, 1, dst + 8, dst_stride);
940 dst += (2 * dst_stride);
957 v8i16 filt0, filt1, filt2, filt3;
958 v16i8 mask0, mask1, mask2, mask3;
960 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
961 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
962 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
963 v4i32 weight_vec, rnd_vec, offset_vec;
968 weight_vec = __msa_fill_w(weight);
969 rnd_vec = __msa_fill_w(rnd_val);
970 offset_vec = __msa_fill_w(offset);
981 for (loop_cnt = (height >> 1); loop_cnt--;) {
984 src += (2 * src_stride);
987 vec0, vec1, vec2, vec3);
989 vec4, vec5, vec6, vec7);
991 vec8, vec9, vec10, vec11);
992 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
993 vec12, vec13, vec14, vec15);
1003 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1004 weight_vec, dst0, dst1, dst2, dst3)
1005 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1006 weight_vec, dst4, dst5, dst6, dst7);
1009 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1010 offset_vec, dst0, dst1, dst2, dst3);
1011 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1012 offset_vec, dst4, dst5, dst6, dst7);
1014 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1015 vec0, vec1, vec2, vec3);
1018 ST_UB2(out0, out1, dst, dst_stride);
1019 dst += (2 * dst_stride);
1034 v16u8 out0, out1, out2;
1036 v8i16 filt0, filt1, filt2, filt3;
1037 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1038 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1039 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1040 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1041 v4i32 dst8, dst9, dst10, dst11;
1043 v4i32 weight_vec, rnd_vec, offset_vec;
1048 weight_vec = __msa_fill_w(weight);
1049 rnd_vec = __msa_fill_w(rnd_val);
1050 offset_vec = __msa_fill_w(offset);
1065 for (loop_cnt = 16; loop_cnt--;) {
1071 vec0, vec1, vec2, vec3);
1073 vec4, vec5, vec6, vec7);
1075 vec8, vec9, vec10, vec11);
1077 vec12, vec13, vec14, vec15);
1088 vec0, vec1, vec2, vec3);
1089 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1090 vec4, vec5, vec6, vec7);
1094 filt3, dst10, dst11);
1096 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1097 weight_vec, dst0, dst1, dst2, dst3)
1098 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1099 weight_vec, dst4, dst5, dst6, dst7);
1100 MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
1101 weight_vec, dst8, dst9, dst10, dst11)
1104 SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
1105 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1106 offset_vec, dst0, dst1, dst2, dst3);
1107 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1108 offset_vec, dst4, dst5, dst6, dst7);
1109 ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
1110 offset_vec, dst8, dst9, dst10, dst11);
1113 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1114 vec0, vec1, vec2, vec3);
1115 PCKEV_H2_SH(dst9, dst8, dst11, dst10, vec4, vec5);
1117 PCKEV_B3_UB(vec1, vec0, vec4, vec3, vec5, vec2, out0, out1, out2);
1118 ST_UB2(out0, out1, dst, dst_stride);
1119 ST_D2(out2, 0, 1, dst + 16, dst_stride);
1120 dst += (2 * dst_stride);
1135 v16u8 out0, out1, out2, out3;
1137 v8i16 filt0, filt1, filt2, filt3;
1138 v16i8 mask0, mask1, mask2, mask3;
1139 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1140 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1142 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1143 v4i32 dst10, dst11, dst12, dst13, dst14, dst15, dst16, dst17;
1144 v4i32 weight_vec, rnd_vec, offset_vec;
1149 weight_vec = __msa_fill_w(weight);
1150 rnd_vec = __msa_fill_w(rnd_val);
1151 offset_vec = __msa_fill_w(offset);
1162 for (loop_cnt = height >> 1; loop_cnt--;) {
1169 vec0, vec1, vec2, vec3);
1171 vec4, vec5, vec6, vec7);
1173 vec8, vec9, vec10, vec11);
1174 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1175 vec12, vec13, vec14, vec15);
1185 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1186 vec0, vec1, vec2, vec3);
1187 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1188 vec4, vec5, vec6, vec7);
1189 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1190 vec8, vec9, vec10, vec11);
1191 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1192 vec12, vec13, vec14, vec15);
1194 filt3, dst10, dst11);
1196 filt3, dst12, dst13);
1198 filt3, dst14, dst15);
1200 filt3, dst16, dst17);
1202 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1203 weight_vec, dst0, dst1, dst2, dst3)
1204 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1205 weight_vec, dst4, dst5, dst6, dst7);
1206 MUL4(dst10, weight_vec, dst11, weight_vec, dst12, weight_vec, dst13,
1207 weight_vec, dst10, dst11, dst12, dst13)
1208 MUL4(dst14, weight_vec, dst15, weight_vec, dst16, weight_vec, dst17,
1209 weight_vec, dst14, dst15, dst16, dst17);
1212 SRAR_W4_SW(dst10, dst11, dst12, dst13, rnd_vec);
1213 SRAR_W4_SW(dst14, dst15, dst16, dst17, rnd_vec);
1214 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1215 offset_vec, dst0, dst1, dst2, dst3);
1216 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1217 offset_vec, dst4, dst5, dst6, dst7);
1218 ADD4(dst10, offset_vec, dst11, offset_vec, dst12, offset_vec, dst13,
1219 offset_vec, dst10, dst11, dst12, dst13);
1220 ADD4(dst14, offset_vec, dst15, offset_vec, dst16, offset_vec, dst17,
1221 offset_vec, dst14, dst15, dst16, dst17);
1223 CLIP_SW8_0_255(dst10, dst11, dst12, dst13, dst14, dst15, dst16, dst17);
1224 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1225 vec0, vec1, vec2, vec3);
1226 PCKEV_H4_SH(dst11, dst10, dst13, dst12, dst15, dst14, dst17, dst16,
1227 vec4, vec5, vec6, vec7);
1231 ST_UB2(out0, out1, dst, 16);
1233 ST_UB2(out2, out3, dst, 16);
1249 v16u8 out0, out1, out2;
1251 v8i16 filt0, filt1, filt2, filt3;
1252 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1253 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1254 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1255 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1256 v4i32 dst8, dst9, dst10, dst11;
1258 v4i32 weight_vec, rnd_vec, offset_vec;
1263 weight_vec = __msa_fill_w(weight);
1264 rnd_vec = __msa_fill_w(rnd_val);
1265 offset_vec = __msa_fill_w(offset);
1280 for (loop_cnt = 64; loop_cnt--;) {
1286 vec0, vec1, vec2, vec3);
1288 vec4, vec5, vec6, vec7);
1290 vec8, vec9, vec10, vec11);
1292 vec12, vec13, vec14, vec15);
1303 vec0, vec1, vec2, vec3);
1304 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1305 vec4, vec5, vec6, vec7);
1309 filt3, dst10, dst11);
1311 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1312 weight_vec, dst0, dst1, dst2, dst3)
1313 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1314 weight_vec, dst4, dst5, dst6, dst7);
1315 MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
1316 weight_vec, dst8, dst9, dst10, dst11)
1319 SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
1320 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1321 offset_vec, dst0, dst1, dst2, dst3);
1322 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1323 offset_vec, dst4, dst5, dst6, dst7);
1324 ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
1325 offset_vec, dst8, dst9, dst10, dst11);
1328 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1329 vec0, vec1, vec2, vec3);
1330 PCKEV_H2_SH(dst9, dst8, dst11, dst10, vec4, vec5);
1332 out2 = __msa_pckev_b((v16i8) vec5, (v16i8) vec4);
1333 ST_UB2(out0, out1, dst, 16);
1334 ST_UB(out2, dst + 32);
1349 const uint8_t *src_tmp;
1351 uint32_t loop_cnt, cnt;
1354 v8i16 filt0, filt1, filt2, filt3;
1355 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1356 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1357 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1358 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1360 v4i32 weight_vec, rnd_vec, offset_vec;
1365 weight_vec = __msa_fill_w(weight);
1366 rnd_vec = __msa_fill_w(rnd_val);
1367 offset_vec = __msa_fill_w(offset);
1382 for (loop_cnt = height; loop_cnt--;) {
1386 for (cnt = 2; cnt--;) {
1392 vec0, vec1, vec2, vec3);
1394 vec4, vec5, vec6, vec7);
1396 vec8, vec9, vec10, vec11);
1398 vec12, vec13, vec14, vec15);
1407 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1408 weight_vec, dst0, dst1, dst2, dst3)
1409 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1410 weight_vec, dst4, dst5, dst6, dst7);
1413 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1414 offset_vec, dst0, dst1, dst2, dst3);
1415 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1416 offset_vec, dst4, dst5, dst6, dst7);
1418 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1419 vec0, vec1, vec2, vec3);
1421 ST_UB2(out0, out1, dst_tmp, 16);
1443 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1444 v16i8 src9, src10, src11, src12, src13, src14;
1445 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1446 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1447 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1448 v16i8 src2110, src4332, src6554, src8776, src10998;
1449 v16i8 src12111110, src14131312;
1451 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1452 v8i16 filt0, filt1, filt2, filt3;
1453 v8i16 vec0, vec1, vec2, vec3;
1454 v4i32 weight_vec, rnd_vec, offset_vec;
1457 src -= (3 * src_stride);
1460 weight_vec = __msa_fill_w(weight);
1461 rnd_vec = __msa_fill_w(rnd_val);
1462 offset_vec = __msa_fill_w(offset);
1469 src += (7 * src_stride);
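/* Vertical 8-tap filtering: src was rewound by 3 rows and 7 rows are
 * preloaded here, so each loop iteration only fetches the new rows; the
 * ILVR/ILVL interleaves below arrange consecutive rows so the column-wise
 * taps become in-register dot products. */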
1472 src10_r, src32_r, src54_r, src21_r);
1474 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1477 src32_r, src65_r, src54_r, src2110, src4332, src6554);
1480 for (loop_cnt = (height >> 3); loop_cnt--;) {
1482 src7, src8, src9, src10, src11, src12, src13, src14);
1483 src += (8 * src_stride);
1484 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1485 src76_r, src87_r, src98_r, src109_r);
1486 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1487 src1110_r, src1211_r, src1312_r, src1413_r);
1488 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1489 src1413_r, src1312_r,
1490 src8776, src10998, src12111110, src14131312);
1492 filt1, filt2, filt3, dst0, dst1);
1494 filt1, filt2, filt3, dst2, dst3);
1496 filt0, filt1, filt2, filt3, dst4, dst5);
1498 filt0, filt1, filt2, filt3, dst6, dst7);
1499 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1500 weight_vec, dst0, dst1, dst2, dst3)
1501 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1502 weight_vec, dst4, dst5, dst6, dst7);
1505 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1506 offset_vec, dst0, dst1, dst2, dst3);
1507 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1508 offset_vec, dst4, dst5, dst6, dst7);
1510 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1511 vec0, vec1, vec2, vec3);
1515 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1516 dst += (8 * dst_stride);
1519 src4332 = src12111110;
1520 src6554 = src14131312;
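/* Slide the window: the newest interleaved row pairs are recycled into the
 * "oldest" registers, so the next iteration reloads nothing it already
 * has. */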
1525 src7, src8, src9, src10, src11, src12, src13, src14);
1526 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1527 src76_r, src87_r, src98_r, src109_r);
1528 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1529 src1110_r, src1211_r, src1312_r, src1413_r);
1530 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1531 src1413_r, src1312_r,
1532 src8776, src10998, src12111110, src14131312);
1534 filt1, filt2, filt3, dst0, dst1);
1536 filt1, filt2, filt3, dst2, dst3);
1538 filt0, filt1, filt2, filt3, dst4, dst5);
1540 filt0, filt1, filt2, filt3, dst6, dst7);
1541 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1542 weight_vec, dst0, dst1, dst2, dst3)
1543 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1544 weight_vec, dst4, dst5, dst6, dst7);
1547 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1548 offset_vec, dst0, dst1, dst2, dst3);
1549 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1550 offset_vec, dst4, dst5, dst6, dst7);
1552 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1553 vec0, vec1, vec2, vec3);
1557 ST_W2(out0, 0, 1, dst, dst_stride);
1558 } else if (res == 4) {
1559 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1561 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
1562 ST_W2(out1, 0, 1, dst + 4 * dst_stride, dst_stride);
1580 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1581 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1582 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1583 v8i16 filt0, filt1, filt2, filt3;
1584 v8i16 filter_vec, vec0, vec1, vec2, vec3;
1585 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1586 v4i32 weight_vec, rnd_vec, offset_vec;
1589 src -= (3 * src_stride);
1591 weight_vec = __msa_fill_w(weight);
1592 rnd_vec = __msa_fill_w(rnd_val);
1593 offset_vec = __msa_fill_w(offset);
1600 src += (7 * src_stride);
1603 src10_r, src32_r, src54_r, src21_r);
1604 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1606 for (loop_cnt = (height >> 2); loop_cnt--;) {
1607 LD_SB4(src, src_stride, src7, src8, src9, src10);
1608 src += (4 * src_stride);
1609 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1610 src76_r, src87_r, src98_r, src109_r);
1612 filt1, filt2, filt3, dst0, dst1);
1614 filt1, filt2, filt3, dst2, dst3);
1616 filt0, filt1, filt2, filt3, dst4, dst5);
1618 filt0, filt1, filt2, filt3, dst6, dst7);
1619 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1620 weight_vec, dst0, dst1, dst2, dst3)
1621 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1622 weight_vec, dst4, dst5, dst6, dst7);
1625 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1626 offset_vec, dst0, dst1, dst2, dst3);
1627 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1628 offset_vec, dst4, dst5, dst6, dst7);
1630 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1631 vec0, vec1, vec2, vec3);
1633 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1634 dst += (4 * dst_stride);
1646 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1648 filt1, filt2, filt3, dst0, dst1);
1650 filt1, filt2, filt3, dst2, dst3);
1651 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1652 weight_vec, dst0, dst1, dst2, dst3)
1654 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1655 offset_vec, dst0, dst1, dst2, dst3);
1658 out0 = __msa_pckev_b((v16i8) vec1, (v16i8) vec0);
1659 ST_D2(out0, 0, 1, dst, dst_stride);
1674 v16u8 out0, out1, out2;
1675 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1676 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1677 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1678 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1679 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1680 v16i8 src2110, src4332, src6554, src8776, src10998;
1681 v8i16 filt0, filt1, filt2, filt3;
1682 v4i32 dst0, dst1, dst2, dst3, dst4, dst5;
1683 v4i32 dst6, dst7, dst8, dst9, dst10, dst11;
1684 v8i16 filter_vec, vec0, vec1, vec2, vec3, vec4, vec5;
1685 v4i32 weight_vec, rnd_vec, offset_vec;
1688 src -= (3 * src_stride);
1690 weight_vec = __msa_fill_w(weight);
1691 rnd_vec = __msa_fill_w(rnd_val);
1692 offset_vec = __msa_fill_w(offset);
1699 src += (7 * src_stride);
1702 src10_r, src32_r, src54_r, src21_r);
1703 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1705 src10_l, src32_l, src54_l, src21_l);
1706 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1707 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1708 src2110, src4332, src6554);
1710 for (loop_cnt = 4; loop_cnt--;) {
1711 LD_SB4(src, src_stride, src7, src8, src9, src10);
1712 src += (4 * src_stride);
1714 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1715 src76_r, src87_r, src98_r, src109_r);
1716 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1717 src76_l, src87_l, src98_l, src109_l);
1718 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1721 filt1, filt2, filt3, dst0, dst1);
1723 filt1, filt2, filt3, dst2, dst3);
1725 filt0, filt1, filt2, filt3, dst4, dst5);
1727 filt0, filt1, filt2, filt3, dst6, dst7);
1729 filt0, filt1, filt2, filt3, dst8, dst9);
1731 filt0, filt1, filt2, filt3, dst10, dst11);
1733 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1734 weight_vec, dst0, dst1, dst2, dst3)
1735 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1736 weight_vec, dst4, dst5, dst6, dst7);
1737 MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
1738 weight_vec, dst8, dst9, dst10, dst11);
1741 SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
1742 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1743 offset_vec, dst0, dst1, dst2, dst3);
1744 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1745 offset_vec, dst4, dst5, dst6, dst7);
1746 ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
1747 offset_vec, dst8, dst9, dst10, dst11);
1750 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1751 vec0, vec1, vec2, vec3);
1752 PCKEV_H2_SH(dst9, dst8, dst11, dst10, vec4, vec5);
1753 PCKEV_B3_UB(vec1, vec0, vec3, vec2, vec5, vec4, out0, out1, out2);
1754 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1755 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
1756 dst += (4 * dst_stride);
1782 const uint8_t *src_tmp;
1786 v16u8 out0, out1, out2, out3;
1787 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1788 v16i8 src10_r, src32_r, src54_r, src76_r;
1789 v16i8 src21_r, src43_r, src65_r, src87_r;
1790 v16i8 src10_l, src32_l, src54_l, src76_l;
1791 v16i8 src21_l, src43_l, src65_l, src87_l;
1792 v16i8 src98_r, src109_r, src98_l, src109_l;
1793 v8i16 filt0, filt1, filt2, filt3;
1794 v8i16 filter_vec, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1795 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1796 v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
1797 v4i32 weight_vec, rnd_vec, offset_vec;
1800 src -= (3 * src_stride);
1802 weight_vec = __msa_fill_w(weight);
1803 rnd_vec = __msa_fill_w(rnd_val);
1804 offset_vec = __msa_fill_w(offset);
1811 for (cnt = weightmul16; cnt--;) {
1816 src_tmp += (7 * src_stride);
1818 for (loop_cnt = (height >> 2); loop_cnt--;) {
1819 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1820 src_tmp += (4 * src_stride);
1823 src10_r, src32_r, src54_r, src21_r);
1824 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1826 src10_l, src32_l, src54_l, src21_l);
1827 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1828 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1829 src76_r, src87_r, src98_r, src109_r);
1830 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1831 src76_l, src87_l, src98_l, src109_l);
1834 filt1, filt2, filt3, dst0, dst1);
1836 filt1, filt2, filt3, dst2, dst3);
1838 filt1, filt2, filt3, dst4, dst5);
1840 filt1, filt2, filt3, dst6, dst7);
1842 filt1, filt2, filt3, dst8, dst9);
1844 filt1, filt2, filt3, dst10, dst11);
1846 filt1, filt2, filt3, dst12, dst13);
1848 filt1, filt2, filt3, dst14, dst15);
1850 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1851 weight_vec, dst0, dst1, dst2, dst3)
1852 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1853 weight_vec, dst4, dst5, dst6, dst7);
1854 MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
1855 weight_vec, dst8, dst9, dst10, dst11);
1856 MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec, dst15,
1857 weight_vec, dst12, dst13, dst14, dst15);
1860 SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
1861 SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
1862 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1863 offset_vec, dst0, dst1, dst2, dst3);
1864 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1865 offset_vec, dst4, dst5, dst6, dst7);
1866 ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
1867 offset_vec, dst8, dst9, dst10, dst11);
1868 ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec, dst15,
1869 offset_vec, dst12, dst13, dst14, dst15);
1871 CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
1872 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1873 vec0, vec1, vec2, vec3);
1874 PCKEV_H4_SH(dst9, dst8, dst11, dst10, dst13, dst12, dst15,
1875 dst14, vec4, vec5, vec6, vec7);
1876 PCKEV_B4_UB(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6,
1877 out0, out1, out2, out3);
1879 ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
1880 dst_tmp += (4 * dst_stride);
1891 LD_SB2(src_tmp, src_stride, src7, src8);
1894 src10_r, src32_r, src54_r, src21_r);
1895 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1897 src10_l, src32_l, src54_l, src21_l);
1898 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1899 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1900 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1903 filt1, filt2, filt3, dst0, dst1);
1905 filt1, filt2, filt3, dst2, dst3);
1907 filt1, filt2, filt3, dst4, dst5);
1909 filt1, filt2, filt3, dst6, dst7);
1910 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
1911 weight_vec, dst0, dst1, dst2, dst3)
1912 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec, dst7,
1913 weight_vec, dst4, dst5, dst6, dst7);
1916 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
1917 offset_vec, dst0, dst1, dst2, dst3);
1918 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec, dst7,
1919 offset_vec, dst4, dst5, dst6, dst7);
1921 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
1922 vec0, vec1, vec2, vec3);
1925 ST_UB2(out0, out1, dst_tmp, dst_stride);
2015 const int8_t *filter_x,
2016 const int8_t *filter_y,
2024 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2025 v8i16 filt0, filt1, filt2, filt3;
2026 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2027 v16i8 mask1, mask2, mask3;
2029 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2030 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2031 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
2032 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
2033 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2034 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
2035 v4i32 weight_vec, offset_vec, rnd_vec;
2039 src -= ((3 * src_stride) + 3);
2040 filter_vec = LD_SH(filter_x);
2044 filter_vec = LD_SH(filter_y);
2046 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2052 weight_vec = __msa_fill_w(weight);
2053 offset_vec = __msa_fill_w(offset);
2054 rnd_vec = __msa_fill_w(rnd_val);
2057 src += (7 * src_stride);
2060 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2061 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2063 vec8, vec9, vec10, vec11);
2064 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
2065 vec12, vec13, vec14, vec15);
2072 vec0 = __msa_ilvl_b((v16i8) zero, (v16i8) vec12);
2073 vec1 = __msa_ilvl_b((v16i8) zero, (v16i8) vec13);
2074 vec2 = __msa_ilvl_b((v16i8) zero, (v16i8) vec14);
2075 vec3 = __msa_ilvl_b((v16i8) zero, (v16i8) vec15);
2076 dst6 = __msa_dotp_s_w((v8i16) vec0, (v8i16) filt0);
2077 dst6 = __msa_dpadd_s_w((v4i32) dst6, (v8i16) vec1, (v8i16) filt1);
2078 dst6 = __msa_dpadd_s_w((v4i32) dst6, (v8i16) vec2, (v8i16) filt2);
2079 dst6 = __msa_dpadd_s_w((v4i32) dst6, (v8i16) vec3, (v8i16) filt3);
2081 ILVEV_H2_SH(dst0, dst1, dst3, dst4, dst10_r, dst43_r);
2082 ILVEV_H2_SH(dst1, dst2, dst4, dst5, dst21_r, dst54_r);
2083 ILVEV_H2_SH(dst2, dst3, dst5, dst6, dst32_r, dst65_r);
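/* Horizontal-vertical path: the horizontal pass above left per-row
 * intermediates in dst0..dst6; ILVEV pairs them column-wise so the
 * vertical 8-tap again runs as dot products, followed by the >> 6
 * intermediate shift (SRA_4V) and the usual weight/round/offset stage. */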
2085 for (loop_cnt = height >> 2; loop_cnt--;) {
2086 LD_SB4(src, src_stride, src7, src8, src9, src10);
2087 src += (4 * src_stride);
2089 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2090 vec0, vec1, vec2, vec3);
2091 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2092 vec4, vec5, vec6, vec7);
2096 filt3, dst8, dst10);
2098 dst76_r = __msa_ilvev_h((v8i16) dst7, (v8i16) dst6);
2099 ILVEV_H2_SH(dst7, dst8, dst9, dst10, dst87_r, dst109_r);
2100 dst98_r = __msa_ilvev_h((v8i16) dst9, (v8i16) dst8);
2102 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2103 filt_h1, filt_h2, filt_h3);
2104 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2105 filt_h1, filt_h2, filt_h3);
2106 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2107 filt_h1, filt_h2, filt_h3);
2108 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2109 filt_h1, filt_h2, filt_h3);
2111 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2112 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2113 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2114 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2115 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2116 ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2118 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2119 out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2120 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2121 dst += (4 * dst_stride);
2137 const int8_t *filter_x,
2138 const int8_t *filter_y,
2145 uint32_t loop_cnt, cnt;
2146 const uint8_t *src_tmp;
2148 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2149 v8i16 filt0, filt1, filt2, filt3;
2150 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2151 v16i8 mask1, mask2, mask3;
2153 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2154 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2155 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2156 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2157 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2158 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2159 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2160 v4i32 weight_vec, offset_vec, rnd_vec;
2161 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
2162 v4i32 dst11, dst12, dst13, dst14, dst15;
2166 src -= ((3 * src_stride) + 3);
2168 weight_vec = __msa_fill_w(weight);
2169 offset_vec = __msa_fill_w(offset);
2170 rnd_vec = __msa_fill_w(rnd_val);
2173 filter_vec = LD_SH(filter_x);
2177 filter_vec = LD_SH(filter_y);
2179 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2185 for (cnt = width >> 3; cnt--;) {
2190 src_tmp += (7 * src_stride);
2193 vec0, vec1, vec2, vec3);
2195 vec4, vec5, vec6, vec7);
2197 vec8, vec9, vec10, vec11);
2198 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2199 vec12, vec13, vec14, vec15);
2207 filt2, filt3, dst6, dst7);
2209 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2210 vec0, vec1, vec2, vec3);
2211 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2212 vec4, vec5, vec6, vec7);
2213 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2214 vec8, vec9, vec10, vec11);
2218 filt3, dst10, dst11);
2220 filt3, dst12, dst13);
2222 ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
2223 ILVEV_H2_SH(dst2, dst4, dst3, dst5, dst21_r, dst21_l);
2224 ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
2225 ILVEV_H2_SH(dst6, dst8, dst7, dst9, dst43_r, dst43_l);
2226 ILVEV_H2_SH(dst8, dst10, dst9, dst11, dst54_r, dst54_l);
2227 ILVEV_H2_SH(dst10, dst12, dst11, dst13, dst65_r, dst65_l);
2229 for (loop_cnt = height >> 1; loop_cnt--;) {
2230 LD_SB2(src_tmp, src_stride, src7, src8);
2231 src_tmp += 2 * src_stride;
2233 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2234 vec0, vec1, vec2, vec3);
2236 filt2, filt3, dst14, dst15);
2238 ILVEV_H2_SH(dst12, dst14, dst13, dst15, dst76_r, dst76_l);
2240 filt_h0, filt_h1, filt_h2, filt_h3);
2242 filt_h0, filt_h1, filt_h2, filt_h3);
2247 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2248 vec0, vec1, vec2, vec3);
2250 filt2, filt3, dst0, dst1);
2252 ILVEV_H2_SH(dst14, dst0, dst15, dst1, dst87_r, dst87_l);
2254 filt_h0, filt_h1, filt_h2, filt_h3);
2256 filt_h0, filt_h1, filt_h2, filt_h3);
2260 MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2261 MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
2262 SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
2263 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2264 ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
2267 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2268 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2269 ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
2270 dst_tmp += (2 * dst_stride);
2297 const int8_t *filter_x,
2298 const int8_t *filter_y,
2313 const int8_t *filter_x,
2314 const int8_t *filter_y,
2321 const uint8_t *src_tmp;
2324 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2325 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2326 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2327 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2328 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2329 v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
2330 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2331 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
2332 v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2333 v8i16 dst76_l, filter_vec;
2334 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
2335 v4i32 weight_vec, offset_vec, rnd_vec;
2338 src -= ((3 * src_stride) + 3);
2340 filter_vec = LD_SH(filter_x);
2344 filter_vec = LD_SH(filter_y);
2347 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2349 weight_vec = __msa_fill_w(weight);
2350 offset_vec = __msa_fill_w(offset);
2351 rnd_vec = __msa_fill_w(rnd_val);
2362 src_tmp += (7 * src_stride);
2369 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2378 filt2, filt3, dst6, dst7);
2379 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2380 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2381 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2386 filt3, dst10, dst11);
2388 filt3, dst12, dst13);
2390 for (loop_cnt = 16; loop_cnt--;) {
2391 src7 = LD_SB(src_tmp);
2392 src_tmp += src_stride;
2394 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2397 filt3, dst14, dst15);
2398 ILVEV_H2_SH(dst0, dst2, dst1, dst3, dst10_r, dst10_l);
2399 ILVEV_H2_SH(dst4, dst6, dst5, dst7, dst32_r, dst32_l);
2400 ILVEV_H2_SH(dst8, dst10, dst9, dst11, dst54_r, dst54_l);
2401 ILVEV_H2_SH(dst12, dst14, dst13, dst15, dst76_r, dst76_l);
2404 filt_h0, filt_h1, filt_h2, filt_h3);
2406 filt_h0, filt_h1, filt_h2, filt_h3);
2410 MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2412 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2414 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2415 out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2417 dst_tmp += dst_stride;
2444 src += (7 * src_stride);
2446 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2447 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2448 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2450 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2460 ILVEV_H2_SH(dst0, dst1, dst3, dst4, dst10_r, dst43_r);
2461 ILVEV_H2_SH(dst1, dst2, dst4, dst5, dst21_r, dst54_r);
2462 ILVEV_H2_SH(dst2, dst3, dst5, dst6, dst32_r, dst65_r);
2464 for (loop_cnt = 4; loop_cnt--;) {
2465 LD_SB4(src, src_stride, src7, src8, src9, src10);
2466 src += (4 * src_stride);
2468 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2470 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2475 filt3, dst8, dst10);
2477 ILVEV_H2_SH(dst6, dst7, dst7, dst8, dst76_r, dst87_r);
2478 ILVEV_H2_SH(dst9, dst10, dst8, dst9, dst109_r, dst98_r);
2480 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2481 filt_h1, filt_h2, filt_h3);
2482 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2483 filt_h1, filt_h2, filt_h3);
2484 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2485 filt_h1, filt_h2, filt_h3);
2486 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2487 filt_h1, filt_h2, filt_h3);
2489 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2490 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2491 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2492 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2493 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2494 ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2496 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2497 out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2498 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2499 dst += (4 * dst_stride);
2515 const int8_t *filter_x,
2516 const int8_t *filter_y,
2531 const int8_t *filter_x,
2532 const int8_t *filter_y,
2547 const int8_t *filter_x,
2548 const int8_t *filter_y,
2563 const int8_t *filter_x,
2564 const int8_t *filter_y,
2579 const int8_t *filter_x,
2580 const int8_t *filter_y,
2601 v8i16 filt0, filt1, filter_vec;
2603 v8i16 tmp0, tmp1, tmp2, tmp3;
2606 v4i32 weight_vec, rnd_vec, offset_vec;
2618 weight_vec = __msa_fill_w(weight);
2619 rnd_vec = __msa_fill_w(rnd_val);
2620 offset_vec = __msa_fill_w(offset);
2630 MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
2632 ADD2(dst0, offset_vec, dst1, offset_vec, dst0, dst1);
2634 vec0 = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
2635 out = (v16u8) __msa_pckev_b((v16i8) vec0, (v16i8) vec0);
2637 dst += (4 * dst_stride);
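/* From here on, the 4-tap (EPEL-style) horizontal variants follow: the
 * same recipe as above, but with only filt0/filt1 and two-mask shuffles. */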
2652 v16i8 mask1, vec0, vec1, vec2, vec3;
2653 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2654 v4i32 dst0, dst1, dst2, dst3;
2656 v4i32 weight_vec, rnd_vec, offset_vec;
2667 weight_vec = __msa_fill_w(weight);
2668 rnd_vec = __msa_fill_w(rnd_val);
2669 offset_vec = __msa_fill_w(offset);
2685 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2686 weight_vec, dst0, dst1, dst2, dst3);
2688 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2689 offset_vec, dst0, dst1, dst2, dst3);
2691 tmp0 = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
2692 tmp1 = __msa_pckev_h((v8i16) dst3, (v8i16) dst2);
2694 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2695 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2712 v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2713 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2714 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2716 v4i32 weight_vec, rnd_vec, offset_vec;
2726 weight_vec = __msa_fill_w(weight);
2727 rnd_vec = __msa_fill_w(rnd_val);
2728 offset_vec = __msa_fill_w(offset);
2732 for (loop_cnt = (height >> 3); loop_cnt--;) {
2734 src += (8 * src_stride);
2738 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
2739 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
2757 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2758 weight_vec, dst0, dst1, dst2, dst3);
2759 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
2760 dst7, weight_vec, dst4, dst5, dst6, dst7);
2763 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2764 offset_vec, dst0, dst1, dst2, dst3);
2765 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
2766 dst7, offset_vec, dst4, dst5, dst6, dst7);
2768 tmp0 = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
2769 tmp1 = __msa_pckev_h((v8i16) dst3, (v8i16) dst2);
2770 tmp2 = __msa_pckev_h((v8i16) dst5, (v8i16) dst4);
2771 tmp3 = __msa_pckev_h((v8i16) dst7, (v8i16) dst6);
2773 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2774 dst += (8 * dst_stride);
2791 } else if (4 == height) {
2811 v16u8 out0, out1, out2, out3;
2812 v8i16 filter_vec, filt0, filt1;
2816 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2817 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2818 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2819 v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
2820 v4i32 weight_vec, rnd_vec, offset_vec;
2829 weight_vec = __msa_fill_w(weight);
2830 rnd_vec = __msa_fill_w(rnd_val);
2831 offset_vec = __msa_fill_w(offset);
2839 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2856 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2857 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2858 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2859 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2877 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2878 weight_vec, dst0, dst1, dst2, dst3);
2879 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
2880 dst7, weight_vec, dst4, dst5, dst6, dst7);
2883 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2884 offset_vec, dst0, dst1, dst2, dst3);
2885 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
2886 dst7, offset_vec, dst4, dst5, dst6, dst7);
2889 MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
2890 weight_vec, dst8, dst9, dst10, dst11);
2891 MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
2892 dst15, weight_vec, dst12, dst13, dst14, dst15);
2893 SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
2894 SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
2895 ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
2896 offset_vec, dst8, dst9, dst10, dst11);
2897 ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
2898 dst15, offset_vec, dst12, dst13, dst14, dst15);
2899 CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);
2903 PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
2904 PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
2907 ST_W2(out0, 0, 2, dst, dst_stride);
2908 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2909 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2910 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2911 dst += (4 * dst_stride);
2912 ST_W2(out2, 0, 2, dst, dst_stride);
2913 ST_H2(out2, 2, 6, dst + 4, dst_stride);
2914 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
2915 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2928 v8i16 filter_vec, filt0, filt1;
2932 v16i8 vec0, vec1, vec2, vec3;
2933 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2934 v4i32 weight_vec, rnd_vec, offset_vec;
2935 v4i32 dst0, dst1, dst2, dst3;
2944 weight_vec = __msa_fill_w(weight);
2945 rnd_vec = __msa_fill_w(rnd_val);
2946 offset_vec = __msa_fill_w(offset);
2962 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
2963 weight_vec, dst0, dst1, dst2, dst3);
2965 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
2966 offset_vec, dst0, dst1, dst2, dst3);
2970 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2985 v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2986 v8i16 filter_vec, filt0, filt1;
2987 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2988 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
2989 v4i32 weight_vec, rnd_vec, offset_vec;
2998 weight_vec = __msa_fill_w(weight);
2999 rnd_vec = __msa_fill_w(rnd_val);
3000 offset_vec = __msa_fill_w(offset);
3009 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3027 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3028 weight_vec, dst0, dst1, dst2, dst3);
3029 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3030 dst7, weight_vec, dst4, dst5, dst6, dst7);
3033 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3034 offset_vec, dst0, dst1, dst2, dst3);
3035 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3036 dst7, offset_vec, dst4, dst5, dst6, dst7);
3043 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3055 v16u8 out0, out1, out2;
3056 v8i16 filter_vec, filt0, filt1;
3061 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10, vec11;
3062 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3063 v4i32 dst8, dst9, dst10, dst11;
3064 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3065 v4i32 weight_vec, rnd_vec, offset_vec;
3074 weight_vec = __msa_fill_w(weight);
3075 rnd_vec = __msa_fill_w(rnd_val);
3076 offset_vec = __msa_fill_w(offset);
3085 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3086 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3087 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3113 MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
3114 weight_vec, dst0, dst1, dst2, dst3);
3115 MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
3116 dst7, weight_vec, dst4, dst5, dst6, dst7);
3119 ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
3120 offset_vec, dst0, dst1, dst2, dst3);
3121 ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
3122 dst7, offset_vec, dst4, dst5, dst6, dst7);
3125 MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
3126 weight_vec, dst8, dst9, dst10, dst11);
3127 SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
3128 ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
3129 offset_vec, dst8, dst9, dst10, dst11);
3134 PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
3136 PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
3137 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3138 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
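    /* Six 8-pixel rows pack into three 16-byte vectors: out0/out1 carry
     * rows 0..3 (ST_D4) and out2 carries rows 4..5 (ST_D2). */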
    v8i16 filter_vec, filt0, filt1;
    v16u8 out0, out1, out2, out3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);

        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
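        /* ST_D8 writes each weighted row as one 8-byte double word, so a
         * whole 8x8 block is flushed per loop iteration. */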
    } else if (4 == height) {
    } else if (6 == height) {
    v16u8 out0, out1, out2;
    v8i16 filter_vec, filt0, filt1;
    v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12,
                    24, 25, 25, 26, 26, 27, 27, 28 };
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    for (loop_cnt = 4; loop_cnt--;) {
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
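        /* Each 12-wide row is stored split 8 + 4: the left 8 bytes as
         * double words (ST_D4), the remaining 4 as words at dst + 8
         * (ST_W4). */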
    v16u8 out0, out1, out2, out3;
    v8i16 filter_vec, filt0, filt1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
                    out0, out1, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    v16u8 out0, out1, out2;
    v8i16 filter_vec, filt0, filt1;
    v16i8 mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    for (loop_cnt = 16; loop_cnt--;) {
        src += (2 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
        ST_UB2(out0, out1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 16, dst_stride);
        dst += (2 * dst_stride);
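        /* 24-wide rows: out0/out1 hold the left 16 pixels of the two rows
         * (ST_UB2), while out2 carries both rows' right 8 pixels, stored
         * as double words at dst + 16. */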
    v16u8 out0, out1, out2, out3;
    v8i16 filter_vec, filt0, filt1;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    for (loop_cnt = (height >> 1); loop_cnt--;) {

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
        PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
                    out0, out1, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
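        /* Two 32-pixel rows per iteration; the stride argument of 16 in
         * ST_UB2 lays the two 16-byte vectors side by side within one
         * row. */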
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;
    v4i32 dst0_r, dst0_l;
    v8i16 filter_vec, filt0, filt1;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);

    MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
    ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
    dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
    out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
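    /* Vertical filtering interleaves consecutive rows (the *_r vectors) so
     * each dot product applies one coefficient pair to a row and its
     * successor; for 4-pixel columns two such row pairs are packed into
     * one register with ILVR_D2_SB (src2110 = pairs 1|0 and 2|1). */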
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v4i32 dst0, dst1, dst2, dst3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 filter_vec, filt0, filt1;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

               src32_r, src43_r, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
         weight_vec, dst0, dst1, dst2, dst3);
    ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
         offset_vec, dst0, dst1, dst2, dst3);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
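
/* 4-tap vertical tap sum, scalar sketch (an assumed reference for the
 * MSA data flow, not code from this file):
 *
 *     sum = f[0] * s[-stride] + f[1] * s[0]
 *         + f[2] * s[stride]  + f[3] * s[2 * stride];
 *
 * filt0/filt1 each hold one coefficient pair of f[], applied to the
 * interleaved row pairs by the dot-product/accumulate ops. */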
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 filter_vec, filt0, filt1;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    src += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src109_r, src98_r, src4332, src6554, src8776, src10998);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
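        /* ST_W8 scatters the two packed result vectors as eight 4-byte
         * rows, completing one 8-row batch per iteration. */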
    } else if (4 == height) {
    } else if (0 == (height % 8)) {
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
    v8i16 filter_vec, filt0, filt1;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    src += (3 * src_stride);
    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);

    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

    MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
         weight_vec, dst0, dst1, dst2, dst3);
    MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
         dst7, weight_vec, dst4, dst5, dst6, dst7);
    ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
         offset_vec, dst0, dst1, dst2, dst3);
    ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
         dst7, offset_vec, dst4, dst5, dst6, dst7);

    MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
         weight_vec, dst8, dst9, dst10, dst11);
    MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
         dst15, weight_vec, dst12, dst13, dst14, dst15);
    SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
    SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
    ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
         offset_vec, dst8, dst9, dst10, dst11);
    ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
         dst15, offset_vec, dst12, dst13, dst14, dst15);
    CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

    PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
    PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);

    ST_W2(out0, 0, 2, dst, dst_stride);
    ST_H2(out0, 2, 6, dst + 4, dst_stride);
    ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);
    ST_W2(out2, 0, 2, dst, dst_stride);
    ST_H2(out2, 2, 6, dst + 4, dst_stride);
    ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v4i32 dst0, dst1, dst2, dst3;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 filter_vec, filt0, filt1;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
         weight_vec, dst0, dst1, dst2, dst3);
    ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
         offset_vec, dst0, dst1, dst2, dst3);
    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src5, src6, src54_r, src65_r;
    v8i16 filter_vec, filt0, filt1;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
         weight_vec, dst0, dst1, dst2, dst3);
    MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
         dst7, weight_vec, dst4, dst5, dst6, dst7);
    ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
         offset_vec, dst0, dst1, dst2, dst3);
    ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
         dst7, offset_vec, dst4, dst5, dst6, dst7);

    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v8i16 filter_vec, filt0, filt1;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    src += (3 * src_stride);
    LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);

    MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
         weight_vec, dst0, dst1, dst2, dst3);
    MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
         dst7, weight_vec, dst4, dst5, dst6, dst7);
    ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
         offset_vec, dst0, dst1, dst2, dst3);
    ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
         dst7, offset_vec, dst4, dst5, dst6, dst7);

    MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
         weight_vec, dst8, dst9, dst10, dst11);
    SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
    ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
         offset_vec, dst8, dst9, dst10, dst11);

    PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
    PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
    v8i16 filter_vec, filt0, filt1;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);

        ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
    } else if (4 == height) {
    } else if (6 == height) {
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
    v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
    v8i16 filter_vec, filt0, filt1;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8,
          dst9, dst10, dst11;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;
    v4i32 weight_vec, rnd_vec, offset_vec;

    src -= (1 * src_stride);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    src += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
        src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out3, out4, out5);
        ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
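        /* 12-wide vertical case: the left 8 columns come from the *_r
         * interleaves, while the remaining 4 columns reuse the *_l
         * interleaves packed two row pairs per register (src2110,
         * src4332, ...) and land at dst + 8 via ST_W4. */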
    v16u8 out0, out1, out2, out3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v16i8 src54_r, src54_l, src65_r, src65_l, src6;
    v8i16 filter_vec, filt0, filt1;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
        PCKEV_B4_UB(tmp4, tmp0, tmp5, tmp1, tmp6, tmp2, tmp7, tmp3, out0, out1,
                    out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
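        /* 16-wide rows are split into byte halves: the *_r interleaves
         * feed the left 8 pixels, the *_l interleaves the right 8, giving
         * four full 16-byte rows per iteration. */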
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
    v8i16 filter_vec, filt0, filt1;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v4i32 dst16, dst17, dst18, dst19, dst20, dst21, dst22, dst23;
    v4i32 weight_vec, rnd_vec, offset_vec;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    LD_SB3(src + 16, src_stride, src7, src8, src9);
    src += (3 * src_stride);

    ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
        src += (4 * src_stride);

        ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
        ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        MUL4(dst16, weight_vec, dst17, weight_vec, dst18, weight_vec, dst19,
             weight_vec, dst16, dst17, dst18, dst19);
        MUL4(dst20, weight_vec, dst21, weight_vec, dst22, weight_vec,
             dst23, weight_vec, dst20, dst21, dst22, dst23);
        SRAR_W4_SW(dst16, dst17, dst18, dst19, rnd_vec);
        SRAR_W4_SW(dst20, dst21, dst22, dst23, rnd_vec);
        ADD4(dst16, offset_vec, dst17, offset_vec, dst18, offset_vec, dst19,
             offset_vec, dst16, dst17, dst18, dst19);
        ADD4(dst20, offset_vec, dst21, offset_vec, dst22, offset_vec,
             dst23, offset_vec, dst20, dst21, dst22, dst23);
        CLIP_SW8_0_255(dst16, dst17, dst18, dst19, dst20, dst21, dst22, dst23);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
        PCKEV_B4_UB(tmp4, tmp0, tmp5, tmp1, tmp6, tmp2, tmp7, tmp3, out0, out1,
                    out2, out3);

        PCKEV_H2_SH(dst17, dst16, dst19, dst18, tmp0, tmp1);
        PCKEV_H2_SH(dst21, dst20, dst23, dst22, tmp2, tmp3);

        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);

        src87_r = src1211_r;
        src98_r = src1312_r;
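        /* Sliding window: the newest row-pair interleaves of the right
         * 8-pixel strip are carried into the next iteration instead of
         * being recomputed from reloaded rows. */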
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 filter_vec, filt0, filt1;
    v4i32 weight_vec, rnd_vec, offset_vec;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);

    LD_SB3(src + 16, src_stride, src5, src6, src7);
    src += (3 * src_stride);

    ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
    ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src + 16, src_stride, src8, src9);
        src += (2 * src_stride);

        MUL4(dst0, weight_vec, dst1, weight_vec, dst2, weight_vec, dst3,
             weight_vec, dst0, dst1, dst2, dst3);
        MUL4(dst4, weight_vec, dst5, weight_vec, dst6, weight_vec,
             dst7, weight_vec, dst4, dst5, dst6, dst7);
        ADD4(dst0, offset_vec, dst1, offset_vec, dst2, offset_vec, dst3,
             offset_vec, dst0, dst1, dst2, dst3);
        ADD4(dst4, offset_vec, dst5, offset_vec, dst6, offset_vec,
             dst7, offset_vec, dst4, dst5, dst6, dst7);

        MUL4(dst8, weight_vec, dst9, weight_vec, dst10, weight_vec, dst11,
             weight_vec, dst8, dst9, dst10, dst11);
        MUL4(dst12, weight_vec, dst13, weight_vec, dst14, weight_vec,
             dst15, weight_vec, dst12, dst13, dst14, dst15);
        SRAR_W4_SW(dst8, dst9, dst10, dst11, rnd_vec);
        SRAR_W4_SW(dst12, dst13, dst14, dst15, rnd_vec);
        ADD4(dst8, offset_vec, dst9, offset_vec, dst10, offset_vec, dst11,
             offset_vec, dst8, dst9, dst10, dst11);
        ADD4(dst12, offset_vec, dst13, offset_vec, dst14, offset_vec,
             dst15, offset_vec, dst12, dst13, dst14, dst15);
        CLIP_SW8_0_255(dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15);

        PCKEV_H2_SH(dst9, dst8, dst11, dst10, tmp4, tmp5);
        PCKEV_H2_SH(dst13, dst12, dst15, dst14, tmp6, tmp7);
        PCKEV_B4_UB(tmp2, tmp0, tmp3, tmp1, tmp6, tmp4, tmp7, tmp5, out0, out1,
                    out2, out3);
        ST_UB2(out0, out2, dst, 16);
        dst += dst_stride;
        ST_UB2(out1, out3, dst, 16);
                               const int8_t *filter_x,
                               const int8_t *filter_y,

    v8i16 filt_h0, filt_h1, filter_vec, tmp;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst10, dst21, dst32, dst43;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v4i32 dst0, dst1, dst2, dst3, dst4;
    v4i32 weight_vec, rnd_vec, offset_vec;

    src -= (src_stride + 1);
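    /* Horizontal-vertical path: filter_x is applied along rows first, then
     * filter_y (splatted into filt_h0/filt_h1) along columns of the 16-bit
     * intermediates.  Backing src up by one row and one column covers the
     * 4-tap window around each output sample. */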
    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    weight_vec = __msa_fill_w(weight);
    rnd_vec = __msa_fill_w(rnd_val);
    offset_vec = __msa_fill_w(offset);