27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
32 #define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \ 35 v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \ 37 ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m); \ 38 ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m); \ 39 DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w, \ 40 wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m); \ 41 SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \ 42 PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \ 43 ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \ 44 CLIP_SH2_0_255(out0_h, out1_h); \ 47 #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \ 48 offset_h, rnd_w, out0_h, out1_h, \ 51 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \ 53 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w, \ 66 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
70 v8i16 dst0, dst1, dst2, dst3, offset_vec;
71 v4i32 weight_vec, rnd_vec;
73 weight = weight & 0x0000FFFF;
74 weight_vec = __msa_fill_w(weight);
75 offset_vec = __msa_fill_h(offset);
76 rnd_vec = __msa_fill_w(rnd_val);
81 LW2(src, src_stride, tp0, tp1);
83 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
87 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
89 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
92 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
93 ST_W2(out0, 0, 1, dst, dst_stride);
94 }
else if (4 == height) {
95 LW4(src, src_stride, tp0, tp1, tp2, tp3);
100 rnd_vec, dst0, dst1);
101 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
102 ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
103 }
else if (0 == (height % 8)) {
104 for (loop_cnt = (height >> 3); loop_cnt--;) {
105 LW4(src, src_stride, tp0, tp1, tp2, tp3);
106 src += 4 * src_stride;
108 LW4(src, src_stride, tp0, tp1, tp2, tp3);
109 src += 4 * src_stride;
113 SLLI_4V(dst0, dst1, dst2, dst3, 6);
115 offset_vec, rnd_vec, dst0, dst1,
118 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
119 dst += 8 * dst_stride;
134 uint64_t tp0, tp1, tp2, tp3;
136 v16u8 out0, out1, out2, out3;
138 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
139 v4i32 weight_vec, rnd_vec;
141 weight = weight & 0x0000FFFF;
142 weight_vec = __msa_fill_w(weight);
143 offset_vec = __msa_fill_h(offset);
144 rnd_vec = __msa_fill_w(rnd_val);
146 for (loop_cnt = (height >> 3); loop_cnt--;) {
147 LD4(src, src_stride, tp0, tp1, tp2, tp3);
148 src += (4 * src_stride);
151 LD4(src, src_stride, tp0, tp1, tp2, tp3);
152 src += (4 * src_stride);
161 SLLI_4V(dst0, dst1, dst2, dst3, 6);
162 SLLI_4V(dst4, dst5, dst6, dst7, 6);
165 offset_vec, rnd_vec, dst0, dst1, dst2,
168 offset_vec, rnd_vec, dst4, dst5, dst6,
173 ST_W2(out0, 0, 2, dst, dst_stride);
174 ST_H2(out0, 2, 6, dst + 4, dst_stride);
175 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
176 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
177 dst += (4 * dst_stride);
178 ST_W2(out2, 0, 2, dst, dst_stride);
179 ST_H2(out2, 2, 6, dst + 4, dst_stride);
180 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
181 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
182 dst += (4 * dst_stride);
196 uint64_t tp0, tp1, tp2, tp3;
197 v16i8
src0 = { 0 },
src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
199 v16u8 out0, out1, out2, out3;
200 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
201 v4i32 weight_vec, rnd_vec;
203 weight = weight & 0x0000FFFF;
204 weight_vec = __msa_fill_w(weight);
205 offset_vec = __msa_fill_h(offset);
206 rnd_vec = __msa_fill_w(rnd_val);
209 LD2(src, src_stride, tp0, tp1);
214 rnd_vec, dst0, dst1);
215 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
216 ST_D2(out0, 0, 1, dst, dst_stride);
217 }
else if (4 == height) {
218 LD4(src, src_stride, tp0, tp1, tp2, tp3);
223 SLLI_4V(dst0, dst1, dst2, dst3, 6);
225 offset_vec, rnd_vec, dst0, dst1, dst2,
228 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
229 }
else if (6 == height) {
230 LD4(src, src_stride, tp0, tp1, tp2, tp3);
231 src += 4 * src_stride;
234 LD2(src, src_stride, tp0, tp1);
239 SLLI_4V(dst0, dst1, dst2, dst3, 6);
242 offset_vec, rnd_vec, dst0, dst1, dst2,
245 rnd_vec, dst4, dst5);
246 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
247 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
248 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
249 }
else if (0 == height % 8) {
250 for (loop_cnt = (height >> 3); loop_cnt--;) {
251 LD4(src, src_stride, tp0, tp1, tp2, tp3);
252 src += 4 * src_stride;
255 LD4(src, src_stride, tp0, tp1, tp2, tp3);
256 src += 4 * src_stride;
264 SLLI_4V(dst0, dst1, dst2, dst3, 6);
265 SLLI_4V(dst4, dst5, dst6, dst7, 6);
267 offset_vec, rnd_vec, dst0, dst1,
270 offset_vec, rnd_vec, dst4, dst5,
274 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
276 dst += (8 * dst_stride);
291 v16u8 out0, out1, out2;
293 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
296 v4i32 weight_vec, rnd_vec;
298 weight = weight & 0x0000FFFF;
299 weight_vec = __msa_fill_w(weight);
300 offset_vec = __msa_fill_h(offset);
301 rnd_vec = __msa_fill_w(rnd_val);
303 for (loop_cnt = 4; loop_cnt--;) {
304 LD_SB4(src, src_stride, src0, src1, src2, src3);
305 src += (4 * src_stride);
306 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
307 dst0, dst1, dst2, dst3);
309 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
310 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
311 SLLI_4V(dst0, dst1, dst2, dst3, 6);
314 offset_vec, rnd_vec, dst0, dst1, dst2,
317 rnd_vec, dst4, dst5);
319 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
320 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
321 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
322 dst += (4 * dst_stride);
336 v16u8 out0, out1, out2, out3;
339 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
340 v4i32 weight_vec, rnd_vec;
342 weight = weight & 0x0000FFFF;
343 weight_vec = __msa_fill_w(weight);
344 offset_vec = __msa_fill_h(offset);
345 rnd_vec = __msa_fill_w(rnd_val);
347 for (loop_cnt = height >> 2; loop_cnt--;) {
348 LD_SB4(src, src_stride, src0, src1, src2, src3);
349 src += (4 * src_stride);
354 SLLI_4V(dst0, dst1, dst2, dst3, 6);
355 SLLI_4V(dst4, dst5, dst6, dst7, 6);
357 offset_vec, rnd_vec, dst0, dst1, dst2,
360 offset_vec, rnd_vec, dst4, dst5, dst6,
364 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
365 dst += (4 * dst_stride);
379 v16u8 out0, out1, out2, out3, out4, out5;
380 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
382 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
383 v8i16 dst8, dst9, dst10, dst11;
384 v4i32 weight_vec, rnd_vec;
386 weight = weight & 0x0000FFFF;
387 weight_vec = __msa_fill_w(weight);
388 offset_vec = __msa_fill_h(offset);
389 rnd_vec = __msa_fill_w(rnd_val);
391 for (loop_cnt = (height >> 2); loop_cnt--;) {
392 LD_SB4(src, src_stride, src0, src1, src4, src5);
393 LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
394 src += (4 * src_stride);
398 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
401 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
402 SLLI_4V(dst0, dst1, dst2, dst3, 6);
403 SLLI_4V(dst4, dst5, dst6, dst7, 6);
404 SLLI_4V(dst8, dst9, dst10, dst11, 6);
406 offset_vec, rnd_vec, dst0, dst1, dst2,
409 offset_vec, rnd_vec, dst4, dst5, dst6,
412 offset_vec, rnd_vec, dst8, dst9, dst10,
414 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
415 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
416 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
417 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
418 dst += (4 * dst_stride);
432 v16u8 out0, out1, out2, out3;
435 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
436 v4i32 weight_vec, rnd_vec;
438 weight = weight & 0x0000FFFF;
439 weight_vec = __msa_fill_w(weight);
440 offset_vec = __msa_fill_h(offset);
441 rnd_vec = __msa_fill_w(rnd_val);
443 for (loop_cnt = (height >> 1); loop_cnt--;) {
444 LD_SB2(src, src_stride, src0, src1);
445 LD_SB2(src + 16, src_stride, src2, src3);
446 src += (2 * src_stride);
452 SLLI_4V(dst0, dst1, dst2, dst3, 6);
453 SLLI_4V(dst4, dst5, dst6, dst7, 6);
455 offset_vec, rnd_vec, dst0, dst1, dst2,
458 offset_vec, rnd_vec, dst4, dst5, dst6,
462 ST_UB2(out0, out1, dst, dst_stride);
463 ST_UB2(out2, out3, dst + 16, dst_stride);
464 dst += (2 * dst_stride);
478 v16u8 out0, out1, out2, out3, out4, out5;
479 v16i8
src0,
src1, src2, src3, src4, src5;
481 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
482 v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
483 v4i32 weight_vec, rnd_vec;
485 weight = weight & 0x0000FFFF;
486 weight_vec = __msa_fill_w(weight);
487 offset_vec = __msa_fill_h(offset);
488 rnd_vec = __msa_fill_w(rnd_val);
490 for (loop_cnt = (height >> 1); loop_cnt--;) {
491 LD_SB3(src, 16, src0, src1, src2);
493 LD_SB3(src, 16, src3, src4, src5);
502 SLLI_4V(dst0, dst1, dst2, dst3, 6);
503 SLLI_4V(dst4, dst5, dst6, dst7, 6);
504 SLLI_4V(dst8, dst9, dst10, dst11, 6);
506 offset_vec, rnd_vec, dst0, dst1, dst2,
509 offset_vec, rnd_vec, dst4, dst5, dst6,
512 offset_vec, rnd_vec, dst8, dst9, dst10,
514 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
515 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
516 ST_UB2(out0, out1, dst, 16);
517 ST_UB(out2, dst + 32);
519 ST_UB2(out3, out4, dst, 16);
520 ST_UB(out5, dst + 32);
535 v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
536 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
538 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
539 v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
540 v4i32 weight_vec, rnd_vec;
542 weight = weight & 0x0000FFFF;
543 weight_vec = __msa_fill_w(weight);
544 offset_vec = __msa_fill_h(offset);
545 rnd_vec = __msa_fill_w(rnd_val);
547 for (loop_cnt = (height >> 1); loop_cnt--;) {
548 LD_SB4(src, 16, src0, src1, src2, src3);
550 LD_SB4(src, 16, src4, src5, src6, src7);
561 SLLI_4V(dst0, dst1, dst2, dst3, 6);
562 SLLI_4V(dst4, dst5, dst6, dst7, 6);
563 SLLI_4V(dst8, dst9, dst10, dst11, 6);
564 SLLI_4V(dst12, dst13, dst14, dst15, 6);
566 offset_vec, rnd_vec, dst0, dst1, dst2,
569 offset_vec, rnd_vec, dst4, dst5, dst6,
572 offset_vec, rnd_vec, dst8, dst9, dst10,
575 offset_vec, rnd_vec, dst12, dst13, dst14,
580 PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
581 ST_UB4(out0, out1, out2, out3, dst, 16);
583 ST_UB4(out4, out5, out6, out7, dst, 16);
600 v8i16 filt0, filt1, filt2, filt3;
601 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
602 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
603 v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
604 v8i16 filter_vec, dst01, dst23, dst45, dst67;
605 v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
606 v4i32 weight_vec, rnd_vec;
609 weight = weight & 0x0000FFFF;
611 weight_vec = __msa_fill_w(weight);
612 rnd_vec = __msa_fill_w(rnd_val);
617 weight_vec_h = __msa_fill_h(weight);
618 offset_vec = __msa_fill_h(offset);
619 denom_vec = __msa_fill_h(rnd_val);
621 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
622 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
624 filter_vec =
LD_SH(filter);
625 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
632 for (loop_cnt = (height >> 3); loop_cnt--;) {
633 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
634 src += (8 * src_stride);
637 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
638 vec0, vec1, vec2, vec3);
639 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
640 vec4, vec5, vec6, vec7);
641 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
642 vec8, vec9, vec10, vec11);
643 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
644 vec12, vec13, vec14, vec15);
655 offset_vec, rnd_vec, dst0, dst1, dst2,
659 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
660 dst += (8 * dst_stride);
677 v8i16 filt0, filt1, filt2, filt3;
678 v16i8 mask0, mask1, mask2, mask3;
680 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
681 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
682 v8i16 dst0, dst1, dst2, dst3;
683 v8i16 weight_vec_h, offset_vec, denom_vec;
684 v4i32 weight_vec, rnd_vec;
687 weight = weight & 0x0000FFFF;
689 weight_vec = __msa_fill_w(weight);
690 rnd_vec = __msa_fill_w(rnd_val);
695 weight_vec_h = __msa_fill_h(weight);
696 offset_vec = __msa_fill_h(offset);
697 denom_vec = __msa_fill_h(rnd_val);
699 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
700 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
702 filter_vec =
LD_SH(filter);
703 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
710 for (loop_cnt = (height >> 2); loop_cnt--;) {
711 LD_SB4(src, src_stride, src0, src1, src2, src3);
712 src += (4 * src_stride);
715 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
716 vec0, vec1, vec2, vec3);
717 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
718 vec4, vec5, vec6, vec7);
719 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
720 vec8, vec9, vec10, vec11);
721 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
722 vec12, vec13, vec14, vec15);
733 offset_vec, rnd_vec, dst0, dst1, dst2,
737 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
738 dst += (4 * dst_stride);
753 v16u8 out0, out1, out2;
754 v8i16 filt0, filt1, filt2, filt3;
755 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
756 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
757 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
758 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
760 v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
761 v8i16 weight_vec_h, offset_vec, denom_vec;
762 v4i32 weight_vec, rnd_vec;
765 weight = weight & 0x0000FFFF;
767 weight_vec = __msa_fill_w(weight);
768 rnd_vec = __msa_fill_w(rnd_val);
773 weight_vec_h = __msa_fill_h(weight);
774 offset_vec = __msa_fill_h(offset);
775 denom_vec = __msa_fill_h(rnd_val);
777 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
778 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
780 filter_vec =
LD_SH(filter);
781 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
792 for (loop_cnt = (height >> 2); loop_cnt--;) {
793 LD_SB4(src, src_stride, src0, src1, src2, src3);
794 LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
795 src += (4 * src_stride);
798 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
799 vec0, vec1, vec2, vec3);
800 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
801 vec4, vec5, vec6, vec7);
802 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
803 vec8, vec9, vec10, vec11);
804 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
805 vec12, vec13, vec14, vec15);
814 VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
815 vec0, vec1, vec2, vec3);
816 VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
817 vec4, vec5, vec6, vec7);
824 offset_vec, rnd_vec, dst0, dst1, dst2,
827 rnd_vec, dst4, dst5);
829 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
830 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
831 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
832 dst += (4 * dst_stride);
849 v8i16 filt0, filt1, filt2, filt3;
850 v16i8 mask0, mask1, mask2, mask3;
852 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
853 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
854 v8i16 dst0, dst1, dst2, dst3;
855 v8i16 weight_vec_h, offset_vec, denom_vec;
856 v4i32 weight_vec, rnd_vec;
860 weight_vec = __msa_fill_w(weight);
861 rnd_vec = __msa_fill_w(rnd_val);
866 weight_vec_h = __msa_fill_h(weight);
867 offset_vec = __msa_fill_h(offset);
868 denom_vec = __msa_fill_h(rnd_val);
870 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
871 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
873 filter_vec =
LD_SH(filter);
874 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
881 for (loop_cnt = (height >> 1); loop_cnt--;) {
882 LD_SB2(src, src_stride, src0, src2);
883 LD_SB2(src + 8, src_stride, src1, src3);
884 src += (2 * src_stride);
887 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
888 vec0, vec1, vec2, vec3);
889 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
890 vec4, vec5, vec6, vec7);
891 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
892 vec8, vec9, vec10, vec11);
893 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
894 vec12, vec13, vec14, vec15);
905 offset_vec, rnd_vec, dst0, dst1, dst2,
909 ST_UB2(out0, out1, dst, dst_stride);
910 dst += (2 * dst_stride);
925 v16u8 out0, out1, out2;
927 v8i16 filt0, filt1, filt2, filt3;
928 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
929 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
930 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
931 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
932 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
933 v4i32 weight_vec, rnd_vec;
937 weight_vec = __msa_fill_w(weight);
938 rnd_vec = __msa_fill_w(rnd_val);
943 weight_vec_h = __msa_fill_h(weight);
944 offset_vec = __msa_fill_h(offset);
945 denom_vec = __msa_fill_h(rnd_val);
947 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
948 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
950 filter_vec =
LD_SH(filter);
951 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
962 for (loop_cnt = 16; loop_cnt--;) {
963 LD_SB2(src, 16, src0, src1);
965 LD_SB2(src, 16, src2, src3);
968 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
969 vec0, vec1, vec2, vec3);
970 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
971 vec4, vec5, vec6, vec7);
972 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
973 vec8, vec9, vec10, vec11);
974 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
975 vec12, vec13, vec14, vec15);
985 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
986 vec0, vec1, vec2, vec3);
987 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
988 vec4, vec5, vec6, vec7);
995 offset_vec, rnd_vec, dst0, dst1, dst2,
998 rnd_vec, dst4, dst5);
1000 PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
1001 ST_UB2(out0, out1, dst, dst_stride);
1002 ST_D2(out2, 0, 1, dst + 16, dst_stride);
1003 dst += (2 * dst_stride);
1018 v16u8 out0, out1, out2, out3;
1019 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1020 v8i16 filt0, filt1, filt2, filt3;
1021 v16i8 mask0, mask1, mask2, mask3;
1022 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1023 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1025 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1026 v8i16 weight_vec_h, offset_vec, denom_vec;
1027 v4i32 weight_vec, rnd_vec;
1031 weight_vec = __msa_fill_w(weight);
1032 rnd_vec = __msa_fill_w(rnd_val);
1037 weight_vec_h = __msa_fill_h(weight);
1038 offset_vec = __msa_fill_h(offset);
1039 denom_vec = __msa_fill_h(rnd_val);
1041 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1042 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1044 filter_vec =
LD_SH(filter);
1045 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1052 for (loop_cnt = height >> 1; loop_cnt--;) {
1053 LD_SB4(src, 8, src0, src1, src2, src3);
1055 LD_SB4(src, 8, src4, src5, src6, src7);
1059 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1060 vec0, vec1, vec2, vec3);
1061 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1062 vec4, vec5, vec6, vec7);
1063 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1064 vec8, vec9, vec10, vec11);
1065 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1066 vec12, vec13, vec14, vec15);
1076 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1077 vec0, vec1, vec2, vec3);
1078 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1079 vec4, vec5, vec6, vec7);
1080 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1081 vec8, vec9, vec10, vec11);
1082 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1083 vec12, vec13, vec14, vec15);
1094 offset_vec, rnd_vec, dst0, dst1, dst2,
1097 offset_vec, rnd_vec, dst4, dst5, dst6,
1102 ST_UB2(out0, out1, dst, 16);
1104 ST_UB2(out2, out3, dst, 16);
1120 v16u8 out0, out1, out2;
1122 v8i16 filt0, filt1, filt2, filt3;
1123 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1124 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1125 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1126 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1127 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1128 v4i32 weight_vec, rnd_vec;
1132 weight = weight & 0x0000FFFF;
1133 weight_vec = __msa_fill_w(weight);
1134 rnd_vec = __msa_fill_w(rnd_val);
1139 weight_vec_h = __msa_fill_h(weight);
1140 offset_vec = __msa_fill_h(offset);
1141 denom_vec = __msa_fill_h(rnd_val);
1143 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1144 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1146 filter_vec =
LD_SH(filter);
1147 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1158 for (loop_cnt = 64; loop_cnt--;) {
1159 LD_SB3(src, 16, src0, src1, src2);
1160 src3 =
LD_SB(src + 40);
1164 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1165 vec0, vec1, vec2, vec3);
1166 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1167 vec4, vec5, vec6, vec7);
1168 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1169 vec8, vec9, vec10, vec11);
1170 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1171 vec12, vec13, vec14, vec15);
1181 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1182 vec0, vec1, vec2, vec3);
1183 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1184 vec4, vec5, vec6, vec7);
1191 offset_vec, rnd_vec, dst0, dst1, dst2,
1194 rnd_vec, dst4, dst5);
1196 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1197 ST_UB2(out0, out1, dst, 16);
1198 ST_UB(out2, dst + 32);
1215 uint32_t loop_cnt, cnt;
1218 v8i16 filt0, filt1, filt2, filt3;
1219 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1220 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1221 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1222 v8i16 dst0, dst1, dst2, dst3;
1223 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1224 v4i32 weight_vec, rnd_vec;
1228 weight_vec = __msa_fill_w(weight);
1229 rnd_vec = __msa_fill_w(rnd_val);
1234 weight_vec_h = __msa_fill_h(weight);
1235 offset_vec = __msa_fill_h(offset);
1236 denom_vec = __msa_fill_h(rnd_val);
1238 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1239 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1241 filter_vec =
LD_SH(filter);
1242 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1253 for (loop_cnt = height; loop_cnt--;) {
1257 for (cnt = 2; cnt--;) {
1258 LD_SB2(src_tmp, 16, src0, src1);
1259 src2 =
LD_SB(src_tmp + 24);
1263 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1264 vec0, vec1, vec2, vec3);
1265 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1266 vec4, vec5, vec6, vec7);
1267 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1268 vec8, vec9, vec10, vec11);
1269 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1270 vec12, vec13, vec14, vec15);
1281 offset_vec, rnd_vec, dst0, dst1,
1285 ST_UB2(out0, out1, dst_tmp, 16);
1306 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1307 v16i8 src9, src10, src11, src12, src13, src14;
1308 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1309 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1310 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1311 v16i8 src2110, src4332, src6554, src8776, src10998;
1312 v16i8 src12111110, src14131312;
1313 v8i16 filter_vec, dst01, dst23, dst45, dst67;
1314 v8i16 filt0, filt1, filt2, filt3;
1315 v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1316 v4i32 weight_vec, rnd_vec;
1318 src -= (3 * src_stride);
1321 weight_vec = __msa_fill_w(weight);
1322 rnd_vec = __msa_fill_w(rnd_val);
1327 weight_vec_h = __msa_fill_h(weight);
1328 offset_vec = __msa_fill_h(offset);
1329 denom_vec = __msa_fill_h(rnd_val);
1331 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1332 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1334 filter_vec =
LD_SH(filter);
1335 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1337 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1338 src += (7 * src_stride);
1340 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1341 src10_r, src32_r, src54_r, src21_r);
1343 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1346 src32_r, src65_r, src54_r, src2110, src4332, src6554);
1350 for (loop_cnt = (height >> 3); loop_cnt--;) {
1352 src7, src8, src9, src10, src11, src12, src13, src14);
1353 src += (8 * src_stride);
1354 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1355 src76_r, src87_r, src98_r, src109_r);
1356 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1357 src1110_r, src1211_r, src1312_r, src1413_r);
1358 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1359 src1413_r, src1312_r,
1360 src8776, src10998, src12111110, src14131312);
1363 filt1, filt2, filt3);
1365 filt1, filt2, filt3);
1367 filt0, filt1, filt2, filt3);
1369 filt0, filt1, filt2, filt3);
1372 offset_vec, rnd_vec, dst0, dst1, dst2,
1376 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1377 dst += (8 * dst_stride);
1380 src4332 = src12111110;
1381 src6554 = src14131312;
1398 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1399 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1400 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1401 v8i16 filt0, filt1, filt2, filt3;
1403 v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1404 v4i32 weight_vec, rnd_vec;
1406 src -= (3 * src_stride);
1408 weight_vec = __msa_fill_w(weight);
1409 rnd_vec = __msa_fill_w(rnd_val);
1414 weight_vec_h = __msa_fill_h(weight);
1415 offset_vec = __msa_fill_h(offset);
1416 denom_vec = __msa_fill_h(rnd_val);
1418 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1419 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1421 filter_vec =
LD_SH(filter);
1422 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1424 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1425 src += (7 * src_stride);
1428 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1429 src10_r, src32_r, src54_r, src21_r);
1430 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1432 for (loop_cnt = (height >> 2); loop_cnt--;) {
1433 LD_SB4(src, src_stride, src7, src8, src9, src10);
1434 src += (4 * src_stride);
1436 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1437 src76_r, src87_r, src98_r, src109_r);
1439 filt1, filt2, filt3);
1441 filt1, filt2, filt3);
1443 filt1, filt2, filt3);
1445 filt1, filt2, filt3);
1448 offset_vec, rnd_vec, dst0, dst1, dst2,
1452 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1453 dst += (4 * dst_stride);
1476 v16u8 out0, out1, out2;
1477 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1478 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1479 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1480 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1481 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1482 v16i8 src2110, src4332, src6554, src8776, src10998;
1483 v8i16 filt0, filt1, filt2, filt3;
1484 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1485 v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
1486 v4i32 weight_vec, rnd_vec;
1488 src -= (3 * src_stride);
1490 weight = weight & 0x0000FFFF;
1491 weight_vec = __msa_fill_w(weight);
1492 rnd_vec = __msa_fill_w(rnd_val);
1497 weight_vec_h = __msa_fill_h(weight);
1498 offset_vec = __msa_fill_h(offset);
1499 denom_vec = __msa_fill_h(rnd_val);
1501 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1502 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1504 filter_vec =
LD_SH(filter);
1505 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1507 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1508 src += (7 * src_stride);
1511 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1512 src10_r, src32_r, src54_r, src21_r);
1513 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1514 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1515 src10_l, src32_l, src54_l, src21_l);
1516 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1517 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1518 src2110, src4332, src6554);
1520 for (loop_cnt = 4; loop_cnt--;) {
1521 LD_SB4(src, src_stride, src7, src8, src9, src10);
1522 src += (4 * src_stride);
1525 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1526 src76_r, src87_r, src98_r, src109_r);
1527 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1528 src76_l, src87_l, src98_l, src109_l);
1529 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1532 filt1, filt2, filt3);
1534 filt1, filt2, filt3);
1536 filt1, filt2, filt3);
1538 filt1, filt2, filt3);
1540 filt1, filt2, filt3);
1542 filt1, filt2, filt3);
1545 offset_vec, rnd_vec, dst0, dst1, dst2,
1548 rnd_vec, dst4, dst5);
1550 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1551 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1552 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
1553 dst += (4 * dst_stride);
1582 v16u8 out0, out1, out2, out3;
1583 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1584 v16i8 src10_r, src32_r, src54_r, src76_r;
1585 v16i8 src21_r, src43_r, src65_r, src87_r;
1586 v16i8 src10_l, src32_l, src54_l, src76_l;
1587 v16i8 src21_l, src43_l, src65_l, src87_l;
1588 v16i8 src98_r, src109_r, src98_l, src109_l;
1589 v8i16 filt0, filt1, filt2, filt3;
1591 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1592 v8i16 weight_vec_h, offset_vec, denom_vec;
1593 v4i32 weight_vec, rnd_vec;
1595 src -= (3 * src_stride);
1597 weight_vec = __msa_fill_w(weight);
1598 rnd_vec = __msa_fill_w(rnd_val);
1603 weight_vec_h = __msa_fill_h(weight);
1604 offset_vec = __msa_fill_h(offset);
1605 denom_vec = __msa_fill_h(rnd_val);
1607 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1608 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1610 filter_vec =
LD_SH(filter);
1611 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1613 for (cnt = weightmul16; cnt--;) {
1617 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1618 src_tmp += (7 * src_stride);
1621 for (loop_cnt = (height >> 2); loop_cnt--;) {
1622 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1623 src_tmp += (4 * src_stride);
1626 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1627 src10_r, src32_r, src54_r, src21_r);
1628 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1629 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1630 src10_l, src32_l, src54_l, src21_l);
1631 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1632 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1633 src76_r, src87_r, src98_r, src109_r);
1634 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1635 src76_l, src87_l, src98_l, src109_l);
1638 filt1, filt2, filt3);
1640 filt1, filt2, filt3);
1642 filt1, filt2, filt3);
1644 filt1, filt2, filt3);
1646 filt1, filt2, filt3);
1648 filt1, filt2, filt3);
1650 filt1, filt2, filt3);
1652 filt1, filt2, filt3);
1655 offset_vec, rnd_vec, dst0, dst1,
1658 offset_vec, rnd_vec, dst4, dst5,
1662 ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
1663 dst_tmp += (4 * dst_stride);
1690 filter, height, weight,
1691 offset, rnd_val, 1);
1706 offset, rnd_val, 1);
1709 filter, 32, weight, offset, rnd_val);
1723 filter, height, weight,
1724 offset, rnd_val, 2);
1739 offset, rnd_val, 3);
1753 filter, height, weight,
1754 offset, rnd_val, 4);
1761 const int8_t *filter_x,
1762 const int8_t *filter_y,
1770 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1771 v8i16 filt0, filt1, filt2, filt3;
1772 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1773 v16i8 mask1, mask2, mask3;
1775 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1776 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1777 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1778 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1779 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1780 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1781 v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1784 src -= ((3 * src_stride) + 3);
1785 filter_vec =
LD_SH(filter_x);
1786 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1788 filter_vec =
LD_SH(filter_y);
1791 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1797 weight_vec = __msa_fill_w(weight);
1798 offset_vec = __msa_fill_w(offset);
1799 rnd_vec = __msa_fill_w(rnd_val);
1800 denom_vec = rnd_vec - 6;
1802 const_128 = __msa_ldi_w(128);
1803 const_128 *= weight_vec;
1804 offset_vec += __msa_srar_w(const_128, denom_vec);
1806 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1807 src += (7 * src_stride);
1811 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1812 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1813 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1814 vec8, vec9, vec10, vec11);
1815 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1816 vec12, vec13, vec14, vec15);
1830 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1832 for (loop_cnt = height >> 2; loop_cnt--;) {
1833 LD_SB4(src, src_stride, src7, src8, src9, src10);
1834 src += (4 * src_stride);
1837 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1838 vec0, vec1, vec2, vec3);
1839 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1840 vec4, vec5, vec6, vec7);
1846 dst76_r = __msa_ilvr_h(dst97, dst66);
1848 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1849 dst98_r = __msa_ilvr_h(dst66, dst108);
1851 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1852 filt_h1, filt_h2, filt_h3);
1853 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1854 filt_h1, filt_h2, filt_h3);
1855 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1856 filt_h1, filt_h2, filt_h3);
1857 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1858 filt_h1, filt_h2, filt_h3);
1860 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1861 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
1862 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
1863 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
1864 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
1865 ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
1867 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1868 out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
1869 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1870 dst += (4 * dst_stride);
1878 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1886 const int8_t *filter_x,
1887 const int8_t *filter_y,
1894 uint32_t loop_cnt, cnt;
1897 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1898 v8i16 filt0, filt1, filt2, filt3;
1899 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1900 v16i8 mask1, mask2, mask3;
1902 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1903 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1904 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1905 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1906 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1907 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1908 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1909 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1910 v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
1913 src -= ((3 * src_stride) + 3);
1915 weight_vec = __msa_fill_w(weight);
1916 offset_vec = __msa_fill_w(offset);
1917 rnd_vec = __msa_fill_w(rnd_val);
1918 denom_vec = rnd_vec - 6;
1920 const_128 = __msa_ldi_w(128);
1921 const_128 *= weight_vec;
1922 offset_vec += __msa_srar_w(const_128, denom_vec);
1924 filter_vec =
LD_SH(filter_x);
1925 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1927 filter_vec =
LD_SH(filter_y);
1929 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1935 for (cnt = width >> 3; cnt--;) {
1939 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1940 src_tmp += (7 * src_stride);
1943 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1944 vec0, vec1, vec2, vec3);
1945 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1946 vec4, vec5, vec6, vec7);
1947 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1948 vec8, vec9, vec10, vec11);
1949 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1950 vec12, vec13, vec14, vec15);
1960 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1961 vec0, vec1, vec2, vec3);
1962 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1963 vec4, vec5, vec6, vec7);
1964 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1965 vec8, vec9, vec10, vec11);
1973 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1974 dst10_r, dst32_r, dst54_r, dst21_r);
1975 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1976 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1977 dst10_l, dst32_l, dst54_l, dst21_l);
1978 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1980 for (loop_cnt = height >> 1; loop_cnt--;) {
1981 LD_SB2(src_tmp, src_stride, src7, src8);
1982 src_tmp += 2 * src_stride;
1985 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1986 vec0, vec1, vec2, vec3);
1992 filt_h0, filt_h1, filt_h2, filt_h3);
1994 filt_h0, filt_h1, filt_h2, filt_h3);
1999 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2000 vec0, vec1, vec2, vec3);
2006 filt_h0, filt_h1, filt_h2, filt_h3);
2008 filt_h0, filt_h1, filt_h2, filt_h3);
2012 MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2013 MUL2(dst1_r, weight_vec, dst1_l, weight_vec, dst1_r, dst1_l);
2014 SRAR_W4_SW(dst0_r, dst1_r, dst0_l, dst1_l, rnd_vec);
2015 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2016 ADD2(dst1_r, offset_vec, dst1_l, offset_vec, dst1_r, dst1_l);
2019 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
2020 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2021 ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
2022 dst_tmp += (2 * dst_stride);
2048 const int8_t *filter_x,
2049 const int8_t *filter_y,
2056 filter_x, filter_y, height, weight,
2057 offset, rnd_val, 8);
2064 const int8_t *filter_x,
2065 const int8_t *filter_y,
2074 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2075 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2076 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2077 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2078 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2079 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
2080 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2081 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst10_l, dst32_l, dst54_l;
2082 v8i16 dst98_r, dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
2083 v8i16 dst76_l, filter_vec;
2084 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
2085 v4i32 weight_vec, offset_vec, rnd_vec, const_128, denom_vec;
2087 src -= ((3 * src_stride) + 3);
2089 filter_vec =
LD_SH(filter_x);
2090 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2092 filter_vec =
LD_SH(filter_y);
2095 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2097 weight_vec = __msa_fill_w(weight);
2098 offset_vec = __msa_fill_w(offset);
2099 rnd_vec = __msa_fill_w(rnd_val);
2100 denom_vec = rnd_vec - 6;
2102 const_128 = __msa_ldi_w(128);
2103 const_128 *= weight_vec;
2104 offset_vec += __msa_srar_w(const_128, denom_vec);
2114 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2115 src_tmp += (7 * src_stride);
2119 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2120 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2121 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2123 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2133 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2134 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2135 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2144 for (loop_cnt = 16; loop_cnt--;) {
2145 src7 =
LD_SB(src_tmp);
2146 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
2147 src_tmp += src_stride;
2149 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2159 filt_h0, filt_h1, filt_h2, filt_h3);
2161 filt_h0, filt_h1, filt_h2, filt_h3);
2165 MUL2(dst0_r, weight_vec, dst0_l, weight_vec, dst0_r, dst0_l);
2167 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2169 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2170 out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2171 ST_D1(out, 0, dst_tmp);
2172 dst_tmp += dst_stride;
2191 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2192 src += (7 * src_stride);
2195 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2196 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2197 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2199 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2213 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2215 for (loop_cnt = 4; loop_cnt--;) {
2216 LD_SB4(src, src_stride, src7, src8, src9, src10);
2217 src += (4 * src_stride);
2220 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2222 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2229 dst76_r = __msa_ilvr_h(dst97, dst66);
2231 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2232 dst98_r = __msa_ilvr_h(dst66, dst108);
2234 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2235 filt_h1, filt_h2, filt_h3);
2236 dst1_r =
HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2237 filt_h1, filt_h2, filt_h3);
2238 dst2_r =
HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
2239 filt_h1, filt_h2, filt_h3);
2240 dst3_r =
HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
2241 filt_h1, filt_h2, filt_h3);
2243 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2244 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
2245 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
2246 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
2247 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
2248 ADD2(dst2_r, offset_vec, dst3_r, offset_vec, dst2_r, dst3_r);
2250 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2251 out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2252 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2253 dst += (4 * dst_stride);
2261 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2269 const int8_t *filter_x,
2270 const int8_t *filter_y,
2277 filter_x, filter_y, height, weight,
2278 offset, rnd_val, 16);
2285 const int8_t *filter_x,
2286 const int8_t *filter_y,
2293 filter_x, filter_y, height, weight,
2294 offset, rnd_val, 24);
2301 const int8_t *filter_x,
2302 const int8_t *filter_y,
2309 filter_x, filter_y, height, weight,
2310 offset, rnd_val, 32);
2317 const int8_t *filter_x,
2318 const int8_t *filter_y,
2325 filter_x, filter_y, height, weight,
2326 offset, rnd_val, 48);
2333 const int8_t *filter_x,
2334 const int8_t *filter_y,
2341 filter_x, filter_y, height, weight,
2342 offset, rnd_val, 64);
2359 v4i32 dst0_r, dst0_l;
2360 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2361 v4i32 weight_vec, rnd_vec;
2366 filter_vec =
LD_SH(filter);
2371 weight = weight & 0x0000FFFF;
2373 weight_vec = __msa_fill_w(weight);
2374 rnd_vec = __msa_fill_w(rnd_val);
2379 weight_vec_h = __msa_fill_h(weight);
2380 offset_vec = __msa_fill_h(offset);
2381 denom_vec = __msa_fill_h(rnd_val);
2383 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2384 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2386 LD_SB2(src, src_stride, src0, src1);
2389 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2393 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
2395 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2396 dst0 = __msa_adds_s_h(dst0, offset_vec);
2398 out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
2399 ST_W2(out, 0, 1, dst, dst_stride);
2400 dst += (4 * dst_stride);
2415 v16i8 mask1, vec0, vec1, vec2, vec3;
2417 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2418 v4i32 weight_vec, rnd_vec;
2424 filter_vec =
LD_SH(filter);
2429 weight = weight & 0x0000FFFF;
2431 weight_vec = __msa_fill_w(weight);
2432 rnd_vec = __msa_fill_w(rnd_val);
2437 weight_vec_h = __msa_fill_h(weight);
2438 offset_vec = __msa_fill_h(offset);
2439 denom_vec = __msa_fill_h(rnd_val);
2441 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2442 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2444 LD_SB4(src, src_stride, src0, src1, src2, src3);
2447 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2448 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2455 out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2456 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2457 dst += (4 * dst_stride);
2473 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2474 v16i8 mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2475 v8i16 dst0, dst1, dst2, dst3;
2477 v8i16 weight_vec_h, offset_vec, denom_vec;
2478 v4i32 weight_vec, rnd_vec;
2483 filter_vec =
LD_SH(filter);
2486 weight = weight & 0x0000FFFF;
2488 weight_vec = __msa_fill_w(weight);
2489 rnd_vec = __msa_fill_w(rnd_val);
2494 weight_vec_h = __msa_fill_h(weight);
2495 offset_vec = __msa_fill_h(offset);
2496 denom_vec = __msa_fill_h(rnd_val);
2498 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2499 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2503 for (loop_cnt = (height >> 3); loop_cnt--;) {
2504 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2505 src += (8 * src_stride);
2509 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2510 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec2, vec3);
2511 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec4, vec5);
2512 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec6, vec7);
2519 weight_vec, offset_vec, rnd_vec,
2520 dst0, dst1, dst2, dst3);
2523 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2524 dst += (8 * dst_stride);
2540 filter, weight, offset, rnd_val);
2541 }
else if (4 == height) {
2543 filter, weight, offset, rnd_val);
2544 }
else if (8 == height || 16 == height) {
2546 filter, height, weight,
2561 v16u8 out0, out1, out2, out3;
2563 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2566 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2567 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2568 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2569 v4i32 weight_vec, rnd_vec;
2573 filter_vec =
LD_SH(filter);
2576 weight = weight & 0x0000FFFF;
2578 weight_vec = __msa_fill_w(weight);
2579 rnd_vec = __msa_fill_w(rnd_val);
2584 weight_vec_h = __msa_fill_h(weight);
2585 offset_vec = __msa_fill_h(offset);
2586 denom_vec = __msa_fill_h(rnd_val);
2588 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2589 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2593 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2595 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2596 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2597 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2598 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2603 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2604 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2605 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2606 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2613 weight_vec, offset_vec, rnd_vec,
2614 dst0, dst1, dst2, dst3);
2616 weight_vec, offset_vec, rnd_vec,
2617 dst4, dst5, dst6, dst7);
2621 ST_W2(out0, 0, 2, dst, dst_stride);
2622 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2623 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2624 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2625 dst += (4 * dst_stride);
2626 ST_W2(out2, 0, 2, dst, dst_stride);
2627 ST_H2(out2, 2, 6, dst + 4, dst_stride);
2628 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
2629 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2642 v8i16 filt0, filt1, dst0, dst1;
2646 v16i8 vec0, vec1, vec2, vec3;
2647 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2648 v4i32 weight_vec, rnd_vec;
2652 filter_vec =
LD_SH(filter);
2655 weight = weight & 0x0000FFFF;
2657 weight_vec = __msa_fill_w(weight);
2658 rnd_vec = __msa_fill_w(rnd_val);
2663 weight_vec_h = __msa_fill_h(weight);
2664 offset_vec = __msa_fill_h(offset);
2665 denom_vec = __msa_fill_h(rnd_val);
2667 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2668 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2672 LD_SB2(src, src_stride, src0, src1);
2675 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2676 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2683 out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2684 ST_D2(out, 0, 1, dst, dst_stride);
2698 v16i8 mask0, mask1, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2699 v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
2700 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2701 v4i32 weight_vec, rnd_vec;
2705 filter_vec =
LD_SH(filter);
2708 weight = weight & 0x0000FFFF;
2709 weight_vec = __msa_fill_w(weight);
2710 rnd_vec = __msa_fill_w(rnd_val);
2715 weight_vec_h = __msa_fill_h(weight);
2716 offset_vec = __msa_fill_h(offset);
2717 denom_vec = __msa_fill_h(rnd_val);
2719 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2720 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2725 LD_SB4(src, src_stride, src0, src1, src2, src3);
2727 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2728 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2729 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2730 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2737 weight_vec, offset_vec, rnd_vec,
2738 dst0, dst1, dst2, dst3);
2741 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2753 v16u8 out0, out1, out2;
2755 v16i8
src0,
src1, src2, src3, src4, src5;
2759 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2760 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2761 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2762 v4i32 weight_vec, rnd_vec;
2766 filter_vec =
LD_SH(filter);
2769 weight = weight & 0x0000FFFF;
2771 weight_vec = __msa_fill_w(weight);
2772 rnd_vec = __msa_fill_w(rnd_val);
2777 weight_vec_h = __msa_fill_h(weight);
2778 offset_vec = __msa_fill_h(offset);
2779 denom_vec = __msa_fill_h(rnd_val);
2781 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2782 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2786 LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
2789 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2790 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2791 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2792 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2793 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
2794 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
2803 weight_vec, offset_vec, rnd_vec,
2804 dst0, dst1, dst2, dst3);
2809 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2810 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2811 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
2826 v16u8 out0, out1, out2, out3;
2827 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2830 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2831 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2832 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2833 v4i32 weight_vec, rnd_vec;
2837 filter_vec =
LD_SH(filter);
2840 weight = weight & 0x0000FFFF;
2842 weight_vec = __msa_fill_w(weight);
2843 rnd_vec = __msa_fill_w(rnd_val);
2848 weight_vec_h = __msa_fill_h(weight);
2849 offset_vec = __msa_fill_h(offset);
2850 denom_vec = __msa_fill_h(rnd_val);
2852 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2853 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2857 for (loop_cnt = (height >> 3); loop_cnt--;) {
2858 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2859 src += (8 * src_stride);
2862 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2863 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2864 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2865 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2870 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2871 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
2872 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
2873 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
2880 weight_vec, offset_vec, rnd_vec,
2881 dst0, dst1, dst2, dst3);
2884 weight_vec, offset_vec, rnd_vec,
2885 dst4, dst5, dst6, dst7);
2889 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
2890 dst += (8 * dst_stride);
2906 filter, weight, offset, rnd_val);
2907 }
else if (4 == height) {
2909 filter, weight, offset, rnd_val);
2910 }
else if (6 == height) {
2912 filter, weight, offset, rnd_val);
2915 filter, height, weight, offset,
2931 v16u8 out0, out1, out2;
2935 v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2938 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2939 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2940 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
2942 v4i32 weight_vec, rnd_vec;
2946 filter_vec =
LD_SH(filter);
2949 weight = weight & 0x0000FFFF;
2951 weight_vec = __msa_fill_w(weight);
2952 rnd_vec = __msa_fill_w(rnd_val);
2957 weight_vec_h = __msa_fill_h(weight);
2958 offset_vec = __msa_fill_h(offset);
2959 denom_vec = __msa_fill_h(rnd_val);
2961 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
2962 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
2967 for (loop_cnt = 4; loop_cnt--;) {
2968 LD_SB4(src, src_stride, src0, src1, src2, src3);
2969 src += (4 * src_stride);
2973 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2974 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2975 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2976 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
2977 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec8, vec9);
2978 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec10, vec11);
2987 weight_vec, offset_vec, rnd_vec,
2988 dst0, dst1, dst2, dst3);
2991 rnd_vec, dst4, dst5);
2993 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
2994 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2995 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
2996 dst += (4 * dst_stride);
3011 v16u8 out0, out1, out2, out3;
3012 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
3016 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3017 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3018 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3019 v4i32 weight_vec, rnd_vec;
3023 filter_vec =
LD_SH(filter);
3026 weight = weight & 0x0000FFFF;
3028 weight_vec = __msa_fill_w(weight);
3029 rnd_vec = __msa_fill_w(rnd_val);
3034 weight_vec_h = __msa_fill_h(weight);
3035 offset_vec = __msa_fill_h(offset);
3036 denom_vec = __msa_fill_h(rnd_val);
3038 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3039 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3043 for (loop_cnt = (height >> 2); loop_cnt--;) {
3044 LD_SB4(src, src_stride, src0, src2, src4, src6);
3045 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3046 src += (4 * src_stride);
3050 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3051 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3052 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3053 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3058 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3059 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec2, vec3);
3060 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec4, vec5);
3061 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec6, vec7);
3068 weight_vec, offset_vec, rnd_vec,
3069 dst0, dst1, dst2, dst3);
3072 weight_vec, offset_vec, rnd_vec,
3073 dst4, dst5, dst6, dst7);
3075 PCKEV_B4_UB(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3076 out0, out1, out2, out3);
3078 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3079 dst += (4 * dst_stride);
3094 v16u8 out0, out1, out2;
3097 v16i8 mask0, mask1, mask2, mask3;
3098 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3099 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3100 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3101 v4i32 weight_vec, rnd_vec;
3105 filter_vec =
LD_SH(filter);
3108 weight = weight & 0x0000FFFF;
3109 weight_vec = __msa_fill_w(weight);
3110 rnd_vec = __msa_fill_w(rnd_val);
3115 weight_vec_h = __msa_fill_h(weight);
3116 offset_vec = __msa_fill_h(offset);
3117 denom_vec = __msa_fill_h(rnd_val);
3119 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3120 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3127 for (loop_cnt = 16; loop_cnt--;) {
3128 LD_SB2(src, src_stride, src0, src2);
3129 LD_SB2(src + 16, src_stride, src1, src3);
3130 src += (2 * src_stride);
3134 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3135 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3136 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3137 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec6, vec7);
3142 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3143 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec2, vec3);
3148 weight_vec, offset_vec, rnd_vec,
3149 dst0, dst1, dst2, dst3);
3152 rnd_vec, dst4, dst5);
3154 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3155 ST_UB2(out0, out1, dst, dst_stride);
3156 ST_D2(out2, 0, 1, dst + 16, dst_stride);
3157 dst += (2 * dst_stride);
3172 v16u8 out0, out1, out2, out3;
3173 v16i8
src0,
src1, src2, src3, src4, src5;
3176 v16i8 mask1, mask2, mask3;
3177 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3178 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3179 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3180 v4i32 weight_vec, rnd_vec;
3184 filter_vec =
LD_SH(filter);
3187 weight = weight & 0x0000FFFF;
3189 weight_vec = __msa_fill_w(weight);
3190 rnd_vec = __msa_fill_w(rnd_val);
3195 weight_vec_h = __msa_fill_h(weight);
3196 offset_vec = __msa_fill_h(offset);
3197 denom_vec = __msa_fill_h(rnd_val);
3199 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3200 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3206 for (loop_cnt = (height >> 1); loop_cnt--;) {
3207 LD_SB2(src, 16, src0, src1);
3208 src2 =
LD_SB(src + 24);
3210 LD_SB2(src, 16, src3, src4);
3211 src5 =
LD_SB(src + 24);
3214 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3215 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec2, vec3);
3216 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec4, vec5);
3217 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec6, vec7);
3222 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3223 VSHF_B2_SB(src3, src4, src3, src4, mask2, mask3, vec2, vec3);
3224 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec4, vec5);
3225 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec6, vec7);
3232 weight_vec, offset_vec, rnd_vec,
3233 dst0, dst1, dst2, dst3);
3236 weight_vec, offset_vec, rnd_vec,
3237 dst4, dst5, dst6, dst7);
3241 ST_UB2(out0, out1, dst, 16);
3243 ST_UB2(out2, out3, dst, 16);
3258 v16i8
src0,
src1, src2, src3, src4;
3259 v16i8 src10_r, src32_r, src21_r, src43_r;
3260 v16i8 src2110, src4332;
3262 v4i32 dst0_r, dst0_l;
3264 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3265 v4i32 weight_vec, rnd_vec;
3269 weight = weight & 0x0000FFFF;
3271 weight_vec = __msa_fill_w(weight);
3272 rnd_vec = __msa_fill_w(rnd_val);
3277 weight_vec_h = __msa_fill_h(weight);
3278 offset_vec = __msa_fill_h(offset);
3279 denom_vec = __msa_fill_h(rnd_val);
3281 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3282 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3284 filter_vec =
LD_SH(filter);
3287 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3288 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3289 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3290 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3294 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
3296 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
3297 dst0 = __msa_adds_s_h(dst0, offset_vec);
3299 out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
3300 ST_W2(out, 0, 1, dst, dst_stride);
3313 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3314 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3315 v16i8 src2110, src4332, src6554;
3318 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3319 v4i32 weight_vec, rnd_vec;
3323 weight = weight & 0x0000FFFF;
3325 weight_vec = __msa_fill_w(weight);
3326 rnd_vec = __msa_fill_w(rnd_val);
3331 weight_vec_h = __msa_fill_h(weight);
3332 offset_vec = __msa_fill_h(offset);
3333 denom_vec = __msa_fill_h(rnd_val);
3335 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3336 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3338 filter_vec =
LD_SH(filter);
3341 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3342 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3343 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3344 src32_r, src43_r, src54_r, src65_r);
3345 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3346 src2110, src4332, src6554);
3353 out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3354 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3369 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3370 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3371 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3372 v16i8 src2110, src4332, src6554, src8776;
3374 v8i16 dst0, dst1, dst2, dst3, filt0, filt1;
3375 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3376 v4i32 weight_vec, rnd_vec;
3380 weight = weight & 0x0000FFFF;
3382 weight_vec = __msa_fill_w(weight);
3383 rnd_vec = __msa_fill_w(rnd_val);
3388 weight_vec_h = __msa_fill_h(weight);
3389 offset_vec = __msa_fill_h(offset);
3390 denom_vec = __msa_fill_h(rnd_val);
3392 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3393 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3395 filter_vec =
LD_SH(filter);
3398 LD_SB3(src, src_stride, src0, src1, src2);
3399 src += (3 * src_stride);
3400 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3401 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3402 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3404 for (loop_cnt = (height >> 3); loop_cnt--;) {
3406 src3, src4, src5, src6, src7, src8, src9, src10);
3407 src += (8 * src_stride);
3408 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3409 src32_r, src43_r, src54_r, src65_r);
3410 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3411 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3412 ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3413 src109_r, src98_r, src4332, src6554, src8776, src10998);
3421 weight_vec, offset_vec, rnd_vec,
3422 dst0, dst1, dst2, dst3);
3425 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3426 dst += (8 * dst_stride);
3445 filter, weight, offset, rnd_val);
3446 }
else if (4 == height) {
3448 filter, weight, offset, rnd_val);
3449 }
else if (0 == (height % 8)) {
3451 filter, height, weight, offset,
3466 v16u8 out0, out1, out2, out3;
3467 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3468 v16i8 src10_r, src32_r, src21_r, src43_r;
3469 v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3471 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3472 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3473 v4i32 weight_vec, rnd_vec;
3477 weight = weight & 0x0000FFFF;
3479 weight_vec = __msa_fill_w(weight);
3480 rnd_vec = __msa_fill_w(rnd_val);
3485 weight_vec_h = __msa_fill_h(weight);
3486 offset_vec = __msa_fill_h(offset);
3487 denom_vec = __msa_fill_h(rnd_val);
3489 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3490 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3492 filter_vec =
LD_SH(filter);
3495 LD_SB3(src, src_stride, src0, src1, src2);
3496 src += (3 * src_stride);
3497 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3500 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3501 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3502 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3503 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3504 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3515 weight_vec, offset_vec, rnd_vec,
3516 dst0, dst1, dst2, dst3);
3518 weight_vec, offset_vec, rnd_vec,
3519 dst4, dst5, dst6, dst7);
3523 ST_W2(out0, 0, 2, dst, dst_stride);
3524 ST_H2(out0, 2, 6, dst + 4, dst_stride);
3525 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
3526 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3527 dst += (4 * dst_stride);
3528 ST_W2(out2, 0, 2, dst, dst_stride);
3529 ST_H2(out2, 2, 6, dst + 4, dst_stride);
3530 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
3531 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3544 v16i8
src0,
src1, src2, src3, src4;
3545 v16i8 src10_r, src32_r, src21_r, src43_r;
3548 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3549 v4i32 weight_vec, rnd_vec;
3553 weight = weight & 0x0000FFFF;
3555 weight_vec = __msa_fill_w(weight);
3556 rnd_vec = __msa_fill_w(rnd_val);
3561 weight_vec_h = __msa_fill_h(weight);
3562 offset_vec = __msa_fill_h(offset);
3563 denom_vec = __msa_fill_h(rnd_val);
3565 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3566 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3568 filter_vec =
LD_SH(filter);
3571 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3573 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3574 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3581 out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3582 ST_D2(out, 0, 1, dst, dst_stride);
3595 v16i8
src0,
src1, src2, src3, src4;
3596 v16i8 src10_r, src32_r, src21_r, src43_r;
3597 v16i8 src5, src6, src54_r, src65_r;
3599 v8i16 dst0, dst1, dst2, dst3;
3600 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3601 v4i32 weight_vec, rnd_vec;
3605 weight = weight & 0x0000FFFF;
3607 weight_vec = __msa_fill_w(weight);
3608 rnd_vec = __msa_fill_w(rnd_val);
3613 weight_vec_h = __msa_fill_h(weight);
3614 offset_vec = __msa_fill_h(offset);
3615 denom_vec = __msa_fill_h(rnd_val);
3617 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3618 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3620 filter_vec =
LD_SH(filter);
3623 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3624 src += (3 * src_stride);
3626 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3627 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3628 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3634 offset_vec, rnd_vec, dst0, dst1, dst2,
3637 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3649 v16u8 out0, out1, out2;
3650 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3651 v16i8 src10_r, src32_r, src54_r, src76_r;
3652 v16i8 src21_r, src43_r, src65_r, src87_r;
3653 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3655 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3656 v4i32 weight_vec, rnd_vec;
3660 weight = weight & 0x0000FFFF;
3662 weight_vec = __msa_fill_w(weight);
3663 rnd_vec = __msa_fill_w(rnd_val);
3668 weight_vec_h = __msa_fill_h(weight);
3669 offset_vec = __msa_fill_h(offset);
3670 denom_vec = __msa_fill_h(rnd_val);
3672 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3673 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3675 filter_vec =
LD_SH(filter);
3678 LD_SB3(src, src_stride, src0, src1, src2);
3679 src += (3 * src_stride);
3680 LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3684 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3686 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3695 offset_vec, rnd_vec, dst0, dst1, dst2, dst3);
3698 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3699 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3700 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
3714 v16u8 out0, out1, out2, out3;
3715 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3716 v16i8 src10_r, src32_r, src21_r, src43_r;
3717 v16i8 src54_r, src65_r, src76_r, src87_r, src98_r, src109_r;
3719 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3720 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3721 v4i32 weight_vec, rnd_vec;
3725 weight = weight & 0x0000FFFF;
3727 weight_vec = __msa_fill_w(weight);
3728 rnd_vec = __msa_fill_w(rnd_val);
3733 weight_vec_h = __msa_fill_h(weight);
3734 offset_vec = __msa_fill_h(offset);
3735 denom_vec = __msa_fill_h(rnd_val);
3737 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3738 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3740 filter_vec =
LD_SH(filter);
3743 LD_SB3(src, src_stride, src0, src1, src2);
3744 src += (3 * src_stride);
3746 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3748 for (loop_cnt = (height >> 3); loop_cnt--;) {
3750 src3, src4, src5, src6, src7, src8, src9, src10);
3751 src += (8 * src_stride);
3753 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3754 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3755 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3756 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3766 offset_vec, rnd_vec, dst0, dst1, dst2,
3769 offset_vec, rnd_vec, dst4, dst5, dst6,
3773 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
3774 dst += (8 * dst_stride);
3794 filter, weight, offset, rnd_val);
3795 }
else if (4 == height) {
3797 filter, weight, offset, rnd_val);
3798 }
else if (6 == height) {
3800 filter, weight, offset, rnd_val);
3803 filter, height, weight, offset,
3819 v16u8 out0, out1, out2, out3, out4, out5;
3820 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3821 v16i8 src10_r, src32_r, src21_r, src43_r;
3822 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3823 v16i8 src2110, src4332;
3824 v16i8 src54_r, src76_r, src98_r, src65_r, src87_r, src109_r;
3825 v16i8 src76_l, src98_l, src87_l, src109_l, src6554, src8776, src10998;
3827 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3828 v8i16 dst9, dst10, dst11, filter_vec, weight_vec_h, offset_vec, denom_vec;
3829 v4i32 weight_vec, rnd_vec;
3831 src -= (1 * src_stride);
3833 weight = weight & 0x0000FFFF;
3835 weight_vec = __msa_fill_w(weight);
3836 rnd_vec = __msa_fill_w(rnd_val);
3841 weight_vec_h = __msa_fill_h(weight);
3842 offset_vec = __msa_fill_h(offset);
3843 denom_vec = __msa_fill_h(rnd_val);
3845 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3846 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3848 filter_vec =
LD_SH(filter);
3851 LD_SB3(src, src_stride, src0, src1, src2);
3852 src += (3 * src_stride);
3854 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3855 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3856 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3858 for (loop_cnt = 2; loop_cnt--;) {
3859 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3860 src += (8 * src_stride);
3866 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3867 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3875 offset_vec, rnd_vec, dst0, dst1, dst2,
3878 rnd_vec, dst4, dst5);
3879 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
3880 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
3881 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
3882 dst += (4 * dst_stride);
3888 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
3889 src10998 = (v16i8) __msa_ilvr_d((v2i64) src109_l, (v2i64) src98_l);
3897 offset_vec, rnd_vec, dst6, dst7, dst8,
3900 rnd_vec, dst10, dst11);
3901 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
3902 ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
3903 ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
3904 dst += (4 * dst_stride);
3924 v16u8 out0, out1, out2, out3;
3925 v16i8
src0,
src1, src2, src3, src4, src5;
3926 v16i8 src10_r, src32_r, src21_r, src43_r;
3927 v16i8 src10_l, src32_l, src21_l, src43_l;
3928 v16i8 src54_r, src54_l, src65_r, src65_l, src6;
3930 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3931 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
3932 v4i32 weight_vec, rnd_vec;
3936 weight = weight & 0x0000FFFF;
3938 weight_vec = __msa_fill_w(weight);
3939 rnd_vec = __msa_fill_w(rnd_val);
3944 weight_vec_h = __msa_fill_h(weight);
3945 offset_vec = __msa_fill_h(offset);
3946 denom_vec = __msa_fill_h(rnd_val);
3948 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
3949 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
3951 filter_vec =
LD_SH(filter);
3954 LD_SB3(src, src_stride, src0, src1, src2);
3955 src += (3 * src_stride);
3957 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3958 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3960 for (loop_cnt = (height >> 2); loop_cnt--;) {
3961 LD_SB4(src, src_stride, src3, src4, src5, src6);
3962 src += (4 * src_stride);
3977 offset_vec, rnd_vec, dst0, dst1, dst2,
3980 offset_vec, rnd_vec, dst4, dst5, dst6,
3982 PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
3984 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
3985 dst += (4 * dst_stride);
4006 v16u8 out0, out1, out2, out3, out4, out5;
4007 v16i8
src0,
src1, src2, src3, src4, src5;
4008 v16i8 src6, src7, src8, src9, src10, src11, src12, src13;
4009 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
4010 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4011 v16i8 src87_r, src98_r, src109_r, src1110_r, src1211_r, src1312_r;
4013 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
4014 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec, dst11;
4015 v4i32 weight_vec, rnd_vec;
4019 weight = weight & 0x0000FFFF;
4021 weight_vec = __msa_fill_w(weight);
4022 rnd_vec = __msa_fill_w(rnd_val);
4027 weight_vec_h = __msa_fill_h(weight);
4028 offset_vec = __msa_fill_h(offset);
4029 denom_vec = __msa_fill_h(rnd_val);
4031 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4032 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4034 filter_vec =
LD_SH(filter);
4037 LD_SB3(src, src_stride, src0, src1, src2);
4038 LD_SB3(src + 16, src_stride, src7, src8, src9);
4039 src += (3 * src_stride);
4042 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4043 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4044 ILVR_B2_SB(src8, src7, src9, src8, src87_r, src98_r);
4046 for (loop_cnt = 8; loop_cnt--;) {
4047 LD_SB4(src, src_stride, src3, src4, src5, src6);
4048 LD_SB4(src + 16, src_stride, src10, src11, src12, src13);
4049 src += (4 * src_stride);
4052 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4053 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4056 ILVR_B2_SB(src10, src9, src11, src10, src109_r, src1110_r);
4057 ILVR_B2_SB(src12, src11, src13, src12, src1211_r, src1312_r);
4071 offset_vec, rnd_vec, dst0, dst1, dst2,
4074 offset_vec, rnd_vec, dst4, dst5, dst6,
4077 offset_vec, rnd_vec, dst8, dst9, dst10,
4079 PCKEV_B4_UB(dst4, dst0, dst5, dst1, dst6, dst2, dst7, dst3, out0, out1,
4081 PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
4082 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
4083 ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
4084 dst += (4 * dst_stride);
4092 src87_r = src1211_r;
4093 src98_r = src1312_r;
4108 v16u8 out0, out1, out2, out3;
4109 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9;
4110 v16i8 src10_r, src32_r, src76_r, src98_r;
4111 v16i8 src21_r, src43_r, src65_r, src87_r;
4112 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4113 v16i8 src10_l, src32_l, src76_l, src98_l;
4114 v16i8 src21_l, src43_l, src65_l, src87_l;
4116 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
4117 v4i32 weight_vec, rnd_vec;
4121 weight = weight & 0x0000FFFF;
4123 weight_vec = __msa_fill_w(weight);
4124 rnd_vec = __msa_fill_w(rnd_val);
4129 weight_vec_h = __msa_fill_h(weight);
4130 offset_vec = __msa_fill_h(offset);
4131 denom_vec = __msa_fill_h(rnd_val);
4133 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
4134 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
4136 filter_vec =
LD_SH(filter);
4139 LD_SB3(src, src_stride, src0, src1, src2);
4140 LD_SB3(src + 16, src_stride, src5, src6, src7);
4141 src += (3 * src_stride);
4143 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4144 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4145 ILVR_B2_SB(src6, src5, src7, src6, src65_r, src76_r);
4146 ILVL_B2_SB(src6, src5, src7, src6, src65_l, src76_l);
4148 for (loop_cnt = (height >> 1); loop_cnt--;) {
4149 LD_SB2(src, src_stride, src3, src4);
4150 LD_SB2(src + 16, src_stride, src8, src9);
4151 src += (2 * src_stride);
4153 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4154 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4166 offset_vec, rnd_vec, dst0, dst1, dst2,
4169 offset_vec, rnd_vec, dst4, dst5, dst6,
4171 PCKEV_B4_UB(dst2, dst0, dst3, dst1, dst6, dst4, dst7, dst5, out0, out1,
4173 ST_UB2(out0, out2, dst, 16);
4175 ST_UB2(out1, out3, dst, 16);
4195 const int8_t *filter_x,
4196 const int8_t *filter_y,
4202 v16i8
src0,
src1, src2, src3, src4;
4206 v8i16 filt_h0, filt_h1, filter_vec,
tmp;
4207 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4208 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
4209 v8i16 offset_vec, const_128, denom_vec;
4210 v4i32 dst0, dst1, weight_vec, rnd_vec;
4212 src -= (src_stride + 1);
4214 filter_vec =
LD_SH(filter_x);
4217 filter_vec =
LD_SH(filter_y);
4224 weight_vec = __msa_fill_w(weight);
4225 rnd_vec = __msa_fill_w(rnd_val);
4227 offset_vec = __msa_fill_h(offset);
4228 denom_vec = __msa_fill_h(rnd_val - 6);
4229 const_128 = __msa_fill_h((128 * weight));
4230 offset_vec += __msa_srar_h(const_128, denom_vec);
4232 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4234 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4235 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4236 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4246 MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4248 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4251 out = (v16u8) __msa_pckev_b((v16i8)
tmp, (v16i8) tmp);
4252 ST_W2(out, 0, 1, dst, dst_stride);
4259 const int8_t *filter_x,
4260 const int8_t *filter_y,
4266 v16i8
src0,
src1, src2, src3, src4, src5, src6;
4268 v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1;
4271 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4272 v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
4273 v8i16 offset_vec, const_128, denom_vec;
4274 v4i32 dst0, dst1, dst2, dst3, weight_vec, rnd_vec;
4276 src -= (src_stride + 1);
4278 filter_vec =
LD_SH(filter_x);
4281 filter_vec =
LD_SH(filter_y);
4288 weight_vec = __msa_fill_w(weight);
4289 rnd_vec = __msa_fill_w(rnd_val);
4291 offset_vec = __msa_fill_h(offset);
4292 denom_vec = __msa_fill_h(rnd_val - 6);
4293 const_128 = __msa_fill_h((128 * weight));
4294 offset_vec += __msa_srar_h(const_128, denom_vec);
4296 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4298 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4299 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4300 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4301 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4313 SRA_4V(dst0, dst1, dst2, dst3, 6);
4314 MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4315 MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4318 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4320 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4321 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4328 const int8_t *filter_x,
4329 const int8_t *filter_y,
4337 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4341 v8i16 filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4342 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4343 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4344 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4345 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4346 v8i16 dst98_r, dst109_r, offset_vec, const_128, denom_vec;
4347 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
4349 src -= (src_stride + 1);
4351 filter_vec =
LD_SH(filter_x);
4354 filter_vec =
LD_SH(filter_y);
4361 weight_vec = __msa_fill_w(weight);
4362 rnd_vec = __msa_fill_w(rnd_val);
4364 offset_vec = __msa_fill_h(offset);
4365 denom_vec = __msa_fill_h(rnd_val - 6);
4366 const_128 = __msa_fill_h((128 * weight));
4367 offset_vec += __msa_srar_h(const_128, denom_vec);
4369 LD_SB3(src, src_stride, src0, src1, src2);
4370 src += (3 * src_stride);
4373 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4374 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4378 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4380 for (loop_cnt = height >> 3; loop_cnt--;) {
4382 src3, src4, src5, src6, src7, src8, src9, src10);
4383 src += (8 * src_stride);
4386 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4387 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4388 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4389 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4394 dst32_r = __msa_ilvr_h(dst73, dst22);
4398 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4399 dst76_r = __msa_ilvr_h(dst22, dst106);
4408 SRA_4V(dst0, dst1, dst2, dst3, 6);
4409 SRA_4V(dst4, dst5, dst6, dst7, 6);
4410 MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
4411 MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
4412 MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
4413 MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
4416 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4418 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4419 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4422 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4423 dst += (8 * dst_stride);
4427 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4435 const int8_t *filter_x,
4436 const int8_t *filter_y,
4444 filter_x, filter_y, weight,
4446 }
else if (4 == height) {
4448 filter_x,filter_y, weight,
4450 }
else if (0 == (height % 8)) {
4452 filter_x, filter_y, height, weight,
4461 const int8_t *filter_x,
4462 const int8_t *filter_y,
4468 v16u8 out0, out1, out2;
4469 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4473 v8i16 filt_h0, filt_h1, filter_vec;
4474 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4475 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4476 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4477 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4478 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4479 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4480 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4481 v8i16 offset_vec, const_128, denom_vec;
4482 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4483 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, weight_vec, rnd_vec;
4485 src -= (src_stride + 1);
4487 filter_vec =
LD_SH(filter_x);
4490 filter_vec =
LD_SH(filter_y);
4497 weight_vec = __msa_fill_w(weight);
4498 rnd_vec = __msa_fill_w(rnd_val);
4500 offset_vec = __msa_fill_h(offset);
4501 denom_vec = __msa_fill_h(rnd_val - 6);
4502 const_128 = __msa_fill_h((128 * weight));
4503 offset_vec += __msa_srar_h(const_128, denom_vec);
4505 LD_SB3(src, src_stride, src0, src1, src2);
4506 src += (3 * src_stride);
4509 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4510 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4511 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4518 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
4520 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4521 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4522 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4523 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4528 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4529 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4530 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4531 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4544 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4545 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4546 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4558 dst3_l =
HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4559 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4560 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4561 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4562 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4563 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4564 MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4565 MUL2(dst6_r, weight_vec, dst7_r, weight_vec, dst6_r, dst7_r);
4566 MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4567 MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4568 SRAR_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, rnd_vec);
4569 SRAR_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, rnd_vec);
4570 SRAR_W4_SW(dst0_l, dst1_l, dst2_l, dst3_l, rnd_vec);
4571 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4572 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4573 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4574 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4575 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4576 ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4579 PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4580 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4581 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
4588 const int8_t *filter_x,
4589 const int8_t *filter_y,
4595 v16i8
src0,
src1, src2, src3, src4;
4597 v8i16 filt_h0, filt_h1, filter_vec;
4600 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4601 v8i16 dst0, dst1, dst2, dst3, dst4;
4602 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4603 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4604 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4606 v8i16 offset_vec, const_128, denom_vec;
4607 v4i32 weight_vec, rnd_vec;
4609 src -= (src_stride + 1);
4611 filter_vec =
LD_SH(filter_x);
4614 filter_vec =
LD_SH(filter_y);
4621 weight_vec = __msa_fill_w(weight);
4622 rnd_vec = __msa_fill_w(rnd_val);
4624 offset_vec = __msa_fill_h(offset);
4625 denom_vec = __msa_fill_h(rnd_val - 6);
4626 const_128 = __msa_fill_h((128 * weight));
4627 offset_vec += __msa_srar_h(const_128, denom_vec);
4629 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4631 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4632 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4633 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4634 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4635 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4649 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4650 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4651 MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4652 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4653 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4654 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4656 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4657 ST_D2(out, 0, 1, dst, dst_stride);
4664 const int8_t *filter_x,
4665 const int8_t *filter_y,
4673 v16i8
src0,
src1, src2, src3, src4, src5, src6, mask0, mask1;
4674 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4675 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
4676 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4677 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4678 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4679 v8i16 offset_vec, const_128, denom_vec;
4680 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4681 v4i32 weight_vec, rnd_vec;
4683 src -= (src_stride + 1);
4685 filter_vec =
LD_SH(filter_x);
4688 filter_vec =
LD_SH(filter_y);
4696 weight_vec = __msa_fill_w(weight);
4697 rnd_vec = __msa_fill_w(rnd_val);
4699 offset_vec = __msa_fill_h(offset);
4700 denom_vec = __msa_fill_h(rnd_val - 6);
4701 const_128 = __msa_fill_h((128 * weight));
4702 offset_vec += __msa_srar_h(const_128, denom_vec);
4704 for (cnt = width8mult; cnt--;) {
4705 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
4708 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4709 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4710 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4716 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4717 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4718 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4719 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4736 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4737 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4738 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4739 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4740 MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4741 MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4742 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4743 SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4744 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4745 dst3_r, tmp0, tmp1, tmp2, tmp3);
4746 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4747 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4750 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4759 const int8_t *filter_x,
4760 const int8_t *filter_y,
4765 v16u8 out0, out1, out2;
4766 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4768 v8i16 filt_h0, filt_h1, filter_vec;
4771 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4772 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4773 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4774 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4775 v4i32 dst4_r, dst4_l, dst5_r, dst5_l, weight_vec, rnd_vec;
4776 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4777 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4778 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4779 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4780 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4781 v8i16 offset_vec, const_128, denom_vec;
4783 src -= (src_stride + 1);
4785 filter_vec =
LD_SH(filter_x);
4788 filter_vec =
LD_SH(filter_y);
4795 weight_vec = __msa_fill_w(weight);
4796 rnd_vec = __msa_fill_w(rnd_val);
4798 offset_vec = __msa_fill_h(offset);
4799 denom_vec = __msa_fill_h(rnd_val - 6);
4800 const_128 = __msa_fill_h((128 * weight));
4801 offset_vec += __msa_srar_h(const_128, denom_vec);
4803 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
4804 src += (5 * src_stride);
4805 LD_SB4(src, src_stride, src5, src6, src7, src8);
4808 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4809 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4810 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4811 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4812 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4813 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4814 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4815 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4816 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4846 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4847 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4848 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4849 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4850 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4851 MUL2(dst4_r, weight_vec, dst5_r, weight_vec, dst4_r, dst5_r);
4852 MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4853 MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4854 MUL2(dst4_l, weight_vec, dst5_l, weight_vec, dst4_l, dst5_l);
4855 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4856 SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4857 SRAR_W4_SW(dst4_r, dst4_l, dst5_r, dst5_l, rnd_vec);
4858 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4859 tmp0, tmp1, tmp2, tmp3);
4860 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
4861 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4862 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4863 ADD2(tmp4, offset_vec, tmp5, offset_vec, tmp4, tmp5);
4866 PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
4867 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4868 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
4875 const int8_t *filter_x,
4876 const int8_t *filter_y,
4883 uint32_t loop_cnt, cnt;
4887 v16i8
src0,
src1, src2, src3, src4, src5, src6;
4889 v8i16 filt_h0, filt_h1, filter_vec;
4892 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4893 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4894 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4895 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4896 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4897 v8i16 offset_vec, const_128, denom_vec;
4898 v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4899 v4i32 weight_vec, rnd_vec;
4901 src -= (src_stride + 1);
4903 filter_vec =
LD_SH(filter_x);
4906 filter_vec =
LD_SH(filter_y);
4913 weight_vec = __msa_fill_w(weight);
4914 rnd_vec = __msa_fill_w(rnd_val);
4916 offset_vec = __msa_fill_h(offset);
4917 denom_vec = __msa_fill_h(rnd_val - 6);
4918 const_128 = __msa_fill_h((128 * weight));
4919 offset_vec += __msa_srar_h(const_128, denom_vec);
4921 for (cnt = width8mult; cnt--;) {
4925 LD_SB3(src_tmp, src_stride, src0, src1, src2);
4926 src_tmp += (3 * src_stride);
4929 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4930 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4931 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4939 for (loop_cnt = height >> 2; loop_cnt--;) {
4940 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4941 src_tmp += (4 * src_stride);
4944 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4945 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4946 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4947 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4964 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4965 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4966 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4967 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
4968 MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
4969 MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
4970 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
4971 SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
4972 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4973 dst3_r, tmp0, tmp1, tmp2, tmp3);
4974 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
4975 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
4978 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4979 dst_tmp += (4 * dst_stride);
4997 const int8_t *filter_x,
4998 const int8_t *filter_y,
5007 filter_x, filter_y, weight,
5009 }
else if (4 == height) {
5011 filter_x, filter_y, 1, weight,
5013 }
else if (6 == height) {
5015 filter_x, filter_y, weight,
5017 }
else if (0 == (height % 4)) {
5019 filter_x, filter_y, height, weight,
5020 offset, rnd_val, 1);
5028 const int8_t *filter_x,
5029 const int8_t *filter_y,
5038 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5039 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5040 v16i8 mask0, mask1, mask2, mask3;
5041 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
5042 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5043 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5044 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5045 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5046 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5047 v8i16 offset_vec, const_128, denom_vec;
5048 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5049 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, weight_vec, rnd_vec;
5051 src -= (src_stride + 1);
5053 filter_vec =
LD_SH(filter_x);
5056 filter_vec =
LD_SH(filter_y);
5064 weight_vec = __msa_fill_w(weight);
5065 rnd_vec = __msa_fill_w(rnd_val);
5067 offset_vec = __msa_fill_h(offset);
5068 denom_vec = __msa_fill_h(rnd_val - 6);
5069 const_128 = __msa_fill_h((128 * weight));
5070 offset_vec += __msa_srar_h(const_128, denom_vec);
5075 LD_SB3(src_tmp, src_stride, src0, src1, src2);
5076 src_tmp += (3 * src_stride);
5078 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5079 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5080 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5087 for (loop_cnt = 4; loop_cnt--;) {
5088 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
5089 src_tmp += (4 * src_stride);
5091 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5092 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5093 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5094 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5111 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5112 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5113 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
5114 MUL2(dst2_r, weight_vec, dst3_r, weight_vec, dst2_r, dst3_r);
5115 MUL2(dst0_l, weight_vec, dst1_l, weight_vec, dst0_l, dst1_l);
5116 MUL2(dst2_l, weight_vec, dst3_l, weight_vec, dst2_l, dst3_l);
5117 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5118 SRAR_W4_SW(dst2_r, dst2_l, dst3_r, dst3_l, rnd_vec);
5119 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5120 dst3_r, tmp0, tmp1, tmp2, tmp3);
5121 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5122 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5125 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
5126 dst_tmp += (4 * dst_stride);
5141 LD_SB3(src, src_stride, src0, src1, src2);
5142 src += (3 * src_stride);
5144 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5145 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5149 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5151 for (loop_cnt = 2; loop_cnt--;) {
5152 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
5154 src += (8 * src_stride);
5156 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5157 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5158 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5159 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5164 dst32_r = __msa_ilvr_h(dst73, dst22);
5168 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5169 dst76_r = __msa_ilvr_h(dst22, dst106);
5178 SRA_4V(dst0, dst1, dst2, dst3, 6);
5179 SRA_4V(dst4, dst5, dst6, dst7, 6);
5180 MUL2(dst0, weight_vec, dst1, weight_vec, dst0, dst1);
5181 MUL2(dst2, weight_vec, dst3, weight_vec, dst2, dst3);
5182 MUL2(dst4, weight_vec, dst5, weight_vec, dst4, dst5);
5183 MUL2(dst6, weight_vec, dst7, weight_vec, dst6, dst7);
5186 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5188 ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
5189 ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
5192 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5193 dst += (8 * dst_stride);
5197 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
5205 const int8_t *filter_x,
5206 const int8_t *filter_y,
5214 filter_x, filter_y, 2, weight, offset,
5218 filter_x, filter_y, height, weight,
5219 offset, rnd_val, 2);
5227 const int8_t *filter_x,
5228 const int8_t *filter_y,
5235 filter_x, filter_y, height, weight,
5236 offset, rnd_val, 3);
5243 const int8_t *filter_x,
5244 const int8_t *filter_y,
5251 filter_x, filter_y, height, weight,
5252 offset, rnd_val, 4);
5255 #define UNIWGT_MC_COPY(WIDTH) \ 5256 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ 5257 ptrdiff_t dst_stride, \ 5259 ptrdiff_t src_stride, \ 5268 int shift = denom + 14 - 8; \ 5269 hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ 5270 height, weight, offset, shift); \ 5283 #undef UNIWGT_MC_COPY 5285 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 5286 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ 5300 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 5301 int shift = denom + 14 - 8; \ 5303 hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \ 5304 dst_stride, filter, height, \ 5305 weight, offset, shift); \ 5344 #define UNI_W_MC_HV(PEL, WIDTH, TAP) \ 5345 void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \ 5346 ptrdiff_t dst_stride, \ 5348 ptrdiff_t src_stride, \ 5357 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 5358 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 5359 int shift = denom + 14 - 8; \ 5361 hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \ 5362 filter_x, filter_y, height, \ 5363 weight, offset, shift); \
static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B5_128_SB(...)
#define CLIP_SW4_0_255(in0, in1, in2, in3)
#define XORI_B8_128_SB(...)
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_D2(in, idx0, idx1, pdst, stride)
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define UNI_W_MC_HV(PEL, WIDTH, TAP)
#define XORI_B2_128_SB(...)
#define MUL2(in0, in1, in2, in3, out0, out1)
static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B3_128_SB(...)
static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define UNPCK_R_SB_SH(in, out)
#define SPLATI_H2_SH(...)
#define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w,out0_h, out1_h)
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val, int32_t width8mult)
static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SRA_4V(in0, in1, in2, in3, shift)
static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define LD4(psrc, stride, out0, out1, out2, out3)
it's the only field you need to keep, assuming you have a context. There is some magic you don't need to care about around this — just let it be (vf offset).
#define SPLATI_W2_SH(...)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define CLIP_SH_0_255(in)
static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SPLATI_H4_SH(...)
#define ST_D1(in, idx, pdst)
static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_H2(in, idx0, idx1, pdst, stride)
#define CLIP_SH2_0_255(in0, in1)
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,idx6, idx7, pdst, stride)
static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static int aligned(int val)
static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
filter_frame: for filters that do not use the activate callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one per link. The additional frames can be left buffered in the filter.
static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B7_128_SB(...)
static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define LW2(psrc, stride, out0, out1)
#define XORI_B4_128_SB(...)
#define SPLATI_W4_SH(...)
static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val, int32_t weightmul16)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define CLIP_SW2_0_255(in0, in1)
static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult, int32_t weight, int32_t offset, int32_t rnd_val)
#define SPLATI_W4_SW(...)
static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val, int32_t width)
static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static const uint8_t ff_hevc_mask_arr[16 *2]
static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_uniwgt_copy_48w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
#define ADD2(in0, in1, in2, in3, out0, out1)
#define INSERT_W2_SB(...)
static int weight(int i, int blen, int offset)
static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define SLLI_4V(in0, in1, in2, in3, shift)
static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t weight, int32_t offset, int32_t rnd_val)
#define LW4(psrc, stride, out0, out1, out2, out3)
static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define UNIWGT_MC_COPY(WIDTH)
#define INSERT_D2_SB(...)
static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define XORI_B6_128_SB(...)
#define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w,offset_h, rnd_w, out0_h, out1_h,out2_h, out3_h)
static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
#define ST_W2(in, idx0, idx1, pdst, stride)
#define SLLI_2V(in0, in1, shift)
#define INSERT_W4_SB(...)
#define LD2(psrc, stride, out0, out1)
static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_32w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_uniwgt_copy_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t weight, int32_t offset, int32_t rnd_val)
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)