    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
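/* The three 16-byte rows above are byte-shuffle masks (presumably this file's
 * filter-mask table): the first row pairs neighbouring bytes for 8-pixel-wide
 * filtering, while the second and third interleave bytes from two 4-pixel rows
 * packed into one vector. They drive the VSHF_B shuffles used by the 8-tap
 * macros that follow. */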
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
                            filt0, filt1, filt2, filt3)         \
( {                                                             \
    v8i16 tmp0, tmp1;                                           \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
    tmp0;                                                       \
} )

#define HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3,          \
                        filt_h0, filt_h1, filt_h2, filt_h3)              \
( {                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                \
    v8i16 hz_out_m;                                                      \
    VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,                   \
               vec0_m, vec1_m, vec2_m, vec3_m);                          \
    hz_out_m = FILT_8TAP_DPADD_S_H(vec0_m, vec1_m, vec2_m, vec3_m,       \
                                   filt_h0, filt_h1, filt_h2, filt_h3);  \
    hz_out_m = __msa_srari_h(hz_out_m, 7);                               \
    hz_out_m = __msa_sat_s_h(hz_out_m, 7);                               \
    hz_out_m;                                                            \
} )

#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                  \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);            \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);               \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;    \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                res0_m, res1_m, res2_m, res3_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,  \
                res4_m, res5_m, res6_m, res7_m);                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 res0_m, res1_m, res2_m, res3_m);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 res4_m, res5_m, res6_m, res7_m);                            \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,      \
                res7_m, out0, out1, out2, out3);                             \
}

#define PCKEV_XORI128_AVG_ST_UB(in0, in1, dst, pdst)  \
{                                                     \
    v16u8 tmp_m;                                      \
    tmp_m = PCKEV_XORI128_UB(in1, in0);               \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);       \
    ST_UB(tmp_m, (pdst));                             \
}

#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
{                                                             \
    v16u8 tmp_m;                                              \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
    ST_UB(tmp_m, (pdst));                                     \
}

#define PCKEV_AVG_ST8x4_UB(in0, in1, in2, in3, dst0, dst1,    \
                           pdst, stride)                      \
{                                                             \
    v16u8 tmp0_m, tmp1_m;                                     \
    uint8_t *pdst_m = (uint8_t *) (pdst);                     \
    PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m);          \
    AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m);  \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);        \
}

    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
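/* The code above is the 4-wide horizontal 8-tap path: four rows are loaded,
 * the shuffle masks gather the eight neighbouring pixels per output sample,
 * HORIZ_8TAP_4WID_4VECS_FILT applies the taps with dot-product/accumulate
 * steps, and the rounded, saturated and packed result is stored as four
 * 32-bit words with ST_W4. The filter is loaded once with LD_SH() and each
 * tap pair is broadcast with SPLATI_H4_SB(). */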
    v16i8 filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    } else if (8 == height) {
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
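/* 8-pixel-wide rows reuse the same scheme through HORIZ_8TAP_8WID_4VECS_FILT,
 * four rows per loop iteration; the packed rows (tmp0/tmp1, presumably built
 * from out0..out3) are stored as four 64-bit doublewords with ST_D4. */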
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        ST_UB(out, dst + 16);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        ST_UB(out, dst + 16);
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        ST_UB(out, dst + 16);

        src0 = LD_SB(src + 32);
        src2 = LD_SB(src + 48);
        src3 = LD_SB(src + 56);
        src1 = __msa_sldi_b(src2, src0, 8);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        ST_UB(out, dst + 32);
        ST_UB(out, dst + 48);
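/* The 32- and 64-pixel-wide paths consume each row in 16-byte chunks;
 * __msa_sldi_b() splices two neighbouring chunks so the shuffle masks can
 * reach taps that straddle a 16-byte boundary, and every filtered 16-byte
 * group is written back with ST_UB at its column offset. */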
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v16u8 out;
    v8i16 filt, out10, out32;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
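/* Vertical 8-tap filtering needs no shuffles: ILVR_B interleaves bytes from
 * vertically adjacent rows so each halfword lane holds a column pair, and
 * FILT_8TAP_DPADD_S_H accumulates the eight taps exactly as in the horizontal
 * case. The seven rows loaded up front with LD_SB7 prime the sliding window
 * that each LD_SB4 in the loop extends by four rows. */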
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                     filt1, filt2, filt3);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);
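/* For 16-pixel-wide columns each row is split into two halves: ILVR_B builds
 * the _r interleaves from the low 8 bytes and ILVL_B the _l interleaves from
 * the high 8 bytes. Both halves run through the same tap sequence, are
 * clamped with SAT_SH4_SH, and PCKEV_B4_UB packs them back into four full
 * 16-byte rows for ST_UB4. */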
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, tmp0, tmp1, out0, out1, out2, out3, out4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

    src -= (3 + 3 * src_stride);

    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        out4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
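/* The combined horizontal+vertical (hv) 4-wide path filters two packed rows
 * per HORIZ_8TAP_FILT call, keeps the recent intermediate rows (hz_out0..9)
 * as a sliding window, interleaves them with ILVEV_B into column pairs and
 * feeds those to the vertical 8-tap accumulation before packing and storing
 * four 32-bit outputs per iteration. */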
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v16u8 mask0, mask1, mask2, mask3, vec0, vec1;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

    src -= (3 + 3 * src_stride);

    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    hz_out0 = HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out1 = HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out3 = HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out6 = HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
    ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
    ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        hz_out7 = HORIZ_8TAP_FILT(src7, src7, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        hz_out8 = HORIZ_8TAP_FILT(src8, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
        hz_out9 = HORIZ_8TAP_FILT(src9, src9, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        tmp2 = FILT_8TAP_DPADD_S_H(out1, out2, out3, out8, filt_vt0,
                                   filt_vt1, filt_vt2, filt_vt3);
        hz_out10 = HORIZ_8TAP_FILT(src10, src10, mask0, mask1, mask2, mask3,
                                   filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
        ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
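/* The 8-wide hv path keeps seven horizontally filtered rows and their even-
 * byte interleaves (out0..out6) live across iterations, so each new output
 * row needs only one extra HORIZ_8TAP_FILT call and one interleave before the
 * vertical taps (filt_vt0..3) are applied and the packed pair is stored with
 * ST_D4. */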
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
    for (multiple8_cnt = 2; multiple8_cnt--;) {
                                 filter_vert, height);
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
    for (multiple8_cnt = 4; multiple8_cnt--;) {
                                 filter_vert, height);
                                      const int8_t *filter_horiz,
                                      const int8_t *filter_vert,
    for (multiple8_cnt = 8; multiple8_cnt--;) {
                                 filter_vert, height);
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3;
    v8i16 filt, res0, res1;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, res0, res1);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    res = (v16u8) __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
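/* The _avg variants differ from the plain copies only at the end of each
 * block: the existing destination pixels are fetched (LW4/LD4/LD_UB loads),
 * the freshly filtered pixels are averaged into them with __msa_aver_u_b()
 * or the AVER_UB helpers, and the average is stored back. */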
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, res0, res1, res2, res3;
    v8i16 filt, vec0, vec1, vec2, vec3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, vec0, vec1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, vec2, vec3);
    PCKEV_B4_UB(vec0, vec0, vec1, vec1, vec2, vec2, vec3, vec3,
                res0, res1, res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
    } else if (8 == height) {
    int64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
    v8i16 filt, out0, out1, out2, out3;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, dst0, dst1;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
                   vec12);
        VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
                   vec13);
        VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
                   vec14);
        VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
                   vec15);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
                    vec1, vec2, vec3);
        DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
                    vec9, vec10, vec11);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
                     vec1, vec2, vec3);
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
                     vec8, vec9, vec10, vec11);
        ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
                    out1, out2, out3);
        LD_UB2(dst, dst_stride, dst0, dst1);
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
1212 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1214 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1216 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6, vec10,
1218 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7, vec11,
1220 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
1222 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2, vec8,
1223 vec9, vec10, vec11);
1224 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, vec0,
1226 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1227 vec8, vec9, vec10, vec11);
1228 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1232 LD_UB2(dst, 16, dst1, dst2);
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 dst1, dst2, mask0, mask1, mask2, mask3;
    v8i16 filt, out0, out1, out2, out3;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (loop_cnt = height; loop_cnt--;) {
        for (cnt = 0; cnt < 2; ++cnt) {
            src0 = LD_SB(&src[cnt << 5]);
            src2 = LD_SB(&src[16 + (cnt << 5)]);
            src3 = LD_SB(&src[24 + (cnt << 5)]);
            src1 = __msa_sldi_b(src2, src0, 8);
1271 VSHF_B4_SH(src0, src0, mask0, mask1, mask2, mask3, vec0, vec4, vec8,
1273 VSHF_B4_SH(src1, src1, mask0, mask1, mask2, mask3, vec1, vec5, vec9,
1275 VSHF_B4_SH(src2, src2, mask0, mask1, mask2, mask3, vec2, vec6,
1277 VSHF_B4_SH(src3, src3, mask0, mask1, mask2, mask3, vec3, vec7,
1279 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1280 vec0, vec1, vec2, vec3);
1281 DOTP_SB4_SH(vec8, vec9, vec10, vec11, filt2, filt2, filt2, filt2,
1282 vec8, vec9, vec10, vec11);
1283 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
1284 vec0, vec1, vec2, vec3);
1285 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt3, filt3, filt3, filt3,
1286 vec8, vec9, vec10, vec11);
1287 ADDS_SH4_SH(vec0, vec8, vec1, vec9, vec2, vec10, vec3, vec11, out0,
1291 LD_UB2(&dst[cnt << 5], 16, dst1, dst2);
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        out = __msa_aver_u_b(out, dst0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0 = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1 = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2 = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3 = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        dst += (4 * dst_stride);
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 filt0, filt1, filt2, filt3;
    v16u8 dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l, filt;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    for (cnt = (width >> 4); cnt--;) {
        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1460 for (loop_cnt = (height >> 2); loop_cnt--;) {
1461 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1462 src_tmp += (4 * src_stride);
1464 LD_UB4(dst_tmp, dst_stride, dst0, dst1, dst2, dst3);
1466 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1467 src87_r, src98_r, src109_r);
1468 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1469 src87_l, src98_l, src109_l);
1471 filt0, filt1, filt2, filt3);
1473 filt0, filt1, filt2, filt3);
1475 filt0, filt1, filt2, filt3);
1477 filt0, filt1, filt2, filt3);
1479 filt0, filt1, filt2, filt3);
1481 filt0, filt1, filt2, filt3);
1483 filt0, filt1, filt2, filt3);
1485 filt0, filt1, filt2, filt3);
1488 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1489 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1490 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1491 out3_r, tmp0, tmp1, tmp2, tmp3);
1493 AVER_UB4_UB(tmp0, dst0, tmp1, dst1, tmp2, dst2, tmp3, dst3,
1494 dst0, dst1, dst2, dst3);
1495 ST_UB4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
1496 dst_tmp += (4 * dst_stride);
                              filter, height, 16);
                              filter, height, 32);
                              filter, height, 64);
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 dst0, res, mask0, mask1, mask2, mask3;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, res0, res1, vec0, vec1, vec2, vec3, vec4;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;

    src -= (3 + 3 * src_stride);

    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    hz_out0 = HORIZ_8TAP_FILT(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out2 = HORIZ_8TAP_FILT(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out4 = HORIZ_8TAP_FILT(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    hz_out5 = HORIZ_8TAP_FILT(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
                              filt_hz1, filt_hz2, filt_hz3);
    SLDI_B2_SH(hz_out2, hz_out0, hz_out4, hz_out2, 8, hz_out1, hz_out3);

    filt = LD_SH(filter_vert);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);

    ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
        hz_out7 = HORIZ_8TAP_FILT(src7, src8, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
        vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
        res0 = FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        hz_out9 = HORIZ_8TAP_FILT(src9, src10, mask0, mask1, mask2, mask3,
                                  filt_hz0, filt_hz1, filt_hz2, filt_hz3);
        hz_out8 = (v8i16) __msa_sldi_b((v16i8) hz_out9, (v16i8) hz_out7, 8);
        vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
        res1 = FILT_8TAP_DPADD_S_H(vec1, vec2, vec3, vec4, filt_vt0, filt_vt1,
                                   filt_vt2, filt_vt3);
        res = (v16u8) __msa_aver_u_b(res, dst0);
        ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert,
    uint64_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt_hz0, filt_hz1, filt_hz2, filt_hz3;
    v8i16 filt, filt_vt0, filt_vt1, filt_vt2, filt_vt3;
    v16u8 dst0, dst1, mask0, mask1, mask2, mask3;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9;

    src -= (3 + 3 * src_stride);

    filt = LD_SH(filter_horiz);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt_hz0, filt_hz1, filt_hz2, filt_hz3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
1664 hz_out0 =
HORIZ_8TAP_FILT(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
1665 filt_hz1, filt_hz2, filt_hz3);
1666 hz_out1 =
HORIZ_8TAP_FILT(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
1667 filt_hz1, filt_hz2, filt_hz3);
1668 hz_out2 =
HORIZ_8TAP_FILT(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
1669 filt_hz1, filt_hz2, filt_hz3);
1670 hz_out3 =
HORIZ_8TAP_FILT(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
1671 filt_hz1, filt_hz2, filt_hz3);
1672 hz_out4 =
HORIZ_8TAP_FILT(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
1673 filt_hz1, filt_hz2, filt_hz3);
1674 hz_out5 =
HORIZ_8TAP_FILT(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
1675 filt_hz1, filt_hz2, filt_hz3);
1676 hz_out6 =
HORIZ_8TAP_FILT(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
1677 filt_hz1, filt_hz2, filt_hz3);
1679 filt =
LD_SH(filter_vert);
1680 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
1682 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1683 ILVEV_B2_SH(hz_out4, hz_out5, hz_out1, hz_out2, out2, out4);
1684 ILVEV_B2_SH(hz_out3, hz_out4, hz_out5, hz_out6, out5, out6);
1686 for (loop_cnt = (height >> 2); loop_cnt--;) {
1687 LD_SB4(src, src_stride, src7, src8, src9, src10);
1689 src += (4 * src_stride);
1691 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
1696 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1697 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1699 filt_vt2, filt_vt3);
1702 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1703 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1705 filt_vt2, filt_vt3);
1708 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1709 out8 = (v8i16) __msa_ilvev_b((v16i8) hz_out9, (v16i8) hz_out8);
1711 filt_vt2, filt_vt3);
1714 filt_hz0, filt_hz1, filt_hz2, filt_hz3);
1715 out9 = (v8i16) __msa_ilvev_b((v16i8) hz_out10, (v16i8) hz_out9);
1717 filt_vt2, filt_vt3);
1723 dst += (4 * dst_stride);
1739 const int8_t *filter_horiz,
1740 const int8_t *filter_vert,
1745 for (multiple8_cnt = 2; multiple8_cnt--;) {
1747 filter_horiz, filter_vert,
1759 const int8_t *filter_horiz,
1760 const int8_t *filter_vert,
1765 for (multiple8_cnt = 4; multiple8_cnt--;) {
1767 filter_horiz, filter_vert,
1779 const int8_t *filter_horiz,
1780 const int8_t *filter_vert,
1785 for (multiple8_cnt = 8; multiple8_cnt--;) {
1787 filter_horiz, filter_vert,
    v16u8 filt0, vec0, vec1, res0, res1;
    v8u16 vec2, vec3, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
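/* The bilinear paths use a 2-tap filter: LD_UH() loads the coefficient pair,
 * __msa_splati_h() broadcasts it, and each output pixel is an unsigned dot
 * product of two neighbouring input pixels, rounded and packed before the
 * 32-bit word stores (ST_W2). */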
    v16u8 vec0, vec1, vec2, vec3, filt0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16i8 res0, res1, res2, res3;
    v8u16 vec4, vec5, vec6, vec7, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                vec4, vec5, vec6, vec7);
    PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
                res0, res1, res2, res3);
    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
    ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
    ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
                         int height, int mx, int my)
    } else if (8 == height) {
    v8u16 vec0, vec1, vec2, vec3, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1874 LD_SB4(src, src_stride, src0, src1, src2, src3);
1875 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1876 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1877 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1878 vec0, vec1, vec2, vec3);
1881 ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
    v8u16 vec0, vec1, vec2, vec3, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1898 LD_SB4(src, src_stride, src0, src1, src2, src3);
1899 src += (4 * src_stride);
1901 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1902 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1903 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1904 vec0, vec1, vec2, vec3);
1906 LD_SB4(src, src_stride, src0, src1, src2, src3);
1907 src += (4 * src_stride);
1910 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1912 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1913 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1914 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1915 vec0, vec1, vec2, vec3);
1918 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1919 dst += (8 * dst_stride);
1922 LD_SB4(src, src_stride, src0, src1, src2, src3);
1923 src += (4 * src_stride);
1925 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1926 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1927 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1928 vec0, vec1, vec2, vec3);
1930 LD_SB4(src, src_stride, src0, src1, src2, src3);
1931 src += (4 * src_stride);
1934 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1936 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1937 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1938 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1939 vec0, vec1, vec2, vec3);
1942 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
                          int height, int mx, int my)
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    loop_cnt = (height >> 2) - 1;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
1978 LD_SB4(src, src_stride, src0, src2, src4, src6);
1979 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1980 src += (4 * src_stride);
1982 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1983 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1984 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1985 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1986 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1987 out0, out1, out2, out3);
1988 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1989 out4, out5, out6, out7);
2001 for (; loop_cnt--;) {
2002 LD_SB4(src, src_stride, src0, src2, src4, src6);
2003 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2004 src += (4 * src_stride);
2006 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2007 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2008 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2009 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2010 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2011 out0, out1, out2, out3);
2012 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2013 out4, out5, out6, out7);
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    for (loop_cnt = height >> 1; loop_cnt--;) {
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src5 = __msa_sldi_b(src6, src4, 8);
2055 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2056 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2057 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2058 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2059 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2060 out0, out1, out2, out3);
2061 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2062 out4, out5, out6, out7);
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    for (loop_cnt = height; loop_cnt--;) {
        src2 = LD_SB(src + 16);
        src4 = LD_SB(src + 32);
        src6 = LD_SB(src + 48);
        src7 = LD_SB(src + 56);
2096 SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
2099 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
2100 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
2101 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
2102 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
2103 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2104 out0, out1, out2, out3);
2105 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2106 out4, out5, out6, out7);
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);
2130 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2131 src += (5 * src_stride);
2133 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2134 src10_r, src21_r, src32_r, src43_r);
2135 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2136 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
2139 src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2140 ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
    v8u16 tmp0, tmp1, tmp2, tmp3;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);
2157 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2158 src += (8 * src_stride);
2163 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2165 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2167 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2168 src87_r, src76_r, src2110, src4332, src6554, src8776);
2169 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
2170 tmp0, tmp1, tmp2, tmp3);
2173 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
2174 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
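/* Vertical bilinear filtering needs no shuffle masks: ILVR_B interleaves each
 * row with the row below it, so one unsigned dot product against the
 * broadcast coefficient blends the two rows; the results are packed with
 * PCKEV_B and stored as 32-bit words. */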
                          int height, int mx, int my)
    } else if (8 == height) {
    v16u8 src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);
2203 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
2204 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
2205 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
2206 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2207 tmp0, tmp1, tmp2, tmp3);
2211 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);
2232 for (loop_cnt = (height >> 3); loop_cnt--;) {
2233 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
2234 src += (8 * src_stride);
2236 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
2237 vec0, vec1, vec2, vec3);
2238 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
2239 vec4, vec5, vec6, vec7);
2240 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2241 tmp0, tmp1, tmp2, tmp3);
2245 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2247 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2248 tmp0, tmp1, tmp2, tmp3);
2252 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2253 dst += (8 * dst_stride);
                          int height, int mx, int my)
                          int height, int mx, int my)
    v16u8 src0, src1, src2, src3, src4;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);
2291 for (loop_cnt = (height >> 2); loop_cnt--;) {
2292 LD_UB4(src, src_stride, src1, src2, src3, src4);
2293 src += (4 * src_stride);
2295 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2296 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2297 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2303 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2304 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2305 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2311 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2317 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
                          int height, int mx, int my)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);

    src5 = LD_UB(src + 16);
2346 for (loop_cnt = (height >> 2); loop_cnt--;) {
2347 LD_UB4(src, src_stride, src1, src2, src3, src4);
2348 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2349 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2351 LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
2352 src += (4 * src_stride);
2354 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2358 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2363 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
2364 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
2365 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2370 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2375 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
2376 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
2377 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2382 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2387 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
2388 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
2389 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
2392 PCKEV_ST_SB(tmp0, tmp1, dst + 16 + 2 * dst_stride);
2394 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
2397 PCKEV_ST_SB(tmp2, tmp3, dst + 16 + 3 * dst_stride);
2398 dst += (4 * dst_stride);
                          int height, int mx, int my)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
    v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    filt = LD_SH(filter);
    filt0 = (v16u8) __msa_splati_h(filt, 0);
2420 LD_UB4(src, 16, src0, src3, src6, src9);
2423 for (loop_cnt = (height >> 1); loop_cnt--;) {
2424 LD_UB2(src, src_stride, src1, src2);
2425 LD_UB2(src + 16, src_stride, src4, src5);
2426 LD_UB2(src + 32, src_stride, src7, src8);
2427 LD_UB2(src + 48, src_stride, src10, src11);
2428 src += (2 * src_stride);
2430 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
2431 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
2432 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2437 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2442 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
2443 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
2444 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2449 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2454 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
2455 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
2456 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
2461 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
2466 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
2467 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
2468 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
2473 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
2477 dst += (2 * dst_stride);
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert)
    v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, filt, tmp0, tmp1;

    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
    hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);

    ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);

    ST_W2(res0, 0, 1, dst, dst_stride);
    ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
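/* The bilinear hv paths run the 2-tap horizontal pass first (hz_out
 * vectors), pair consecutive horizontal results with __msa_ilvev_b or
 * ILVEV_B2, and then apply the vertical coefficient with a second unsigned
 * dot product before packing and storing. */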
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
    v16i8 res0, res1, res2, res3;
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7, filt;

    filt = LD_UH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);

    filt = LD_UH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
2538 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2539 src += (8 * src_stride);
2547 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
2549 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2551 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2552 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2553 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2554 vec4, vec5, vec6, vec7);
2557 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2558 res0, res1, res2, res3);
2559 ST_W2(res0, 0, 1, dst, dst_stride);
2560 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2561 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2562 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
                          int height, int mx, int my)
                                 filter_horiz, filter_vert);
    } else if (8 == height) {
                                 filter_horiz, filter_vert);
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert)
    v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
    v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;

    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2599 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2603 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2604 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2607 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2608 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2611 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2612 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2615 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2616 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2621 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
                                 const int8_t *filter_horiz,
                                 const int8_t *filter_vert,
    v16u8 filt_hz, filt_vt, vec0;
    v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;

    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2649 for (loop_cnt = (height >> 3); loop_cnt--;) {
2650 LD_SB4(src, src_stride, src1, src2, src3, src4);
2651 src += (4 * src_stride);
2654 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2655 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2658 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2659 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2665 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2666 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2669 LD_SB4(src, src_stride, src1, src2, src3, src4);
2670 src += (4 * src_stride);
2671 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2672 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2677 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2680 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2681 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2684 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2685 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2688 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2689 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2692 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2693 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2698 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2699 dst += (8 * dst_stride);
                          int height, int mx, int my)
                                 filter_horiz, filter_vert);
                                 filter_horiz, filter_vert, height);
                          int height, int mx, int my)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt_hz, filt_vt, vec0, vec1;
    v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;

    filt = LD_SH(filter_horiz);
    filt_hz = (v16u8) __msa_splati_h(filt, 0);

    filt = LD_SH(filter_vert);
    filt_vt = (v16u8) __msa_splati_h(filt, 0);
2740 LD_SB2(src, 8, src0, src1);
2747 for (loop_cnt = (height >> 2); loop_cnt--;) {
2748 LD_SB4(src, src_stride, src0, src2, src4, src6);
2749 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2750 src += (4 * src_stride);
2754 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2755 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2763 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2764 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2772 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2773 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2781 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2782 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
                          int height, int mx, int my)
    for (multiple8_cnt = 2; multiple8_cnt--;) {
                          int height, int mx, int my)
    for (multiple8_cnt = 4; multiple8_cnt--;) {
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 filt0, dst0, vec0, vec1, res;
    v8u16 vec2, vec3, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
    res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
    res = (v16u8) __msa_aver_u_b(res, dst0);
    ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
    v16u8 filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
    v8u16 vec4, vec5, vec6, vec7, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
    LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
    VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
    VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
    DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec4, vec5,
                vec6, vec7);
    PCKEV_B4_UB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1,
                res2, res3);
    ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
    ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
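/* Averaging bilinear variants mirror the averaging 8-tap ones: destination
 * pixels are gathered with LW4/LD4, packed into vectors, averaged against the
 * filtered result (AVER_UB2_UB or __msa_aver_u_b) and written back with the
 * matching store. */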
                          int height, int mx, int my)
    } else if (8 == height) {
    int64_t tp0, tp1, tp2, tp3;
    v16u8 filt0, dst0, dst1;
    v8u16 vec0, vec1, vec2, vec3, filt;

    filt = LD_UH(filter);
    filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2912 LD_SB4(src, src_stride, src0, src1, src2, src3);
2913 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2914 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2915 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2916 vec0, vec1, vec2, vec3);
2918 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2931 int64_t tp0, tp1, tp2, tp3;
2933 v16u8 filt0, dst0, dst1;
2934 v8u16 vec0, vec1, vec2, vec3, filt;
2939 filt = LD_UH(filter);
2940 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
2942 LD_SB4(src, src_stride, src0, src1, src2, src3);
2943 src += (4 * src_stride);
2944 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2945 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2946 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2949 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2952 LD_SB4(src, src_stride, src0, src1, src2, src3);
2953 src += (4 * src_stride);
2955 dst += (4 * dst_stride);
2957 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2958 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2959 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0, vec1,
2962 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2966 dst += (4 * dst_stride);
2969 LD_SB4(src, src_stride, src0, src1, src2, src3);
2970 src += (4 * src_stride);
2972 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2973 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2974 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2977 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2980 LD_SB4(src, src_stride, src0, src1, src2, src3);
2982 dst += (4 * dst_stride);
2984 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
2985 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
2986 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, vec0,
2989 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
2998 int height, int mx, int my)
3013 int height, int mx, int my)
3017 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3018 v16u8 filt0, dst0, dst1, dst2, dst3;
3019 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3020 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3025 filt = LD_UH(filter);
3026 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3028 LD_SB4(src, src_stride, src0, src2, src4, src6);
3029 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3030 src += (4 * src_stride);
3032 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3033 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3034 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3035 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3036 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0, res1,
3038 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4, res5,
3042 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3052 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
3053 LD_SB4(src, src_stride, src0, src2, src4, src6);
3054 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3055 src += (4 * src_stride);
3057 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3058 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3059 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3060 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3061 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
3063 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, res4,
3067 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3081 int height, int mx, int my)
3085 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3086 v16u8 filt0, dst0, dst1, dst2, dst3;
3087 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3088 v8u16 res0, res1, res2, res3, res4, res5, res6, res7, filt;
3093 filt = LD_UH(filter);
3094 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3096 for (loop_cnt = (height >> 1); loop_cnt--;) {
3098 src2 = LD_SB(src + 16);
3099 src3 = LD_SB(src + 24);
3100 src1 = __msa_sldi_b(src2, src0, 8);
3103 src6 = LD_SB(src + 16);
3104 src7 = LD_SB(src + 24);
3105 src5 = __msa_sldi_b(src6, src4, 8);
3108 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3109 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3110 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3111 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3112 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3113 res0, res1, res2, res3);
3114 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3115 res4, res5, res6, res7);
3118 LD_UB2(dst, 16, dst0, dst1);
3122 LD_UB2(dst, 16, dst2, dst3);
3131 int height, int mx, int my)
3135 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3136 v16u8 filt0, dst0, dst1, dst2, dst3;
3137 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3138 v8u16 out0, out1, out2, out3, out4, out5, out6, out7, filt;
3143 filt = LD_UH(filter);
3144 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3146 for (loop_cnt = height; loop_cnt--;) {
3147 LD_SB4(src, 16, src0, src2, src4, src6);
3148 src7 = LD_SB(src + 56);
3149 SLDI_B3_SB(src2, src0, src4, src2, src6, src4, 8, src1, src3, src5);
3152 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
3153 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
3154 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
3155 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
3156 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3157 out0, out1, out2, out3);
3158 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3159 out4, out5, out6, out7);
3162 LD_UB4(dst, 16, dst0, dst1, dst2, dst3);
3176 uint32_t tp0, tp1, tp2, tp3;
3177 v16i8 src0, src1, src2, src3, src4;
3178 v16u8 dst0, out, filt0, src2110, src4332;
3179 v16i8 src10_r, src32_r, src21_r, src43_r;
3183 filt = LD_SH(filter);
3184 filt0 = (v16u8) __msa_splati_h(filt, 0);
3186 LD_SB4(src, src_stride, src0, src1, src2, src3);
3187 src += (4 * src_stride);
3192 LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3194 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
3195 src10_r, src21_r, src32_r, src43_r);
3196 ILVR_D2_UB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
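/* Adjacent rows are byte-interleaved (ILVR_B4) and packed two pairs per vector (ILVR_D2) so each dot product below filters two output rows at once. */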
3197 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
3201 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3202 out = __msa_aver_u_b(out, dst0);
3204 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3212 uint32_t tp0, tp1, tp2, tp3;
3214 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
3215 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3216 v16u8 src2110, src4332, src6554, src8776, filt0;
3217 v8u16 tmp0, tmp1, tmp2, tmp3;
3220 filt = LD_SH(filter);
3221 filt0 = (v16u8) __msa_splati_h(filt, 0);
3223 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3224 src += (8 * src_stride);
3227 LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3229 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3231 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3233 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3235 ILVR_D4_UB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
3236 src87_r, src76_r, src2110, src4332, src6554, src8776);
3237 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
3238 tmp0, tmp1, tmp2, tmp3);
3241 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
3242 AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
3243 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3248 int height, int mx, int my)
3255 } else if (8 == height) {
3267 int64_t tp0, tp1, tp2, tp3;
3268 v16u8 src0, src1, src2, src3, src4;
3269 v16u8 dst0, dst1, vec0, vec1, vec2, vec3, filt0;
3270 v8u16 tmp0, tmp1, tmp2, tmp3;
3274 filt = LD_SH(filter);
3275 filt0 = (v16u8) __msa_splati_h(filt, 0);
3277 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
3278 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3281 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
3282 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
3283 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3284 tmp0, tmp1, tmp2, tmp3);
3298 int64_t tp0, tp1, tp2, tp3;
3299 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3300 v16u8 dst0, dst1, dst2, dst3;
3301 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3302 v8u16 tmp0, tmp1, tmp2, tmp3;
3306 filt = LD_SH(filter);
3307 filt0 = (v16u8) __msa_splati_h(filt, 0);
3312 for (loop_cnt = (height >> 3); loop_cnt--;) {
3313 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
3314 src += (8 * src_stride);
3316 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3319 LD4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3323 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
3324 vec0, vec1, vec2, vec3);
3325 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
3326 vec4, vec5, vec6, vec7);
3327 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
3328 tmp0, tmp1, tmp2, tmp3);
3332 dst += (4 * dst_stride);
3334 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
3335 tmp0, tmp1, tmp2, tmp3);
3339 dst += (4 * dst_stride);
3347 int height, int mx, int my)
3362 int height, int mx, int my)
3366 v16u8 src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
3367 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3368 v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3371 filt = LD_UH(filter);
3372 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3377 for (loop_cnt = (height >> 2); loop_cnt--;) {
3378 LD_UB4(src, src_stride, src1, src2, src3, src4);
3379 src += (4 * src_stride);
3381 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3382 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3383 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3384 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3390 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3391 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3392 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3398 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3404 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3416 int height, int mx, int my)
3420 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3421 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3422 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
3423 v8u16 tmp0, tmp1, tmp2, tmp3, filt;
3426 filt = LD_UH(filter);
3427 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3429 LD_UB2(src, 16, src0, src5);
3432 for (loop_cnt = (height >> 2); loop_cnt--;) {
3433 LD_UB4(src, src_stride, src1, src2, src3, src4);
3434 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3435 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3436 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3438 LD_UB4(src + 16, src_stride, src6, src7, src8, src9);
3439 LD_UB4(dst + 16, dst_stride, dst4, dst5, dst6, dst7);
3440 src += (4 * src_stride);
3442 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3447 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3452 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
3453 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
3454 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3459 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3464 ILVR_B2_UB(src6, src5, src7, src6, vec0, vec2);
3465 ILVL_B2_UB(src6, src5, src7, src6, vec1, vec3);
3466 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3471 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3476 ILVR_B2_UB(src8, src7, src9, src8, vec4, vec6);
3477 ILVL_B2_UB(src8, src7, src9, src8, vec5, vec7);
3478 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
3483 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
3487 dst += (4 * dst_stride);
3496 int height, int mx, int my)
3500 v16u8 src0, src1, src2, src3, src4, src5;
3501 v16u8 src6, src7, src8, src9, src10, src11, filt0;
3502 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3503 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3504 v8u16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3508 filt = LD_UH(filter);
3509 filt0 = (v16u8) __msa_splati_h((v8i16) filt, 0);
3511 LD_UB4(src, 16, src0, src3, src6, src9);
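/* The 64-wide block is processed as four 16-byte columns; src0, src3, src6 and src9 seed the previous row of each column so the 2-tap vertical pair always spans consecutive rows. */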
3514 for (loop_cnt = (height >> 1); loop_cnt--;) {
3515 LD_UB2(src, src_stride, src1, src2);
3516 LD_UB2(dst, dst_stride, dst0, dst1);
3517 LD_UB2(src + 16, src_stride, src4, src5);
3518 LD_UB2(dst + 16, dst_stride, dst2, dst3);
3519 LD_UB2(src + 32, src_stride, src7, src8);
3520 LD_UB2(dst + 32, dst_stride, dst4, dst5);
3521 LD_UB2(src + 48, src_stride, src10, src11);
3522 LD_UB2(dst + 48, dst_stride, dst6, dst7);
3523 src += (2 * src_stride);
3525 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
3526 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
3527 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3532 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3537 ILVR_B2_UB(src4, src3, src5, src4, vec4, vec6);
3538 ILVL_B2_UB(src4, src3, src5, src4, vec5, vec7);
3539 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3544 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3549 ILVR_B2_UB(src7, src6, src8, src7, vec0, vec2);
3550 ILVL_B2_UB(src7, src6, src8, src7, vec1, vec3);
3551 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
3556 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
3561 ILVR_B2_UB(src10, src9, src11, src10, vec4, vec6);
3562 ILVL_B2_UB(src10, src9, src11, src10, vec5, vec7);
3563 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp4, tmp5);
3568 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp6, tmp7);
3572 dst += (2 * dst_stride);
3585 const int8_t *filter_horiz,
3586 const int8_t *filter_vert)
3588 uint32_t tp0, tp1, tp2, tp3;
3590 v16u8 filt_hz, filt_vt, vec0, vec1;
3592 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1, filt;
3597 filt = LD_UH(filter_horiz);
3598 filt_hz = (v16u8) __msa_splati_h((v8i16) filt, 0);
3600 filt = LD_UH(filter_vert);
3601 filt_vt = (v16u8) __msa_splati_h((v8i16) filt, 0);
3603 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3608 hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
3609 hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
3610 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3612 LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3615 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3619 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3620 out = __msa_aver_u_b(out, dst0);
3622 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3629 const int8_t *filter_horiz,
3630 const int8_t *filter_vert)
3632 uint32_t tp0, tp1, tp2, tp3;
3633 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
3634 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
3636 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3637 v8u16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
3643 filt = LD_SH(filter_horiz);
3644 filt_hz = (v16u8) __msa_splati_h(filt, 0);
3646 filt = LD_SH(filter_vert);
3647 filt_vt = (v16u8) __msa_splati_h(filt, 0);
3649 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3650 src += (8 * src_stride);
3658 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
3660 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
3662 LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
3664 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
3666 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3667 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
3668 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
3669 tmp0, tmp1, tmp2, tmp3);
3674 ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3679 int height, int mx, int my)
3686 filter_horiz, filter_vert);
3687 } else if (8 == height) {
3689 filter_horiz, filter_vert);
3697 const int8_t *filter_horiz,
3698 const int8_t *filter_vert)
3700 uint64_t tp0, tp1, tp2, tp3;
3702 v16u8 filt_hz, filt_vt, dst0, dst1, vec0, vec1, vec2, vec3;
3703 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3709 filt = LD_SH(filter_horiz);
3710 filt_hz = (v16u8) __msa_splati_h(filt, 0);
3712 filt = LD_SH(filter_vert);
3713 filt_vt = (v16u8) __msa_splati_h(filt, 0);
3715 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3716 src += (5 * src_stride);
3718 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3723 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3724 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3727 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3728 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
3731 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3732 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
3735 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3736 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
3747 const int8_t *filter_horiz,
3748 const int8_t *filter_vert,
3752 uint64_t tp0, tp1, tp2, tp3;
3754 v16u8 filt_hz, filt_vt, vec0, dst0, dst1;
3755 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
3761 filt = LD_SH(filter_horiz);
3762 filt_hz = (v16u8) __msa_splati_h(filt, 0);
3764 filt = LD_SH(filter_vert);
3765 filt_vt = (v16u8) __msa_splati_h(filt, 0);
3772 for (loop_cnt = (height >> 2); loop_cnt--;) {
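/* hz_out0 and hz_out1 ping-pong through the unrolled body: each freshly filtered row is even-byte interleaved (ILVEV) with the previous horizontal output before the 2-tap vertical dot product. */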
3773 LD_SB4(src, src_stride, src1, src2, src3, src4);
3774 src += (4 * src_stride);
3777 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3778 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
3781 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3782 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
3788 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
3789 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
3792 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
3793 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
3797 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
3801 dst += (4 * dst_stride);
3807 int height, int mx, int my)
3814 filter_horiz, filter_vert);
3818 filter_horiz, filter_vert,
3825 int height, int mx, int my)
3830 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask;
3831 v16u8 filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
3832 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
3838 filt = LD_SH(filter_horiz);
3839 filt_hz = (v16u8) __msa_splati_h(filt, 0);
3841 filt = LD_SH(filter_vert);
3842 filt_vt = (v16u8) __msa_splati_h(filt, 0);
3844 LD_SB2(src, 8, src0, src1);
3850 for (loop_cnt = (height >> 2); loop_cnt--;) {
3851 LD_SB4(src, src_stride, src0, src2, src4, src6);
3852 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
3853 src += (4 * src_stride);
3854 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
3858 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3859 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3867 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3868 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3876 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
3877 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3885 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
3886 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
3896 int height, int mx, int my)
3900 for (multiple8_cnt = 2; multiple8_cnt--;) {
3910 int height, int mx, int my)
3914 for (multiple8_cnt = 4; multiple8_cnt--;) {
3927 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
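/* Plain 8-wide copy: rows are moved as 64-bit loads (LD4) and stores (SD4), eight rows per iteration when the height is a multiple of 8, otherwise four at a time. */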
3929 if (0 == height % 8) {
3930 for (cnt = height >> 3; cnt--;) {
3931 LD4(src, src_stride, out0, out1, out2, out3);
3932 src += (4 * src_stride);
3933 LD4(src, src_stride, out4, out5, out6, out7);
3934 src += (4 * src_stride);
3936 SD4(out0, out1, out2, out3, dst, dst_stride);
3937 dst += (4 * dst_stride);
3938 SD4(out4, out5, out6, out7, dst, dst_stride);
3939 dst += (4 * dst_stride);
3941 } else if (0 == height % 4) {
3942 for (cnt = (height / 4); cnt--;) {
3943 LD4(src, src_stride, out0, out1, out2, out3);
3944 src += (4 * src_stride);
3946 SD4(out0, out1, out2, out3, dst, dst_stride);
3947 dst += (4 * dst_stride);
3957 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3960 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3961 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3962 } else if (16 == height) {
3963 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3964 src += (8 * src_stride);
3965 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3966 dst += (8 * dst_stride);
3967 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3968 src += (8 * src_stride);
3969 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3970 dst += (8 * dst_stride);
3971 } else if (32 == height) {
3972 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3973 src += (8 * src_stride);
3974 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3975 dst += (8 * dst_stride);
3976 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3977 src += (8 * src_stride);
3978 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3979 dst += (8 * dst_stride);
3980 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3981 src += (8 * src_stride);
3982 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3983 dst += (8 * dst_stride);
3984 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
3985 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
3986 } else if (0 == height % 4) {
3987 for (cnt = (height >> 2); cnt--;) {
3988 LD_UB4(src, src_stride, src0, src1, src2, src3);
3989 src += (4 * src_stride);
3990 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
3991 dst += (4 * dst_stride);
4001 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4003 if (0 == height % 8) {
4004 for (cnt = (height >> 3); cnt--;) {
4005 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4006 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
4007 LD_UB8(src + 16, src_stride, src0, src1, src2, src3, src4, src5, src6,
4009 src += (8 * src_stride);
4010 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst + 16,
4012 dst += (8 * dst_stride);
4014 } else if (0 == height % 4) {
4015 for (cnt = (height >> 2); cnt--;) {
4016 LD_UB4(src, src_stride, src0, src1, src2, src3);
4017 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
4018 src += (4 * src_stride);
4019 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
4020 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
4021 dst += (4 * dst_stride);
4031 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4032 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4034 for (cnt = (height >> 2); cnt--;) {
4035 LD_UB4(src, 16, src0, src1, src2, src3);
4037 LD_UB4(src, 16, src4, src5, src6, src7);
4039 LD_UB4(src, 16, src8, src9, src10, src11);
4041 LD_UB4(src, 16, src12, src13, src14, src15);
4044 ST_UB4(src0, src1, src2, src3, dst, 16);
4046 ST_UB4(src4, src5, src6, src7, dst, 16);
4048 ST_UB4(src8, src9, src10, src11, dst, 16);
4050 ST_UB4(src12, src13, src14, src15, dst, 16);
4059 uint32_t tp0, tp1, tp2, tp3;
4060 v16u8 src0 = { 0 }, src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
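/* 4-wide averaging: four 32-bit rows are gathered from src and dst, averaged with __msa_aver_u_b, and written back word by word (ST_W8 for the 8-row path, ST_W4 for the 4-row path). */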
4063 LW4(src, src_stride, tp0, tp1, tp2, tp3);
4064 src += 4 * src_stride;
4066 LW4(src, src_stride, tp0, tp1, tp2, tp3);
4068 LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4070 LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
4073 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4074 } else if (4 == height) {
4075 LW4(src, src_stride, tp0, tp1, tp2, tp3);
4077 LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
4079 dst0 = __msa_aver_u_b(src0, dst0);
4080 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
4089 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
4091 v16u8 dst0, dst1, dst2, dst3;
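/* 8-wide averaging: pairs of 64-bit rows are packed into vectors, averaged against dst with AVER_UB4_UB, and stored back as doublewords (ST_D8 / ST_D4). */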
4093 if (0 == (height % 8)) {
4094 for (cnt = (height >> 3); cnt--;) {
4095 LD4(src, src_stride, tp0, tp1, tp2, tp3);
4096 src += 4 * src_stride;
4097 LD4(src, src_stride, tp4, tp5, tp6, tp7);
4098 src += 4 * src_stride;
4103 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4104 LD4(dst + 4 * dst_stride, dst_stride, tp4, tp5, tp6, tp7);
4109 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
4111 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
4112 dst += 8 * dst_stride;
4114 } else if (4 == height) {
4115 LD4(src, src_stride, tp0, tp1, tp2, tp3);
4118 LD4(dst, dst_stride, tp0, tp1, tp2, tp3);
4122 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
4131 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4132 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4134 if (0 == (height % 8)) {
4135 for (cnt = (height / 8); cnt--;) {
4136 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
4137 src += (8 * src_stride);
4138 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
4140 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4141 dst0, dst1, dst2, dst3);
4142 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4143 dst4, dst5, dst6, dst7);
4144 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
4145 dst += (8 * dst_stride);
4147 } else if (0 == (height % 4)) {
4148 for (cnt = (height / 4); cnt--;) {
4149 LD_UB4(src, src_stride, src0, src1, src2, src3);
4150 src += (4 * src_stride);
4151 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
4153 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4154 dst0, dst1, dst2, dst3);
4155 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
4156 dst += (4 * dst_stride);
4167 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4168 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4169 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4170 v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4172 if (0 == (height % 8)) {
4173 for (cnt = (height / 8); cnt--;) {
4174 LD_UB4(src, src_stride, src0, src2, src4, src6);
4175 LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4176 src += (4 * src_stride);
4177 LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4178 LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4179 dst_dup += (4 * dst_stride);
4180 LD_UB4(src, src_stride, src8, src10, src12, src14);
4181 LD_UB4(src + 16, src_stride, src9, src11, src13, src15);
4182 src += (4 * src_stride);
4183 LD_UB4(dst_dup, dst_stride, dst8, dst10, dst12, dst14);
4184 LD_UB4(dst_dup + 16, dst_stride, dst9, dst11, dst13, dst15);
4185 dst_dup += (4 * dst_stride);
4187 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4188 dst0, dst1, dst2, dst3);
4189 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4190 dst4, dst5, dst6, dst7);
4191 AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4192 dst8, dst9, dst10, dst11);
4193 AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4194 dst12, dst13, dst14, dst15);
4196 ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4197 ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4198 dst += (4 * dst_stride);
4199 ST_UB4(dst8, dst10, dst12, dst14, dst, dst_stride);
4200 ST_UB4(dst9, dst11, dst13, dst15, dst + 16, dst_stride);
4201 dst += (4 * dst_stride);
4203 } else if (0 == (height % 4)) {
4204 for (cnt = (height / 4); cnt--;) {
4205 LD_UB4(src, src_stride, src0, src2, src4, src6);
4206 LD_UB4(src + 16, src_stride, src1, src3, src5, src7);
4207 src += (4 * src_stride);
4208 LD_UB4(dst_dup, dst_stride, dst0, dst2, dst4, dst6);
4209 LD_UB4(dst_dup + 16, dst_stride, dst1, dst3, dst5, dst7);
4210 dst_dup += (4 * dst_stride);
4212 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4213 dst0, dst1, dst2, dst3);
4214 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4215 dst4, dst5, dst6, dst7);
4217 ST_UB4(dst0, dst2, dst4, dst6, dst, dst_stride);
4218 ST_UB4(dst1, dst3, dst5, dst7, dst + 16, dst_stride);
4219 dst += (4 * dst_stride);
4230 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
4231 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
4232 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4233 v16u8 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
4235 for (cnt = (height / 4); cnt--;) {
4236 LD_UB4(src, 16, src0, src1, src2, src3);
4238 LD_UB4(src, 16, src4, src5, src6, src7);
4240 LD_UB4(src, 16, src8, src9, src10, src11);
4242 LD_UB4(src, 16, src12, src13, src14, src15);
4245 LD_UB4(dst_dup, 16, dst0, dst1, dst2, dst3);
4246 dst_dup += dst_stride;
4247 LD_UB4(dst_dup, 16, dst4, dst5, dst6, dst7);
4248 dst_dup += dst_stride;
4249 LD_UB4(dst_dup, 16, dst8, dst9, dst10, dst11);
4250 dst_dup += dst_stride;
4251 LD_UB4(dst_dup, 16, dst12, dst13, dst14, dst15);
4252 dst_dup += dst_stride;
4254 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
4255 dst0, dst1, dst2, dst3);
4256 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
4257 dst4, dst5, dst6, dst7);
4258 AVER_UB4_UB(src8, dst8, src9, dst9, src10, dst10, src11, dst11,
4259 dst8, dst9, dst10, dst11);
4260 AVER_UB4_UB(src12, dst12, src13, dst13, src14, dst14, src15, dst15,
4261 dst12, dst13, dst14, dst15);
4263 ST_UB4(dst0, dst1, dst2, dst3, dst, 16);
4265 ST_UB4(dst4, dst5, dst6, dst7, dst, 16);
4267 ST_UB4(dst8, dst9, dst10, dst11, dst, 16);
4269 ST_UB4(dst12, dst13, dst14, dst15, dst, 16);
4276 {0, 1, -5, 126, 8, -3, 1, 0},
4277 {-1, 3, -10, 122, 18, -6, 2, 0},
4278 {-1, 4, -13, 118, 27, -9, 3, -1},
4279 {-1, 4, -16, 112, 37, -11, 4, -1},
4280 {-1, 5, -18, 105, 48, -14, 4, -1},
4281 {-1, 5, -19, 97, 58, -16, 5, -1},
4282 {-1, 6, -19, 88, 68, -18, 5, -1},
4283 {-1, 6, -19, 78, 78, -19, 6, -1},
4284 {-1, 5, -18, 68, 88, -19, 6, -1},
4285 {-1, 5, -16, 58, 97, -19, 5, -1},
4286 {-1, 4, -14, 48, 105, -18, 5, -1},
4287 {-1, 4, -11, 37, 112, -16, 4, -1},
4288 {-1, 3, -9, 27, 118, -13, 4, -1},
4289 {0, 2, -6, 18, 122, -10, 3, -1},
4290 {0, 1, -3, 8, 126, -5, 1, 0},
4292 {-1, 3, -7, 127, 8, -3, 1, 0},
4293 {-2, 5, -13, 125, 17, -6, 3, -1},
4294 {-3, 7, -17, 121, 27, -10, 5, -2},
4295 {-4, 9, -20, 115, 37, -13, 6, -2},
4296 {-4, 10, -23, 108, 48, -16, 8, -3},
4297 {-4, 10, -24, 100, 59, -19, 9, -3},
4298 {-4, 11, -24, 90, 70, -21, 10, -4},
4299 {-4, 11, -23, 80, 80, -23, 11, -4},
4300 {-4, 10, -21, 70, 90, -24, 11, -4},
4301 {-3, 9, -19, 59, 100, -24, 10, -4},
4302 {-3, 8, -16, 48, 108, -23, 10, -4},
4303 {-2, 6, -13, 37, 115, -20, 9, -4},
4304 {-2, 5, -10, 27, 121, -17, 7, -3},
4305 {-1, 3, -6, 17, 125, -13, 5, -2},
4306 {0, 1, -3, 8, 127, -7, 3, -1},
4308 {-3, -1, 32, 64, 38, 1, -3, 0},
4309 {-2, -2, 29, 63, 41, 2, -3, 0},
4310 {-2, -2, 26, 63, 43, 4, -4, 0},
4311 {-2, -3, 24, 62, 46, 5, -4, 0},
4312 {-2, -3, 21, 60, 49, 7, -4, 0},
4313 {-1, -4, 18, 59, 51, 9, -4, 0},
4314 {-1, -4, 16, 57, 53, 12, -4, -1},
4315 {-1, -4, 14, 55, 55, 14, -4, -1},
4316 {-1, -4, 12, 53, 57, 16, -4, -1},
4317 {0, -4, 9, 51, 59, 18, -4, -1},
4318 {0, -4, 7, 49, 60, 21, -3, -2},
4319 {0, -4, 5, 46, 62, 24, -3, -2},
4320 {0, -4, 4, 43, 63, 26, -2, -2},
4321 {0, -3, 2, 41, 63, 29, -2, -2},
4322 {0, -3, 1, 38, 64, 32, -1, -3},
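/* The macros below stamp out the exported ff_put_8tap_* / ff_avg_8tap_* and ff_copy* / ff_avg* entry points for each block SIZE, forwarding to the common_* helpers above. */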
4326 #define VP9_8TAP_MIPS_MSA_FUNC(SIZE, type, type_idx) \
4327 void ff_put_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \
4328 const uint8_t *src, \
4329 ptrdiff_t srcstride, \
4330 int h, int mx, int my) \
4332 const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4334 common_hz_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \
4337 void ff_put_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \
4338 const uint8_t *src, \
4339 ptrdiff_t srcstride, \
4340 int h, int mx, int my) \
4342 const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \
4344 common_vt_8t_##SIZE##w_msa(src, srcstride, dst, dststride, filter, h); \
4347 void ff_put_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \
4348 const uint8_t *src, \
4349 ptrdiff_t srcstride, \
4350 int h, int mx, int my) \
4352 const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4353 const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \
4355 common_hv_8ht_8vt_##SIZE##w_msa(src, srcstride, dst, dststride, hfilter, \
4359 void ff_avg_8tap_##type##_##SIZE##h_msa(uint8_t *dst, ptrdiff_t dststride, \
4360 const uint8_t *src, \
4361 ptrdiff_t srcstride, \
4362 int h, int mx, int my) \
4364 const int8_t *filter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4366 common_hz_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \
4367 dststride, filter, h); \
4370 void ff_avg_8tap_##type##_##SIZE##v_msa(uint8_t *dst, ptrdiff_t dststride, \
4371 const uint8_t *src, \
4372 ptrdiff_t srcstride, \
4373 int h, int mx, int my) \
4375 const int8_t *filter = vp9_subpel_filters_msa[type_idx][my-1]; \
4377 common_vt_8t_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, dststride, \
4381 void ff_avg_8tap_##type##_##SIZE##hv_msa(uint8_t *dst, ptrdiff_t dststride, \
4382 const uint8_t *src, \
4383 ptrdiff_t srcstride, \
4384 int h, int mx, int my) \
4386 const int8_t *hfilter = vp9_subpel_filters_msa[type_idx][mx-1]; \
4387 const int8_t *vfilter = vp9_subpel_filters_msa[type_idx][my-1]; \
4389 common_hv_8ht_8vt_and_aver_dst_##SIZE##w_msa(src, srcstride, dst, \
4390 dststride, hfilter, \
4394 #define VP9_COPY_AVG_MIPS_MSA_FUNC(SIZE) \
4395 void ff_copy##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
4396 const uint8_t *src, ptrdiff_t srcstride, \
4397 int h, int mx, int my) \
4400 copy_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
4403 void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
4404 const uint8_t *src, ptrdiff_t srcstride, \
4405 int h, int mx, int my) \
4408 avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
4411 #define VP9_AVG_MIPS_MSA_FUNC(SIZE) \
4412 void ff_avg##SIZE##_msa(uint8_t *dst, ptrdiff_t dststride, \
4413 const uint8_t *src, ptrdiff_t srcstride, \
4414 int h, int mx, int my) \
4417 avg_width##SIZE##_msa(src, srcstride, dst, dststride, h); \
4444 #undef VP9_8TAP_MIPS_MSA_FUNC
4445 #undef VP9_COPY_AVG_MIPS_MSA_FUNC
4446 #undef VP9_AVG_MIPS_MSA_FUNC