27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
35 {-6, 123, 12, -1, 0, 0, 0, 0},
36 {2, -11, 108, 36, -8, 1, 0, 0},
37 {-9, 93, 50, -6, 0, 0, 0, 0},
38 {3, -16, 77, 77, -16, 3, 0, 0},
39 {-6, 50, 93, -9, 0, 0, 0, 0},
40 {1, -8, 36, 108, -11, 2, 0, 0},
41 {-1, 12, 123, -6, 0, 0, 0, 0},
54 #define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2, \ 55 filt_h0, filt_h1, filt_h2) \ 57 v16i8 vec0_m, vec1_m, vec2_m; \ 60 VSHF_B3_SB(src0, src1, src0, src1, src0, src1, mask0, mask1, mask2, \ 61 vec0_m, vec1_m, vec2_m); \ 62 hz_out_m = DPADD_SH3_SH(vec0_m, vec1_m, vec2_m, \ 63 filt_h0, filt_h1, filt_h2); \ 65 hz_out_m = __msa_srari_h(hz_out_m, 7); \ 66 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 71 #define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 72 mask0, mask1, mask2, \ 73 filt0, filt1, filt2, \ 76 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m; \ 78 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 79 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 80 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 81 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 82 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \ 83 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \ 86 #define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 87 mask0, mask1, mask2, \ 88 filt0, filt1, filt2, \ 89 out0, out1, out2, out3) \ 91 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \ 93 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 94 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 95 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 96 out0, out1, out2, out3); \ 97 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 98 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 99 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec4_m, vec5_m); \ 100 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec6_m, vec7_m); \ 101 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 102 out0, out1, out2, out3); \ 103 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt2, filt2, filt2, filt2, \ 104 out0, out1, out2, out3); \ 107 #define FILT_4TAP_DPADD_S_H(vec0, vec1, 
filt0, filt1) \ 111 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \ 112 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \ 117 #define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1) \ 119 v16i8 vec0_m, vec1_m; \ 122 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0_m, vec1_m); \ 123 hz_out_m = FILT_4TAP_DPADD_S_H(vec0_m, vec1_m, filt_h0, filt_h1); \ 125 hz_out_m = __msa_srari_h(hz_out_m, 7); \ 126 hz_out_m = __msa_sat_s_h(hz_out_m, 7); \ 131 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \ 132 mask0, mask1, filt0, filt1, \ 135 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 137 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \ 138 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \ 139 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \ 140 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \ 143 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \ 144 mask0, mask1, filt0, filt1, \ 145 out0, out1, out2, out3) \ 147 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \ 149 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \ 150 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \ 151 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \ 152 out0, out1, out2, out3); \ 153 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \ 154 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \ 155 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \ 156 out0, out1, out2, out3); \ 163 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2;
164 v16u8 mask0, mask1, mask2,
out;
165 v8i16
filt, out0, out1;
171 filt =
LD_SH(filter);
177 LD_SB4(src, src_stride, src0, src1, src2, src3);
180 filt0, filt1, filt2, out0, out1);
184 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
191 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2;
192 v16u8 mask0, mask1, mask2,
out;
193 v8i16
filt, out0, out1, out2, out3;
199 filt =
LD_SH(filter);
205 LD_SB4(src, src_stride, src0, src1, src2, src3);
207 src += (4 * src_stride);
209 filt0, filt1, filt2, out0, out1);
210 LD_SB4(src, src_stride, src0, src1, src2, src3);
213 filt0, filt1, filt2, out2, out3);
217 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
219 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
224 int height,
int mx,
int my)
230 }
else if (8 == height) {
237 int height,
int mx,
int my)
241 v16i8
src0,
src1, src2, src3, filt0, filt1, filt2;
242 v16u8 mask0, mask1, mask2, tmp0, tmp1;
243 v8i16
filt, out0, out1, out2, out3;
250 filt =
LD_SH(filter);
256 LD_SB4(src, src_stride, src0, src1, src2, src3);
258 src += (4 * src_stride);
260 filt0, filt1, filt2, out0, out1, out2, out3);
265 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
266 dst += (4 * dst_stride);
268 for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
269 LD_SB4(src, src_stride, src0, src1, src2, src3);
271 src += (4 * src_stride);
273 filt0, filt1, filt2, out0, out1, out2, out3);
278 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
279 dst += (4 * dst_stride);
285 int height,
int mx,
int my)
289 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
290 v16u8 mask0, mask1, mask2,
out;
291 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
297 filt =
LD_SH(filter);
303 for (loop_cnt = (height >> 2); loop_cnt--;) {
304 LD_SB4(src, src_stride, src0, src2, src4, src6);
305 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
307 src += (4 * src_stride);
310 filt0, filt1, filt2, out0, out1, out2, out3);
312 filt0, filt1, filt2, out4, out5, out6, out7);
334 int height,
int mx,
int my)
338 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
339 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
340 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
342 v8i16
filt, out10, out32;
344 src -= (2 * src_stride);
346 filt =
LD_SH(filter);
349 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
350 src += (5 * src_stride);
352 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
354 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
357 for (loop_cnt = (height >> 2); loop_cnt--;) {
358 LD_SB4(src, src_stride, src5, src6, src7, src8);
359 src += (4 * src_stride);
361 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
362 src65_r, src76_r, src87_r);
363 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
365 out10 =
DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
366 out32 =
DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
370 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
371 dst += (4 * dst_stride);
381 int height,
int mx,
int my)
385 v16i8
src0,
src1, src2, src3, src4, src7, src8, src9, src10;
386 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
387 v16i8 src109_r, filt0, filt1, filt2;
389 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
391 src -= (2 * src_stride);
393 filt =
LD_SH(filter);
396 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
397 src += (5 * src_stride);
400 ILVR_B4_SB(src1, src0, src3, src2, src2, src1, src4, src3,
401 src10_r, src32_r, src21_r, src43_r);
403 for (loop_cnt = (height >> 2); loop_cnt--;) {
404 LD_SB4(src, src_stride, src7, src8, src9, src10);
406 src += (4 * src_stride);
408 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
409 src87_r, src98_r, src109_r);
410 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
411 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
412 out2_r =
DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
413 out3_r =
DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
415 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
418 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
419 dst += (4 * dst_stride);
431 int height,
int mx,
int my)
435 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
436 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
437 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
438 v16i8 src65_l, src87_l, filt0, filt1, filt2;
439 v16u8 tmp0, tmp1, tmp2, tmp3;
440 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l,
filt;
442 src -= (2 * src_stride);
444 filt =
LD_SH(filter);
447 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
448 src += (5 * src_stride);
451 ILVR_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_r,
452 src32_r, src43_r, src21_r);
453 ILVL_B4_SB(src1, src0, src3, src2, src4, src3, src2, src1, src10_l,
454 src32_l, src43_l, src21_l);
456 for (loop_cnt = (height >> 2); loop_cnt--;) {
457 LD_SB4(src, src_stride, src5, src6, src7, src8);
458 src += (4 * src_stride);
461 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
462 src65_r, src76_r, src87_r);
463 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
464 src65_l, src76_l, src87_l);
465 out0_r =
DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1,
467 out1_r =
DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1,
469 out2_r =
DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1,
471 out3_r =
DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1,
473 out0_l =
DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1,
475 out1_l =
DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1,
477 out2_l =
DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1,
479 out3_l =
DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1,
483 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
484 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
485 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
486 out3_r, tmp0, tmp1, tmp2, tmp3);
488 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
489 dst += (4 * dst_stride);
505 int height,
int mx,
int my)
510 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
511 v16i8 filt_hz0, filt_hz1, filt_hz2;
512 v16u8 mask0, mask1, mask2,
out;
514 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
515 v8i16 hz_out7,
filt, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
518 src -= (2 + 2 * src_stride);
521 filt =
LD_SH(filter_horiz);
522 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
524 filt =
LD_SH(filter_vert);
525 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
530 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
531 src += (5 * src_stride);
538 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
541 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
543 for (loop_cnt = (height >> 2); loop_cnt--;) {
544 LD_SB2(src, src_stride, src5, src6);
545 src += (2 * src_stride);
550 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
552 LD_SB2(src, src_stride, src7, src8);
553 src += (2 * src_stride);
558 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
560 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
561 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
563 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
564 tmp1 =
DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
569 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
570 dst += (4 * dst_stride);
580 int height,
int mx,
int my)
585 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
586 v16i8 filt_hz0, filt_hz1, filt_hz2;
587 v16u8 mask0, mask1, mask2, vec0, vec1;
588 v8i16
filt, filt_vt0, filt_vt1, filt_vt2;
589 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
590 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
591 v8i16 tmp0, tmp1, tmp2, tmp3;
594 src -= (2 + 2 * src_stride);
597 filt =
LD_SH(filter_horiz);
598 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
603 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
604 src += (5 * src_stride);
618 filt =
LD_SH(filter_vert);
619 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
621 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
622 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
624 for (loop_cnt = (height >> 2); loop_cnt--;) {
625 LD_SB4(src, src_stride, src5, src6, src7, src8);
626 src += (4 * src_stride);
631 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
632 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
636 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
637 tmp1 =
DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
641 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
642 tmp2 =
DPADD_SH3_SH(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
646 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
647 tmp3 =
DPADD_SH3_SH(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
653 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
654 dst += (4 * dst_stride);
667 int height,
int mx,
int my)
671 for (multiple8_cnt = 2; multiple8_cnt--;) {
684 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
685 v8i16
filt, out0, out1;
692 filt =
LD_SH(filter);
697 LD_SB4(src, src_stride, src0, src1, src2, src3);
700 filt0, filt1, out0, out1);
704 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
711 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
713 v8i16
filt, out0, out1, out2, out3;
719 filt =
LD_SH(filter);
724 LD_SB4(src, src_stride, src0, src1, src2, src3);
725 src += (4 * src_stride);
729 filt0, filt1, out0, out1);
730 LD_SB4(src, src_stride, src0, src1, src2, src3);
733 filt0, filt1, out2, out3);
737 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
739 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
746 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
747 v16i8 filt0, filt1, mask0, mask1;
749 v8i16
filt, out0, out1, out2, out3;
755 filt =
LD_SH(filter);
760 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
761 src += (8 * src_stride);
764 filt0, filt1, out0, out1);
766 filt0, filt1, out2, out3);
770 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
771 dst += (4 * dst_stride);
773 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
774 dst += (4 * dst_stride);
776 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
777 src += (8 * src_stride);
780 filt0, filt1, out0, out1);
782 filt0, filt1, out2, out3);
786 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
787 dst += (4 * dst_stride);
789 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
794 int height,
int mx,
int my)
800 }
else if (8 == height) {
802 }
else if (16 == height) {
809 int height,
int mx,
int my)
813 v16i8
src0,
src1, src2, src3, filt0, filt1, mask0, mask1;
815 v8i16
filt, out0, out1, out2, out3;
821 filt =
LD_SH(filter);
826 for (loop_cnt = (height >> 2); loop_cnt--;) {
827 LD_SB4(src, src_stride, src0, src1, src2, src3);
828 src += (4 * src_stride);
832 filt1, out0, out1, out2, out3);
837 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
838 dst += (4 * dst_stride);
844 int height,
int mx,
int my)
848 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
849 v16i8 filt0, filt1, mask0, mask1;
850 v8i16
filt, out0, out1, out2, out3, out4, out5, out6, out7;
857 filt =
LD_SH(filter);
862 for (loop_cnt = (height >> 2); loop_cnt--;) {
863 LD_SB4(src, src_stride, src0, src2, src4, src6);
864 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
865 src += (4 * src_stride);
869 filt1, out0, out1, out2, out3);
871 filt1, out4, out5, out6, out7);
893 int height,
int mx,
int my)
897 v16i8
src0,
src1, src2, src3, src4, src5;
898 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
899 v16i8 src2110, src4332, filt0, filt1;
900 v8i16
filt, out10, out32;
905 filt =
LD_SH(filter);
908 LD_SB3(src, src_stride, src0, src1, src2);
909 src += (3 * src_stride);
911 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
913 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
914 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
916 for (loop_cnt = (height >> 2); loop_cnt--;) {
917 LD_SB3(src, src_stride, src3, src4, src5);
918 src += (3 * src_stride);
919 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
920 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
921 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
926 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
927 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
928 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
933 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
934 dst += (4 * dst_stride);
940 int height,
int mx,
int my)
944 v16i8
src0,
src1, src2, src7, src8, src9, src10;
945 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
947 v8i16
filt, out0_r, out1_r, out2_r, out3_r;
951 filt =
LD_SH(filter);
954 LD_SB3(src, src_stride, src0, src1, src2);
955 src += (3 * src_stride);
958 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
960 for (loop_cnt = (height >> 2); loop_cnt--;) {
961 LD_SB4(src, src_stride, src7, src8, src9, src10);
962 src += (4 * src_stride);
965 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
966 src72_r, src87_r, src98_r, src109_r);
972 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
975 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
976 dst += (4 * dst_stride);
986 int height,
int mx,
int my)
990 v16i8
src0,
src1, src2, src3, src4, src5, src6;
991 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
992 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
993 v16u8 tmp0, tmp1, tmp2, tmp3;
994 v8i16
filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
998 filt =
LD_SH(filter);
1001 LD_SB3(src, src_stride, src0, src1, src2);
1002 src += (3 * src_stride);
1005 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
1006 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
1008 for (loop_cnt = (height >> 2); loop_cnt--;) {
1009 LD_SB4(src, src_stride, src3, src4, src5, src6);
1010 src += (4 * src_stride);
1013 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1014 src32_r, src43_r, src54_r, src65_r);
1015 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
1016 src32_l, src43_l, src54_l, src65_l);
1027 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1028 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1029 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1030 out3_r, tmp0, tmp1, tmp2, tmp3);
1032 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1033 dst += (4 * dst_stride);
1045 int height,
int mx,
int my)
1050 v16i8
src0,
src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1051 v16u8 mask0, mask1,
out;
1052 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1053 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1056 src -= (1 + 1 * src_stride);
1059 filt =
LD_SH(filter_horiz);
1064 LD_SB3(src, src_stride, src0, src1, src2);
1065 src += (3 * src_stride);
1068 hz_out0 =
HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1069 hz_out1 =
HORIZ_4TAP_FILT(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
1070 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1072 filt =
LD_SH(filter_vert);
1075 for (loop_cnt = (height >> 2); loop_cnt--;) {
1076 LD_SB4(src, src_stride, src3, src4, src5, src6);
1077 src += (4 * src_stride);
1080 hz_out3 =
HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1081 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1082 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1086 hz_out5 =
HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1087 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1088 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1094 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1095 dst += (4 * dst_stride);
1104 int height,
int mx,
int my)
1109 v16i8
src0,
src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
1110 v16u8 mask0, mask1, out0, out1;
1111 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
1112 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1113 v8i16 vec0, vec1, vec2, vec3, vec4;
1116 src -= (1 + 1 * src_stride);
1119 filt =
LD_SH(filter_horiz);
1124 LD_SB3(src, src_stride, src0, src1, src2);
1125 src += (3 * src_stride);
1128 hz_out0 =
HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1129 hz_out1 =
HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1130 hz_out2 =
HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1131 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1133 filt =
LD_SH(filter_vert);
1136 for (loop_cnt = (height >> 2); loop_cnt--;) {
1137 LD_SB4(src, src_stride, src3, src4, src5, src6);
1138 src += (4 * src_stride);
1141 hz_out3 =
HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1142 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1145 hz_out0 =
HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1146 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1149 hz_out1 =
HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1150 vec4 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1153 hz_out2 =
HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1154 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec0, vec1);
1161 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1162 dst += (4 * dst_stride);
1171 int height,
int mx,
int my)
1175 for (multiple8_cnt = 2; multiple8_cnt--;) {
1186 int height,
int mx,
int my)
1191 v16i8
src0,
src1, src2, src3, src4, src5, src6;
1192 v16i8 filt_hz0, filt_hz1, filt_hz2;
1193 v16u8 res0, res1, mask0, mask1, mask2;
1194 v8i16
filt, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
1195 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
1198 src -= (2 + 1 * src_stride);
1201 filt =
LD_SH(filter_horiz);
1202 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1207 LD_SB3(src, src_stride, src0, src1, src2);
1208 src += (3 * src_stride);
1212 filt_hz1, filt_hz2);
1214 filt_hz1, filt_hz2);
1215 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1217 filt =
LD_SH(filter_vert);
1220 for (loop_cnt = (height >> 2); loop_cnt--;) {
1221 LD_SB4(src, src_stride, src3, src4, src5, src6);
1222 src += (4 * src_stride);
1226 filt_hz1, filt_hz2);
1227 hz_out2 = (v8i16) __msa_sldi_b((v16i8) hz_out3, (v16i8) hz_out1, 8);
1228 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1232 filt_hz1, filt_hz2);
1233 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1234 vec2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1241 ST_W2(res0, 0, 1, dst, dst_stride);
1242 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1243 dst += (4 * dst_stride);
1252 int height,
int mx,
int my)
1257 v16i8
src0,
src1, src2, src3, src4, src5, src6;
1258 v16i8 filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
1259 v8i16
filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
1260 v8i16 tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
1264 src -= (2 + src_stride);
1267 filt =
LD_SH(filter_horiz);
1268 SPLATI_H3_SB(filt, 0, 1, 2, filt_hz0, filt_hz1, filt_hz2);
1273 LD_SB3(src, src_stride, src0, src1, src2);
1274 src += (3 * src_stride);
1278 filt_hz1, filt_hz2);
1280 filt_hz1, filt_hz2);
1282 filt_hz1, filt_hz2);
1283 ILVEV_B2_SH(hz_out0, hz_out1, hz_out1, hz_out2, vec0, vec2);
1285 filt =
LD_SH(filter_vert);
1288 for (loop_cnt = (height >> 2); loop_cnt--;) {
1289 LD_SB4(src, src_stride, src3, src4, src5, src6);
1290 src += (4 * src_stride);
1295 filt_hz1, filt_hz2);
1296 vec1 = (v8i16) __msa_ilvev_b((v16i8) hz_out3, (v16i8) hz_out2);
1300 filt_hz1, filt_hz2);
1301 vec3 = (v8i16) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out3);
1305 filt_hz1, filt_hz2);
1306 vec0 = (v8i16) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
1310 filt_hz1, filt_hz2);
1311 ILVEV_B2_SH(hz_out3, hz_out0, hz_out1, hz_out2, vec1, vec2);
1318 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1319 dst += (4 * dst_stride);
1325 int height,
int mx,
int my)
1329 for (multiple8_cnt = 2; multiple8_cnt--;) {
1340 int height,
int mx,
int my)
1345 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1346 v16i8 filt_hz0, filt_hz1, mask0, mask1;
1348 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1349 v8i16 hz_out7, tmp0, tmp1, out0, out1, out2, out3;
1350 v8i16
filt, filt_vt0, filt_vt1, filt_vt2;
1354 src -= (1 + 2 * src_stride);
1357 filt =
LD_SH(filter_horiz);
1362 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1363 src += (5 * src_stride);
1366 hz_out0 =
HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
1367 hz_out2 =
HORIZ_4TAP_FILT(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
1368 hz_out3 =
HORIZ_4TAP_FILT(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
1369 hz_out1 = (v8i16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1370 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1372 filt =
LD_SH(filter_vert);
1373 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1375 for (loop_cnt = (height >> 2); loop_cnt--;) {
1376 LD_SB4(src, src_stride, src5, src6, src7, src8);
1378 src += (4 * src_stride);
1380 hz_out5 =
HORIZ_4TAP_FILT(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
1381 hz_out4 = (v8i16) __msa_sldi_b((v16i8) hz_out5, (v16i8) hz_out3, 8);
1382 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1383 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1385 hz_out7 =
HORIZ_4TAP_FILT(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
1386 hz_out6 = (v8i16) __msa_sldi_b((v16i8) hz_out7, (v16i8) hz_out5, 8);
1387 out3 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1388 tmp1 =
DPADD_SH3_SH(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
1393 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1394 dst += (4 * dst_stride);
1404 int height,
int mx,
int my)
1409 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1410 v16i8 filt_hz0, filt_hz1, mask0, mask1;
1411 v8i16
filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
1412 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1413 v8i16 hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
1417 src -= (1 + 2 * src_stride);
1420 filt =
LD_SH(filter_horiz);
1425 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1426 src += (5 * src_stride);
1429 hz_out0 =
HORIZ_4TAP_FILT(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
1430 hz_out1 =
HORIZ_4TAP_FILT(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
1431 hz_out2 =
HORIZ_4TAP_FILT(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
1432 hz_out3 =
HORIZ_4TAP_FILT(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
1433 hz_out4 =
HORIZ_4TAP_FILT(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
1434 ILVEV_B2_SH(hz_out0, hz_out1, hz_out2, hz_out3, out0, out1);
1435 ILVEV_B2_SH(hz_out1, hz_out2, hz_out3, hz_out4, out3, out4);
1437 filt =
LD_SH(filter_vert);
1438 SPLATI_H3_SH(filt, 0, 1, 2, filt_vt0, filt_vt1, filt_vt2);
1440 for (loop_cnt = (height >> 2); loop_cnt--;) {
1441 LD_SB4(src, src_stride, src5, src6, src7, src8);
1442 src += (4 * src_stride);
1446 hz_out5 =
HORIZ_4TAP_FILT(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
1447 out2 = (v8i16) __msa_ilvev_b((v16i8) hz_out5, (v16i8) hz_out4);
1448 tmp0 =
DPADD_SH3_SH(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
1450 hz_out6 =
HORIZ_4TAP_FILT(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
1451 out5 = (v8i16) __msa_ilvev_b((v16i8) hz_out6, (v16i8) hz_out5);
1452 tmp1 =
DPADD_SH3_SH(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
1454 hz_out7 =
HORIZ_4TAP_FILT(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
1455 out6 = (v8i16) __msa_ilvev_b((v16i8) hz_out7, (v16i8) hz_out6);
1456 tmp2 =
DPADD_SH3_SH(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
1458 hz_out8 =
HORIZ_4TAP_FILT(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
1459 out7 = (v8i16) __msa_ilvev_b((v16i8) hz_out8, (v16i8) hz_out7);
1460 tmp3 =
DPADD_SH3_SH(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
1466 ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
1467 dst += (4 * dst_stride);
1479 int height,
int mx,
int my)
1483 for (multiple8_cnt = 2; multiple8_cnt--;) {
1497 v16u8 filt0, vec0, vec1, res0, res1;
1498 v8u16 vec2, vec3,
filt;
1503 filt =
LD_UH(filter);
1504 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1506 LD_SB4(src, src_stride, src0, src1, src2, src3);
1507 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1508 DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
1511 ST_W2(res0, 0, 1, dst, dst_stride);
1512 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1519 v16u8 vec0, vec1, vec2, vec3, filt0;
1520 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
mask;
1521 v16i8 res0, res1, res2, res3;
1522 v8u16 vec4, vec5, vec6, vec7,
filt;
1527 filt =
LD_UH(filter);
1528 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1530 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1531 VSHF_B2_UB(src0, src1, src2, src3, mask, mask, vec0, vec1);
1532 VSHF_B2_UB(src4, src5, src6, src7, mask, mask, vec2, vec3);
1533 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1534 vec4, vec5, vec6, vec7);
1536 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
1537 res0, res1, res2, res3);
1538 ST_W2(res0, 0, 1, dst, dst_stride);
1539 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1540 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
1541 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
1546 int height,
int mx,
int my)
1552 }
else if (8 == height) {
1563 v8u16 vec0, vec1, vec2, vec3,
filt;
1568 filt =
LD_UH(filter);
1569 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1571 LD_SB4(src, src_stride, src0, src1, src2, src3);
1572 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1573 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1574 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1575 vec0, vec1, vec2, vec3);
1578 ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
1587 v8u16 vec0, vec1, vec2, vec3,
filt;
1592 filt =
LD_UH(filter);
1593 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1595 LD_SB4(src, src_stride, src0, src1, src2, src3);
1596 src += (4 * src_stride);
1598 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1599 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1600 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1601 vec0, vec1, vec2, vec3);
1604 LD_SB4(src, src_stride, src0, src1, src2, src3);
1605 src += (4 * src_stride);
1608 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1610 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1611 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1612 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1613 vec0, vec1, vec2, vec3);
1616 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1617 dst += (8 * dst_stride);
1620 LD_SB4(src, src_stride, src0, src1, src2, src3);
1621 src += (4 * src_stride);
1623 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1624 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1625 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1626 vec0, vec1, vec2, vec3);
1628 LD_SB4(src, src_stride, src0, src1, src2, src3);
1629 src += (4 * src_stride);
1632 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1634 VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
1635 VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
1636 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1637 vec0, vec1, vec2, vec3);
1640 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1646 int height,
int mx,
int my)
1660 int height,
int mx,
int my)
1664 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
mask;
1665 v16u8 filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1666 v8u16 out0, out1, out2, out3, out4, out5, out6, out7,
filt;
1670 loop_cnt = (height >> 2) - 1;
1673 filt =
LD_UH(filter);
1674 filt0 = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1676 LD_SB4(src, src_stride, src0, src2, src4, src6);
1677 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1678 src += (4 * src_stride);
1680 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1681 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1682 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1683 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1684 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1685 out0, out1, out2, out3);
1686 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1687 out4, out5, out6, out7);
1699 for (; loop_cnt--;) {
1700 LD_SB4(src, src_stride, src0, src2, src4, src6);
1701 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
1702 src += (4 * src_stride);
1704 VSHF_B2_UB(src0, src0, src1, src1, mask, mask, vec0, vec1);
1705 VSHF_B2_UB(src2, src2, src3, src3, mask, mask, vec2, vec3);
1706 VSHF_B2_UB(src4, src4, src5, src5, mask, mask, vec4, vec5);
1707 VSHF_B2_UB(src6, src6, src7, src7, mask, mask, vec6, vec7);
1708 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1709 out0, out1, out2, out3);
1710 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1711 out4, out5, out6, out7);
1729 v16i8
src0,
src1, src2, src3, src4;
1730 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
1735 filt =
LD_SH(filter);
1736 filt0 = (v16u8) __msa_splati_h(filt, 0);
1738 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1739 src += (5 * src_stride);
1741 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1742 src10_r, src21_r, src32_r, src43_r);
1743 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1744 DOTP_UB2_UH(src2110, src4332, filt0, filt0, tmp0, tmp1);
1747 src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
1748 ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
1755 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1756 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r;
1757 v16i8 src65_r, src87_r, src2110, src4332, src6554, src8776;
1758 v8u16 tmp0, tmp1, tmp2, tmp3;
1762 filt =
LD_SH(filter);
1763 filt0 = (v16u8) __msa_splati_h(filt, 0);
1765 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1766 src += (8 * src_stride);
1771 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1773 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1775 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1776 src87_r, src76_r, src2110, src4332, src6554, src8776);
1777 DOTP_UB4_UH(src2110, src4332, src6554, src8776, filt0, filt0, filt0, filt0,
1778 tmp0, tmp1, tmp2, tmp3);
1781 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
1782 ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1787 int height,
int mx,
int my)
1793 }
else if (8 == height) {
1802 v16u8
src0,
src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
1804 v8u16 tmp0, tmp1, tmp2, tmp3;
1808 filt =
LD_SH(filter);
1809 filt0 = (v16u8) __msa_splati_h(filt, 0);
1811 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
1812 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec1);
1813 ILVR_B2_UB(src3, src2, src4, src3, vec2, vec3);
1814 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1815 tmp0, tmp1, tmp2, tmp3);
1819 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1827 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1828 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1830 v8u16 tmp0, tmp1, tmp2, tmp3;
1834 filt =
LD_SH(filter);
1835 filt0 = (v16u8) __msa_splati_h(filt, 0);
1840 for (loop_cnt = (height >> 3); loop_cnt--;) {
1841 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
1842 src += (8 * src_stride);
1844 ILVR_B4_UB(src1, src0, src2, src1, src3, src2, src4, src3,
1845 vec0, vec1, vec2, vec3);
1846 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1847 vec4, vec5, vec6, vec7);
1848 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
1849 tmp0, tmp1, tmp2, tmp3);
1853 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
1855 DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1856 tmp0, tmp1, tmp2, tmp3);
1860 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
1861 dst += (8 * dst_stride);
1869 int height,
int mx,
int my)
1883 int height,
int mx,
int my)
1887 v16u8
src0,
src1, src2, src3, src4;
1888 v16u8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
1889 v8u16 tmp0, tmp1, tmp2, tmp3;
1893 filt =
LD_SH(filter);
1894 filt0 = (v16u8) __msa_splati_h(filt, 0);
1899 for (loop_cnt = (height >> 2); loop_cnt--;) {
1900 LD_UB4(src, src_stride, src1, src2, src3, src4);
1901 src += (4 * src_stride);
1903 ILVR_B2_UB(src1, src0, src2, src1, vec0, vec2);
1904 ILVL_B2_UB(src1, src0, src2, src1, vec1, vec3);
1905 DOTP_UB2_UH(vec0, vec1, filt0, filt0, tmp0, tmp1);
1911 ILVR_B2_UB(src3, src2, src4, src3, vec4, vec6);
1912 ILVL_B2_UB(src3, src2, src4, src3, vec5, vec7);
1913 DOTP_UB2_UH(vec2, vec3, filt0, filt0, tmp2, tmp3);
1919 DOTP_UB2_UH(vec4, vec5, filt0, filt0, tmp0, tmp1);
1925 DOTP_UB2_UH(vec6, vec7, filt0, filt0, tmp2, tmp3);
1937 const int8_t *filter_horiz,
1938 const int8_t *filter_vert)
1941 v16u8 filt_vt, filt_hz, vec0, vec1, res0, res1;
1942 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4,
filt, tmp0, tmp1;
1947 filt =
LD_UH(filter_horiz);
1948 filt_hz = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1950 filt =
LD_UH(filter_vert);
1951 filt_vt = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1953 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1957 hz_out1 = (v8u16) __msa_sldi_b((v16i8) hz_out2, (v16i8) hz_out0, 8);
1958 hz_out3 = (v8u16) __msa_pckod_d((v2i64) hz_out4, (v2i64) hz_out2);
1960 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
1961 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp0, tmp1);
1965 ST_W2(res0, 0, 1, dst, dst_stride);
1966 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
1971 const int8_t *filter_horiz,
1972 const int8_t *filter_vert)
1974 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8,
mask;
1975 v16i8 res0, res1, res2, res3;
1976 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
1977 v8u16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1978 v8u16 hz_out7, hz_out8, vec4, vec5, vec6, vec7,
filt;
1983 filt =
LD_UH(filter_horiz);
1984 filt_hz = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1986 filt =
LD_UH(filter_vert);
1987 filt_vt = (v16u8) __msa_splati_h((v8i16)
filt, 0);
1989 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1990 src += (8 * src_stride);
1998 SLDI_B3_UH(hz_out2, hz_out0, hz_out4, hz_out2, hz_out6, hz_out4, 8, hz_out1,
2000 hz_out7 = (v8u16) __msa_pckod_d((v2i64) hz_out8, (v2i64) hz_out6);
2002 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2003 ILVEV_B2_UB(hz_out4, hz_out5, hz_out6, hz_out7, vec2, vec3);
2004 DOTP_UB4_UH(vec0, vec1, vec2, vec3, filt_vt, filt_vt, filt_vt, filt_vt,
2005 vec4, vec5, vec6, vec7);
2008 PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
2009 res0, res1, res2, res3);
2010 ST_W2(res0, 0, 1, dst, dst_stride);
2011 ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
2012 ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
2013 ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
2018 int height,
int mx,
int my)
2025 filter_horiz, filter_vert);
2026 }
else if (8 == height) {
2028 filter_horiz, filter_vert);
2034 const int8_t *filter_horiz,
2035 const int8_t *filter_vert)
2038 v16u8 filt_hz, filt_vt, vec0, vec1, vec2, vec3;
2039 v8u16 hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
2045 filt =
LD_SH(filter_horiz);
2046 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2048 filt =
LD_SH(filter_vert);
2049 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2051 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2055 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2056 tmp0 = __msa_dotp_u_h(vec0, filt_vt);
2059 vec1 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2060 tmp1 = __msa_dotp_u_h(vec1, filt_vt);
2063 vec2 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2064 tmp2 = __msa_dotp_u_h(vec2, filt_vt);
2067 vec3 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2068 tmp3 = __msa_dotp_u_h(vec3, filt_vt);
2073 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2078 const int8_t *filter_horiz,
2079 const int8_t *filter_vert,
2084 v16u8 filt_hz, filt_vt, vec0;
2085 v8u16 hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
2091 filt =
LD_SH(filter_horiz);
2092 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2094 filt =
LD_SH(filter_vert);
2095 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2102 for (loop_cnt = (height >> 3); loop_cnt--;) {
2103 LD_SB4(src, src_stride, src1, src2, src3, src4);
2104 src += (4 * src_stride);
2107 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2108 tmp1 = __msa_dotp_u_h(vec0, filt_vt);
2111 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2112 tmp2 = __msa_dotp_u_h(vec0, filt_vt);
2118 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2119 tmp3 = __msa_dotp_u_h(vec0, filt_vt);
2122 LD_SB4(src, src_stride, src1, src2, src3, src4);
2123 src += (4 * src_stride);
2124 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2125 tmp4 = __msa_dotp_u_h(vec0, filt_vt);
2130 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2133 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2134 tmp5 = __msa_dotp_u_h(vec0, filt_vt);
2137 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2138 tmp6 = __msa_dotp_u_h(vec0, filt_vt);
2141 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
2142 tmp7 = __msa_dotp_u_h(vec0, filt_vt);
2145 vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out0, (v16i8) hz_out1);
2146 tmp8 = __msa_dotp_u_h(vec0, filt_vt);
2151 ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
2152 dst += (8 * dst_stride);
2158 int height,
int mx,
int my)
2165 filter_horiz, filter_vert);
2168 filter_horiz, filter_vert, height);
2174 int height,
int mx,
int my)
2179 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
mask;
2180 v16u8 filt_hz, filt_vt, vec0, vec1;
2181 v8u16 tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
2187 filt =
LD_SH(filter_horiz);
2188 filt_hz = (v16u8) __msa_splati_h(filt, 0);
2190 filt =
LD_SH(filter_vert);
2191 filt_vt = (v16u8) __msa_splati_h(filt, 0);
2193 LD_SB2(src, 8, src0, src1);
2200 for (loop_cnt = (height >> 2); loop_cnt--;) {
2201 LD_SB4(src, src_stride, src0, src2, src4, src6);
2202 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2203 src += (4 * src_stride);
2207 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2208 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2216 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2217 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2225 ILVEV_B2_UB(hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
2226 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2234 ILVEV_B2_UB(hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
2235 DOTP_UB2_UH(vec0, vec1, filt_vt, filt_vt, tmp1, tmp2);
2245 int height,
int mx,
int my)
2248 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2249 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2251 if (0 == height % 8) {
2252 for (cnt = height >> 3; cnt--;) {
2254 src0, src1, src2, src3, src4, src5, src6, src7);
2255 src += (8 * src_stride);
2257 out0 = __msa_copy_u_d((v2i64) src0, 0);
2258 out1 = __msa_copy_u_d((v2i64) src1, 0);
2259 out2 = __msa_copy_u_d((v2i64) src2, 0);
2260 out3 = __msa_copy_u_d((v2i64) src3, 0);
2261 out4 = __msa_copy_u_d((v2i64) src4, 0);
2262 out5 = __msa_copy_u_d((v2i64) src5, 0);
2263 out6 = __msa_copy_u_d((v2i64) src6, 0);
2264 out7 = __msa_copy_u_d((v2i64) src7, 0);
2266 SD4(out0, out1, out2, out3, dst, dst_stride);
2267 dst += (4 * dst_stride);
2268 SD4(out4, out5, out6, out7, dst, dst_stride);
2269 dst += (4 * dst_stride);
2271 }
else if (0 == height % 4) {
2272 for (cnt = (height / 4); cnt--;) {
2273 LD_UB4(src, src_stride, src0, src1, src2, src3);
2274 src += (4 * src_stride);
2275 out0 = __msa_copy_u_d((v2i64) src0, 0);
2276 out1 = __msa_copy_u_d((v2i64) src1, 0);
2277 out2 = __msa_copy_u_d((v2i64) src2, 0);
2278 out3 = __msa_copy_u_d((v2i64) src3, 0);
2280 SD4(out0, out1, out2, out3, dst, dst_stride);
2281 dst += (4 * dst_stride);
2292 v16u8
src0,
src1, src2, src3, src4, src5, src6, src7;
2294 for (cnt = (width >> 4); cnt--;) {
2298 for (loop_cnt = (height >> 3); loop_cnt--;) {
2299 LD_UB8(src_tmp, src_stride,
2300 src0, src1, src2, src3, src4, src5, src6, src7);
2301 src_tmp += (8 * src_stride);
2303 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2304 dst_tmp, dst_stride);
2305 dst_tmp += (8 * dst_stride);
2315 int height,
int mx,
int my)
2320 if (0 == height % 8) {
2322 }
else if (0 == height % 4) {
2323 for (cnt = (height >> 2); cnt--;) {
2324 LD_UB4(src, src_stride, src0, src1, src2, src3);
2325 src += (4 * src_stride);
2327 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2328 dst += (4 * dst_stride);
void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_bilinear8_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1, out2, out3)
#define SPLATI_H3_SH(...)
void ff_put_vp8_epel16_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define XORI_B2_128_SB(...)
#define PCKEV_XORI128_UB(in0, in1)
#define XORI_B3_128_SB(...)
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)
void ff_put_vp8_pixels8_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2,filt0, filt1, filt2,out0, out1)
#define SPLATI_H2_SH(...)
static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)
#define XORI_B4_128_UB(...)
static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel16_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define PCKEV_ST_SB(in0, in1, pdst)
void ff_put_vp8_epel16_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
VP8 compatible video decoder.
static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B2_128_UB(...)
void ff_put_vp8_epel16_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const int8_t bilinear_filters_msa[7][2]
void ff_put_vp8_bilinear16_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, int32_t height, int32_t width)
void ff_put_vp8_bilinear8_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height)
void ff_put_vp8_epel16_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const int8_t subpel_filters_msa[7][8]
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one frame per link. The additional frames can be left buffered in the filter.
void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const uint16_t mask[17]
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
#define SPLATI_H2_SB(...)
void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define XORI_B4_128_SB(...)
#define HORIZ_6TAP_FILT(src0, src1, mask0, mask1, mask2,filt_h0, filt_h1, filt_h2)
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, filt0, filt1,out0, out1)
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_pixels16_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_bilinear8_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)
static const uint8_t mc_filt_mask_arr[16 *3]
void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
void ff_put_vp8_bilinear16_h_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel16_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_4TAP_FILT(src0, src1, mask0, mask1, filt_h0, filt_h1)
static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel16_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3,mask0, mask1, mask2,filt0, filt1, filt2,out0, out1, out2, out3)
void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SD4(in0, in1, in2, in3, pdst, stride)
void ff_put_vp8_epel16_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static const int8_t filt[NUMTAPS *2]
#define SPLATI_H3_SB(...)
static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
void ff_put_vp8_bilinear16_v_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter)
void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
#define ST_W2(in, idx0, idx1, pdst, stride)
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)
static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert)
void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, uint8_t *src, ptrdiff_t src_stride, int height, int mx, int my)