    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,

    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
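/* These two rows form the shuffle-mask table used by the VSHF-based
 * filters below: the first row serves the 8-wide paths, the second the
 * 4-wide paths (mask indices 16..31 select bytes from VSHF's second
 * source vector). */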
                             int16_t *dst, int32_t dst_stride,

        in0 = (v8i16) __msa_ilvr_b(zero, src0);

        ST_D2(in0, 0, 1, dst, dst_stride);

        ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride);
    } else if (0 == height % 8) {
        v8i16 in0, in1, in2, in3;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            src += (8 * src_stride);

            ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
            dst += (8 * dst_stride);
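/* 4-wide copy: each row is widened u8 -> s16 with a zero interleave
 * (ilvr_b) and written as one 64-bit doubleword (four s16 pixels) per
 * destination row; the height % 8 == 0 path emits eight rows per pass. */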
                             int16_t *dst, int32_t dst_stride,

    uint32_t loop_cnt = (height >> 3);
    uint32_t res = height & 0x07;

    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (; loop_cnt--;) {
        src += (8 * src_stride);

        ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
        dst += (8 * dst_stride);

        in0 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);

        out0 = __msa_copy_u_d((v2i64) in0, 0);
        out1 = __msa_copy_u_w((v4i32) in0, 2);
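/* Residual rows of the 6-wide copy: out0 carries pixels 0..3 as one
 * doubleword and out1 pixels 4..5 as the word at element 2, so each
 * 6-pixel s16 row needs only one SD plus one SW store. */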
                             int16_t *dst, int32_t dst_stride,

        ST_SH2(in0, in1, dst, dst_stride);

        v8i16 in0, in1, in2, in3;

        SLLI_4V(in0, in1, in2, in3, 6);
        ST_SH4(in0, in1, in2, in3, dst, dst_stride);

        v8i16 in0, in1, in2, in3, in4, in5;

        SLLI_4V(in0, in1, in2, in3, 6);

        ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
    } else if (0 == height % 8) {
        v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            src += (8 * src_stride);

            SLLI_4V(in0, in1, in2, in3, 6);
            SLLI_4V(in4, in5, in6, in7, 6);
            ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
            dst += (8 * dst_stride);
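/* SLLI_4V(..., 6) applies the << 6 pre-scaling HEVC requires of copied
 * reference samples in the 16-bit intermediate buffer (2^6 = 64, i.e.
 * 14-bit precision for 8-bit input). */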
                              int16_t *dst, int32_t dst_stride,

    uint32_t res = height & 0x07;

    v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);

                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);

        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);

                   in0_r, in1_r, in2_r, in3_r);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);

        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);

    in0_r = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) src0);
    in0 = (v8i16) __msa_ilvl_b((v16i8) zero, (v16i8) src0);

    out0 = __msa_copy_u_d((v2i64) in0, 0);
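/* 12-wide copy: the left 8 pixels of a row are stored as a full v8i16
 * (ST_SH4) while the high-half interleave (ilvl_b) supplies the
 * remaining 4 pixels, written as doublewords at dst + 8. */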
                              int16_t *dst, int32_t dst_stride,

        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (12 == height) {
        v16i8 src8, src9, src10, src11;
        v8i16 in0_r, in1_r, in2_r, in3_r;
        v8i16 in0_l, in1_l, in2_l, in3_l;

        src += (8 * src_stride);
        LD_SB4(src, src_stride, src8, src9, src10, src11);

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        dst += (4 * dst_stride);

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
    } else if (0 == (height % 8)) {
        v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            src += (8 * src_stride);

                       in1_r, in2_r, in3_r);
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);

                       in1_r, in2_r, in3_r);
                       in1_l, in2_l, in3_l);
            SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
            SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
            ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
            ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
            dst += (4 * dst_stride);
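/* 16-wide copy: ILVR/ILVL split each 16-byte source row into its low and
 * high byte halves, so both 8-pixel halves of the row stay fully
 * vectorized through the shift and store. */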
                              int16_t *dst, int32_t dst_stride,

    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,

        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,

        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
        ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,

        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
        dst += (4 * dst_stride);
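/* 24-wide copy: a 16-wide column (ILVR + ILVL pair) plus an extra 8-wide
 * column loaded from src + 16 and stored at dst + 16. */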
                              int16_t *dst, int32_t dst_stride,

    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);

        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,

        ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,

        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);

        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);

        ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,

        ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,

        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);

        ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
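/* 32-wide copy: with stride 8 (in int16 units) each ST_SH4 writes one
 * complete 32-pixel row as four consecutive v8i16 vectors. */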
                              int16_t *dst, int32_t dst_stride,

    v16i8 src8, src9, src10, src11;
    v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
    v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);

        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);

        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);

        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
        ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);

        ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);

                              int16_t *dst, int32_t dst_stride,

    v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;

    for (loop_cnt = (height >> 1); loop_cnt--;) {

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);

                   in0_r, in1_r, in2_r, in3_r);
                   in0_l, in1_l, in2_l, in3_l);
        SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
        SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
        ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
        ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
                              int16_t *dst, int32_t dst_stride,

    uint32_t res = (height & 0x07) >> 1;

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
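/* Filter setup for the 8-tap horizontal paths: SPLATI_H4_SH replicates
 * the four 16-bit tap pairs across filt0..filt3. const_vec starts at 128
 * and is presumably shifted left by 6 in an elided line: the source
 * bytes are XORed with 128 before the signed dot products, and since the
 * taps sum to 64, seeding each accumulator with 128 << 6 cancels that
 * bias exactly. */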
    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
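/* Core horizontal pattern: VSHF_B4 gathers the four tap pairs for two
 * rows at a time into vec0..vec3, and one DPADD_SB4_SH accumulates all
 * four signed-byte dot products into a single v8i16 of filtered
 * samples. */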
        src += 2 * src_stride;

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        ST_D2(dst0, 0, 1, dst, dst_stride);
        dst += 2 * dst_stride;

                              int16_t *dst, int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

                              int16_t *dst, int32_t dst_stride,

    int64_t res0, res1, res2, res3;

    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 filt0, filt1, filt2, filt3, dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,

        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,

        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,

        VSHF_B2_SB(src4, src5, src6, src7, mask7, mask7, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        res0 = __msa_copy_s_d((v2i64) dst4, 0);
        res1 = __msa_copy_s_d((v2i64) dst4, 1);
        res2 = __msa_copy_s_d((v2i64) dst5, 0);
        res3 = __msa_copy_s_d((v2i64) dst5, 1);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        SD4(res0, res1, res2, res3, (dst + 8), dst_stride);
        dst += (4 * dst_stride);
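/* 12-wide: the left 8 columns take the usual single-source VSHF path and
 * are stored with ST_SH4; the right 4 columns of four rows are filtered
 * two rows per vector (mask4..mask7) and written out as doublewords via
 * SD4 at dst + 8. */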
                              int16_t *dst, int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        ST_SH2(dst0, dst2, dst, dst_stride);
        ST_SH2(dst1, dst3, dst + 8, dst_stride);
        dst += (2 * dst_stride);

                              int16_t *dst, int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        ST_SH2(dst0, dst1, dst, 8);
        ST_SH(dst2, dst + 16);

        ST_SH2(dst3, dst4, dst, 8);
        ST_SH(dst5, dst + 16);

                              int16_t *dst, int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
                              int16_t *dst, int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);

        ST_SH2(dst4, dst5, (dst + 32), 8);

                              int16_t *dst, int32_t dst_stride,

    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        ST_SH(dst1, dst + 8);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        ST_SH(dst2, dst + 16);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);
        ST_SH(dst3, dst + 24);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        ST_SH(dst4, dst + 32);

                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        ST_SH(dst5, dst + 40);

        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);
        ST_SH(dst6, dst + 48);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst7, dst7, dst7, dst7);
        ST_SH(dst7, dst + 56);
                             int16_t *dst, int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    src += (7 * src_stride);

               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
               src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
                   src1211_r, src1110_r, src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);

                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);

                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);

        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        src4332 = src12111110;
        src6554 = src14131312;
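/* Vertical 8-tap, 4-wide: rows are interleaved pairwise (ILVR_B*) and
 * packed two rows per vector with ILVR_D*, so the horizontal DPADD
 * kernel is reused unchanged; at the end of each pass the last packed
 * vectors slide down to seed the next eight rows. */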
        src += 2 * src_stride;
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_r, (v2i64) src76_r);
        src8776 = (v16i8) __msa_xori_b((v16u8) src8776, 128);

                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        ST_D2(dst10, 0, 1, dst, dst_stride);
        dst += 2 * dst_stride;
                             int16_t *dst, int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    src += (7 * src_stride);

               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);

                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);

                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);

                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);
                              int16_t *dst, int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    src += (7 * src_stride);

               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);

                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);

                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);

                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);

                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
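/* 12-wide vertical: the right-interleaved vectors cover the left 8
 * columns, while the high halves are repacked with ILVR_D2 so the extra
 * 4 columns ride through the same kernel and land at dst + 8 via
 * ST_D4. */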
    const uint8_t *src_tmp;

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3;

    src -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = width >> 4; cnt--;) {

        src_tmp += (7 * src_stride);

                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_r, src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                       src76_l, src87_l, src98_l, src109_l);

                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);

                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);

                         filt0, filt1, filt2, filt3,
                         dst2_r, dst2_r, dst2_r, dst2_r);

                         filt0, filt1, filt2, filt3,
                         dst3_r, dst3_r, dst3_r, dst3_r);

                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);

                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

                         filt0, filt1, filt2, filt3,
                         dst2_l, dst2_l, dst2_l, dst2_l);

                         filt0, filt1, filt2, filt3,
                         dst3_l, dst3_l, dst3_l, dst3_l);

            ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
            dst_tmp += (4 * dst_stride);
                              int16_t *dst, int32_t dst_stride,

                              int16_t *dst, int32_t dst_stride,

                              int16_t *dst, int32_t dst_stride,

                              int16_t *dst, int32_t dst_stride,

                              int16_t *dst, int32_t dst_stride,

                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);

    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);

    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);

    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst97, dst97, dst97, dst97);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst108, dst108, dst108, dst108);

        dst76_r = __msa_ilvr_h(dst97, dst66);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

                    filt_h0, filt_h1, filt_h2, filt_h3);

                    filt_h0, filt_h1, filt_h2, filt_h3);

                    filt_h0, filt_h1, filt_h2, filt_h3);

                    filt_h0, filt_h1, filt_h2, filt_h3);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
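/* Two-stage 8-tap HV filtering: the horizontal pass leaves 16-bit row
 * results (dst30..dst108 each hold two rows), the vertical pass combines
 * column-interleaved pairs with HEVC_FILT_8TAP, and SRA_4V(..., 6) drops
 * the extra precision before the packed doubleword stores. */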
    dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);

                                        const int8_t *filter_x,
                                        const int8_t *filter_y,

    uint32_t loop_cnt, cnt;
    const uint8_t *src_tmp;

    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    for (cnt = width >> 3; cnt--;) {

        src_tmp += (7 * src_stride);

                   vec0, vec1, vec2, vec3);
                   vec4, vec5, vec6, vec7);
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);

        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);

        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);

        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);

        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);

        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src_tmp += src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

                        filt_h0, filt_h1, filt_h2, filt_h3);

                        filt_h0, filt_h1, filt_h2, filt_h3);

            dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            ST_SW(dst0_r, dst_tmp);
            dst_tmp += dst_stride;
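/* One output row per inner iteration: the two v4i32 vertical results are
 * narrowed back to 16 bits with pckev_h and stored as a single 8-sample
 * row segment. */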
                             int16_t *dst, int32_t dst_stride,
                             const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 8);

                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,

    const uint8_t *src_tmp;

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);

    src_tmp += (7 * src_stride);

    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,

    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,

    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,

    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,

    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,

    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,

    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,

    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,

    for (loop_cnt = height; loop_cnt--;) {
        src7 = LD_SB(src_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src_tmp += src_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);

        dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        ST_SW(dst0_r, dst_tmp);
        dst_tmp += dst_stride;
    src += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,

    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
                 dst30, dst30, dst30);

    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
                 dst41, dst41, dst41);

    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
                 dst52, dst52, dst52);

    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
                 dst63, dst63, dst63);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,

        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
                     dst97, dst97, dst97);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
                     dst108, dst108, dst108);

        dst76_r = __msa_ilvr_h(dst97, dst66);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98_r = __msa_ilvr_h(dst66, dst108);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
        ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
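/* 12-wide HV: the 8-wide column reuses the one-row-at-a-time kernel
 * above, then the remaining 4 columns rerun the filter with the
 * mask4..mask7 variants, four rows per pass. */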
                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 16);

                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 24);

                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 32);

                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 48);

                              int16_t *dst, int32_t dst_stride,
                              const int8_t *filter_x, const int8_t *filter_y,

                               filter_x, filter_y, height, 64);
    v16i8 mask1, vec0, vec1;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    ST_D2(dst0, 0, 1, dst, dst_stride);

    v16i8 mask1, vec0, vec1;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);

    v16i8 mask1, vec0, vec1;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        src += (8 * src_stride);

        VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);

        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);
    } else if (4 == height) {

    } else if (0 == height % 8) {

    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    v8i16 filt0, filt1, dst0, dst1, dst2, dst3;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = 2; loop_cnt--;) {
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);

        SW(dst_val_int0, dst + 4);

        SW(dst_val_int1, dst + 4);

        SW(dst_val_int2, dst + 4);

        SW(dst_val_int3, dst + 4);
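/* 6-wide store pattern: each filtered row is split into a 64-bit piece
 * (pixels 0..3) and the 32-bit word at element 2 (pixels 4..5), the
 * latter stored with SW at dst + 4. */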
    v8i16 filt0, filt1, dst0, dst1;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src += (2 * src_stride);

        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);

    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);

    v16i8 src4, src5, src6, src7;

    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);

        ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
        ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    int16_t *dst_tmp = dst + 16;

    v16i8 mask1, mask00, mask11;

    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    mask11 = mask0 + 10;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {

        src += (4 * src_stride);

        ST_SH2(dst0, dst1, dst, 8);

        ST_SH2(dst2, dst3, dst, 8);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);

        ST_SH2(dst0, dst1, dst, 8);

        ST_SH2(dst2, dst3, dst, 8);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);

        ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
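/* 24-wide 4-tap: mask11 = mask0 + 10 (with, presumably, an elided
 * mask00 = mask0 + 8) shifts the gather indices so the two-operand VSHF
 * calls pick taps straddling the boundary between adjacent 16-byte loads
 * (src4|src5, src6|src7); the trailing 8 columns are filtered separately
 * into dst_tmp. */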
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,

        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,

        ST_SH4(dst0, dst1, dst2, dst3, dst, 8);

    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

               src10_r, src21_r, src32_r, src43_r);

    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);

    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);

    ST_D2(dst10, 0, 1, dst, dst_stride);

    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

               src10_r, src21_r, src32_r, src43_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);

    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);

    ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst10, dst32, dst54, dst76;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);

               src32_r, src43_r, src54_r, src65_r);
    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
               src76_r, src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);

    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
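/* 4-wide vertical 4-tap: pairwise byte interleaves packed two rows per
 * vector (src2110, src4332, ...) let each DPADD_SB2_SH produce two
 * filtered rows, so one pass yields the eight rows stored by ST_D8. */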
                              int16_t *dst, int32_t dst_stride,

    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;

    v8i16 dst10, dst32, dst54, dst76, filt0, filt1, filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);

    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
    dst += (8 * dst_stride);

    LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
    src += (8 * src_stride);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5, src32_r, src43_r,

    ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    ILVR_D4_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r, src109_r,
               src98_r, src4332, src6554, src8776, src10998);

    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
    DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
    ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
    } else if (4 == height) {

    } else if (8 == height) {

    } else if (16 == height) {
    uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
    uint64_t dst_val0, dst_val1, dst_val2, dst_val3;

    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (2 * src_stride);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        src += (2 * src_stride);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);

        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);

        dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
        dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
        dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
        dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);

        dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
        dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
        dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
        dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);

        SW(dst_val_int0, dst + 4);

        SW(dst_val_int1, dst + 4);

        SW(dst_val_int2, dst + 4);

        SW(dst_val_int3, dst + 4);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
    dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);

    dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
    dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);

    SW(dst_val_int0, dst + 4);

    SW(dst_val_int1, dst + 4);
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);

    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    src += (2 * src_stride);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    src += (2 * src_stride);

    DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);

    DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ST_SH2(dst0_r, dst1_r, dst, dst_stride);
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        dst += (4 * dst_stride);

    } else if (6 == height) {
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332;
    v16i8 src54_r, src65_r, src6554;
    v8i16 dst0_l, dst1_l;

    v8i16 filter_vec, const_vec;

    src -= (1 * src_stride);
    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = 4; loop_cnt--;) {
        src += (2 * src_stride);
        src += (2 * src_stride);

        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);

        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);

        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);

        ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
        ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
        dst += (4 * dst_stride);
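/* 12-wide vertical 4-tap: the right interleaves feed the left 8 columns
 * and the doubleword-packed high halves (src2110, src4332, src6554)
 * push the last 4 columns through the same kernel into dst + 8. */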
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src += (2 * src_stride);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);

        ST_SH2(dst1_r, dst1_l, dst, 8);

        src += (2 * src_stride);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        ST_SH2(dst0_r, dst0_l, dst, 8);

        ST_SH2(dst1_r, dst1_l, dst, 8);
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_l, dst1_l;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);

        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);

        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);

        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);

        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);

        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);

        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        ST_SH2(dst0_r, dst0_l, dst, 8);
        ST_SH(dst2_r, dst + 16);

        ST_SH2(dst1_r, dst1_l, dst, 8);
        ST_SH(dst3_r, dst + 16);
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;

    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {

        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);

        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);

        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);

        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);

        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);

        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);

        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);

        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);

        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);

        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);

        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);

        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);

        DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);

        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);

        DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);

        ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);

        ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
                             const int8_t *filter_x,
                             const int8_t *filter_y)

    v8i16 filt_h0, filt_h1;

    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;

    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);

                             const int8_t *filter_x,
                             const int8_t *filter_y)

    v8i16 filt_h0, filt_h1;

    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filter_vec, const_vec;
    v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0, dst1, dst2, dst3, 6);

    ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride);
                              const int8_t *filter_x,
                              const int8_t *filter_y,

    v16i8 src7, src8, src9, src10;

    v8i16 filt_h0, filt_h1;

    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    src += (3 * src_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    dst0, dst1, dst2, dst3);
        ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
                       const int8_t *filter_x,
                       const int8_t *filter_y,
                       int32_t height)
{
    if (2 == height) {
                           filter_x, filter_y);
    } else if (4 == height) {
                           filter_x, filter_y);
    } else if (0 == (height % 8)) {
                                   filter_x, filter_y, height);
    }
}
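/*
 * The call sites in the dispatcher above were elided in this listing; its
 * shape is presumably as below (function names assumed from the fixed-height
 * variants defined earlier; the 8-wide dispatcher further down follows the
 * same pattern):
 *
 *     if (2 == height) {
 *         hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
 *                            filter_x, filter_y);
 *     } else if (4 == height) {
 *         hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
 *                            filter_x, filter_y);
 *     } else if (0 == (height % 8)) {
 *         hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
 *                                    filter_x, filter_y, height);
 *     }
 */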
                          const int8_t *filter_x,
                          const int8_t *filter_y,
                          int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
    v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_H2_SH(filter_vec, 0, 1, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    src += (3 * src_stride);

    LD_SB8(src, src_stride,
           src3, src4, src5, src6, src7, src8, src9, src10);

    /* horizontal pass, one row per shuffle pair */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);

    /* pack the _l terms pairwise for the vertical pass */
    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
    /* 6 columns per row: 4 via doubleword stores plus 2 via word stores */
    ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
    ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
}
                             const int8_t *filter_x,
                             const int8_t *filter_y)
{
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_H2_SH(filter_vec, 0, 1, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
}
                                 int16_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t width8mult)
{
    int32_t cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_H2_SH(filter_vec, 0, 1, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* one iteration per 8-column stripe */
    for (cnt = width8mult; cnt--;) {
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);

        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
    }
}
                             const int8_t *filter_x,
                             const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_H2_SH(filter_vec, 0, 1, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    /* horizontal pass: 6 output rows need 6 + 3 = 9 input rows */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);

    ST_SW2(dst0_r, dst1_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst2_r, dst3_r, dst, dst_stride);
    dst += (2 * dst_stride);
    ST_SW2(dst4_r, dst5_r, dst, dst_stride);
}
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
                                     int32_t height, int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    const uint8_t *src_tmp;
    int16_t *dst_tmp;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_H2_SH(filter_vec, 0, 1, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        src_tmp += (3 * src_stride);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        dst0_r, dst1_r, dst2_r, dst3_r);

            ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        /* advance to the next 8-column stripe */
        src += 8;
        dst += 8;
    }
}
                       const int8_t *filter_x,
                       const int8_t *filter_y,
                       int32_t height)
{
    if (2 == height) {
                           filter_x, filter_y);
    } else if (4 == height) {
                           filter_x, filter_y, 1);
    } else if (6 == height) {
                           filter_x, filter_y);
    } else if (0 == (height % 4)) {
                           filter_x, filter_y, height, 1);
    }
}
                           const int8_t *filter_x,
                           const int8_t *filter_y,
                           int32_t height)
{
    uint32_t loop_cnt;
    const uint8_t *src_tmp;
    int16_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
    v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);
    SPLATI_H2_SH(filter_vec, 0, 1, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* left 8 columns, 4 rows per iteration */
    src_tmp = src;
    dst_tmp = dst;
    src_tmp += (3 * src_stride);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
        ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }

    /* right 4 columns, two rows per vector via mask2/mask3 */
    src += (3 * src_stride);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
               src10);
        src += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
        PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6,
                    tmp0, tmp1, tmp2, tmp3);
        ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
                          const int8_t *filter_x,
                          const int8_t *filter_y,
                          int32_t height)
{
    if (4 == height) {
                               filter_x, filter_y, 2);
    } else {
                                   filter_x, filter_y, height, 2);
    }
}

                          const int8_t *filter_x,
                          const int8_t *filter_y,
                          int32_t height)
{
                           filter_x, filter_y, height, 3);
}

                          const int8_t *filter_x,
                          const int8_t *filter_y,
                          int32_t height)
{
                           filter_x, filter_y, height, 4);
}
#define MC_COPY(WIDTH)                                                        \
void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst,                 \
                                                const uint8_t *src,           \
                                                ptrdiff_t src_stride,         \
                                                int height,                   \
                                                intptr_t mx,                  \
                                                intptr_t my,                  \
                                                int width)                    \
{                                                                             \
    hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height);      \
}
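/*
 * Sketch of an expansion (the MC_COPY instantiation list is not shown in
 * this listing; MC_COPY(4) is assumed as an example).  It would generate:
 *
 *     void ff_hevc_put_hevc_pel_pixels4_8_msa(int16_t *dst,
 *                                             const uint8_t *src,
 *                                             ptrdiff_t src_stride,
 *                                             int height, intptr_t mx,
 *                                             intptr_t my, int width)
 *     {
 *         hevc_copy_4w_msa(src, src_stride, dst, MAX_PB_SIZE, height);
 *     }
 */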
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                              \
void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst,              \
                                                   const uint8_t *src,        \
                                                   ptrdiff_t src_stride,      \
                                                   int height,                \
                                                   intptr_t mx,               \
                                                   intptr_t my,               \
                                                   int width)                 \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR];                 \
                                                                              \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,               \
                                          MAX_PB_SIZE, filter, height);       \
}
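/*
 * Sketch of an expansion: the first instantiation below,
 * MC(qpel, h, 4, 8, hz, mx), would generate the 4-wide horizontal 8-tap
 * qpel entry point (parameter tail per the prototype above):
 *
 *     void ff_hevc_put_hevc_qpel_h4_8_msa(int16_t *dst, const uint8_t *src,
 *                                         ptrdiff_t src_stride, int height,
 *                                         intptr_t mx, intptr_t my, int width)
 *     {
 *         const int8_t *filter = ff_hevc_qpel_filters[mx];
 *
 *         hevc_hz_8t_4w_msa(src, src_stride, dst, MAX_PB_SIZE,
 *                           filter, height);
 *     }
 */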
MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);

MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);
MC(epel, h, 4, 4, hz, mx);
MC(epel, h, 6, 4, hz, mx);
MC(epel, h, 8, 4, hz, mx);
MC(epel, h, 12, 4, hz, mx);
MC(epel, h, 16, 4, hz, mx);
MC(epel, h, 24, 4, hz, mx);
MC(epel, h, 32, 4, hz, mx);

MC(epel, v, 4, 4, vt, my);
MC(epel, v, 6, 4, vt, my);
MC(epel, v, 8, 4, vt, my);
MC(epel, v, 12, 4, vt, my);
MC(epel, v, 16, 4, vt, my);
MC(epel, v, 24, 4, vt, my);
MC(epel, v, 32, 4, vt, my);
#define MC_HV(PEL, WIDTH, TAP)                                                \
void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst,                 \
                                                const uint8_t *src,           \
                                                ptrdiff_t src_stride,         \
                                                int height,                   \
                                                intptr_t mx,                  \
                                                intptr_t my,                  \
                                                int width)                    \
{                                                                             \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx];                     \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my];                     \
                                                                              \
    hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE,        \
                                    filter_x, filter_y, height);              \
}
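/*
 * Sketch of an expansion (the MC_HV instantiation list is not shown here;
 * an instantiation such as MC_HV(epel, 8, 4) is assumed).  It would
 * generate the 8-wide 4-tap horizontal+vertical epel entry point:
 *
 *     void ff_hevc_put_hevc_epel_hv8_8_msa(int16_t *dst, const uint8_t *src,
 *                                          ptrdiff_t src_stride, int height,
 *                                          intptr_t mx, intptr_t my,
 *                                          int width)
 *     {
 *         const int8_t *filter_x = ff_hevc_epel_filters[mx];
 *         const int8_t *filter_y = ff_hevc_epel_filters[my];
 *
 *         hevc_hv_4t_8w_msa(src, src_stride, dst, MAX_PB_SIZE,
 *                           filter_x, filter_y, height);
 *     }
 */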