static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
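/* Bi-prediction combine helpers. Each macro adds the 16-bit first-pass
 * prediction (in*) to the current 14-bit prediction (vec*), rounds by
 * rnd_val (7 throughout this file for 8-bit content) and clips the result
 * to [0, 255]. The _MAX_SATU variants perform the final clip with
 * max/saturate instructions, hence the name. */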
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                     \
                          vec0, vec1, vec2, vec3, rnd_val,        \
                          out0, out1, out2, out3)                 \
{                                                                 \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
}
#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,  \
                                   out0, out1)                     \
{                                                                  \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                 \
    SRARI_H2_SH(out0, out1, rnd_val);                              \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                           \
}
#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                   vec3, rnd_val, out0, out1, out2, out3)  \
{                                                                          \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
}
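/* A minimal scalar model of the combine step above (a sketch, not part of
 * the original file). For each 16-bit lane: */
static inline int hevc_bi_rnd_clip_model(int in, int vec, int rnd_val)
{
    int sum = vec + in;                            /* ADDS_SH2_SH (saturating) */
    sum = (sum + (1 << (rnd_val - 1))) >> rnd_val; /* SRARI_H2_SH: round+shift */
    return sum < 0 ? 0 : (sum > 255 ? 255 : sum);  /* CLIP_SH2_0_255           */
}

/* The fragments that follow are the unfiltered "bi copy" paths, one per
 * block width, which combine a plain copy of the second reference with the
 * 16-bit first-pass prediction from src1_ptr. */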
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 = __msa_srari_h(dst0, 7);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            ST4x8_UB(dst0, dst1, dst, dst_stride);
            dst += (8 * dst_stride);
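/* The copy paths widen source bytes to 16 bits by interleaving against a
 * zero vector (ILVR_B) and shift left by 6 (SLLI_4V) so the copied pixels
 * match the 14-bit intermediate precision of the first-pass prediction
 * before the round-and-clip macros are applied. */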
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST6x4_UB(out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    uint32_t loop_cnt;
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    } else if (4 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        ST8x4_UB(out0, out1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
                   in7);
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
                                       dst7, 7, dst4, dst5, dst6, dst7);
            ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
            dst += (8 * dst_stride);
    uint32_t loop_cnt;
    v16u8 out0, out1, out2;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
                   dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
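/* For 16-wide and wider blocks each byte row is split into right/left
 * 8-byte halves (ILVR_B/ILVL_B against zero, the _r/_l register suffixes),
 * processed as two independent v8i16 lanes and re-packed with PCKEV_B
 * before the store. */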
    v16u8 out0, out1, out2, out3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        ST_UB2(out0, out1, dst, 16);
        dst += dst_stride;
        ST_UB2(out2, out3, dst, 16);
        dst += dst_stride;
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;

        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        dst += dst_stride;
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
        dst += dst_stride;
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
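/* 8-tap (qpel) horizontal paths follow. VSHF_B gathers the sliding 8-pixel
 * windows selected by the mask vectors and DPADD_SB accumulates one signed
 * byte tap pair per step, so four VSHF/DPADD rounds evaluate the full
 * filter. A scalar reference for one output pixel (a sketch, not part of
 * the original file; 'filter' holds the eight signed taps):
 *
 *     sum = 0;
 *     for (k = 0; k < 8; k++)
 *         sum += filter[k] * src[x + k - 3];            // 14-bit value
 *     out = clip_0_255((sum + pred[x] + 64) >> 7);      // bi combine
 *
 * Source bytes are flipped to signed range (XOR 128) before filtering;
 * const_vec = 128 << 6 seeded into the accumulators compensates for that
 * bias, since the filter taps sum to 64. */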
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;

        dst0 = const_vec;
        dst1 = const_vec;
        dst2 = const_vec;
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);

        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        dst2 = CLIP_SH_0_255(dst2);

        PCKEV_B2_SH(dst1, dst0, dst1, dst2, dst0, dst1);
        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2;
    v8i16 filter_vec, const_vec;

    src0_ptr = src0_ptr - 3;
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
                     dst1, dst2, dst0);
        VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
                     dst2, dst0, dst1);
        VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
                     dst0, dst1, dst2);

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        dst2 = CLIP_SH_0_255(dst2);

        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
        ST_SB(tmp0, dst);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        SD(dst_val0, dst + 16);
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB(tmp0, dst);
        ST_SB(tmp1, dst + 16);

        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
                     dst5, dst4, dst5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
                     dst5, dst4, dst5);

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SB2(src0_ptr + 32, 16, src3, src4);
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);

        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
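/* 8-tap vertical paths follow. Instead of shuffles, vertically adjacent
 * rows are interleaved (ILVR_B/ILVL_B) into row-pair vectors such as
 * src10_r, src32_r, ...; each DPADD step then consumes one row pair per
 * filter tap pair. At the end of every iteration the interleave registers
 * are rotated so the last seven rows serve as filter history for the next
 * block of rows. */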
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);

        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST4x8_UB(dst10, dst54, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
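/* For 4-wide columns, two interleaved row pairs are further merged with
 * ILVR_D (src2110, src4332, ...), so one 16-byte register carries four
 * source rows and a single DPADD_SB4_SH sequence filters eight output rows
 * per loop iteration. */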
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);

        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);

        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
        dst += (4 * dst_stride);
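/* Generic wide vertical path: the outer loop walks the width 16 columns at
 * a time (cnt = width >> 4), and the inner loop produces two rows per pass
 * from separate right/left interleave pipelines. */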
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);

        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);

            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);

            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);

            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);

    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 16);
    hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
                         dst + 16, dst_stride, filter, height);

    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 32);

    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 48);

    hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                   dst, dst_stride, filter, height, 64);
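/* 8-tap HV (2-D separable) paths follow: a horizontal 8-tap pass first
 * produces 16-bit intermediates (dst30..dst108 and the _r/_l interleaves),
 * which are fed to a vertical 8-tap evaluated at 32-bit precision
 * (HEVC_FILT_8TAP), shifted right by 6, combined with the first-pass
 * prediction and rounded by 7 as elsewhere. */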
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 out0, out1;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
        SRARI_H2_SH(out0, out1, 7);
        CLIP_SH2_0_255(out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, tmp;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;

    src0_ptr -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src0_ptr_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src0_ptr_tmp += src_stride;

            in0 = LD_SH(src1_ptr_tmp);
            src1_ptr_tmp += src2_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);

            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            tmp = __msa_adds_s_h(tmp, in0);
            tmp = __msa_srari_h(tmp, 7);
            tmp = CLIP_SH_0_255(tmp);
            out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
            ST8x1_UB(out, dst_tmp);
            dst_tmp += dst_stride;
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;

    src0_ptr -= ((3 * src_stride) + 3);

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
           src6);
    src0_ptr_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src0_ptr_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src0_ptr_tmp += src_stride;

        in0 = LD_SH(src1_ptr_tmp);
        src1_ptr_tmp += src2_stride;

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        tmp = __msa_adds_s_h(tmp, in0);
        tmp = __msa_srari_h(tmp, 7);
        tmp = CLIP_SH_0_255(tmp);
        out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
        ST8x1_UB(out, dst_tmp);
        dst_tmp += dst_stride;
    }

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src0_ptr += (7 * src_stride);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
               vec12, vec13, vec14, vec15);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in0);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        INSERT_D2_SH(tp0, tp1, in1);
        src1_ptr += (2 * src2_stride);

        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);

        dst76 = __msa_ilvr_h(dst97, dst66);
        ILVRL_H2_SH(dst108, dst97, dst87, dst109);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);

        tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);

        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
        SRARI_H2_SH(out0, out1, 7);
        CLIP_SH2_0_255(out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);

    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);

    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);

    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 48);

    hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 64);
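/* 4-tap (epel, chroma) paths follow: the same VSHF_B/DPADD_SB structure as
 * the 8-tap filters but with a single tap pair per step (filt0, filt1) and
 * narrower source windows, split into fixed-height specializations
 * (x2/x4/x6/x8) plus multiple-of-4 loop variants. */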
    v16i8 src0, src1, dst0, vec0, vec1;
    v8i16 in0, in1, tmp0;
    v8i16 filt0, filt1;
    v16i8 mask1;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    tmp0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
    tmp0 = __msa_adds_s_h(tmp0, in0);
    tmp0 = __msa_srari_h(tmp0, 7);

    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
    v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_D2_SH(in1, in0, in3, in2, in0, in1);

    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
                 tmp0, tmp1);
    HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);

    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);

        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
                     tmp1, tmp2, tmp3);
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
                     tmp1, tmp2, tmp3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);

        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
    if (2 == height) {
        hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (4 == height) {
        hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (8 == height || 16 == height) {
        hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
    v8i16 in0, in1, in2, in3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST6x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB2(src0_ptr, src_stride, src0, src1);
    LD_SH2(src1_ptr, src2_stride, in0, in1);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
                 dst0, dst1);
    HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);

    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);

    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
                 dst2, dst3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
                 dst2, dst3);

    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
                 dst4, dst5);

    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

    PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST8x4_UB(dst0, dst1, dst, dst_stride);
    dst += (4 * dst_stride);
    v8i16 in0, in1, in2, in3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
    if (2 == height) {
        hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (6 == height) {
        hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (0 == (height % 4)) {
        hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask2 = {
        8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
    };
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
        dst += (4 * dst_stride);
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
    v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src0, src2);
        LD_SB2(src0_ptr + 8, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        src1_ptr += (2 * src2_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    dst_tmp = dst + 16;
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
                     dst5, dst6, dst7);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
                     dst5, dst6, dst7);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);

        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);

        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    v8i16 in0, in1, in2, in3;
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, 16);
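/* 4-tap vertical paths follow: three rows of filter history are kept in the
 * src10_r/src21_r interleaves (merged into src2110 for narrow blocks), and
 * each iteration loads the next rows, interleaves them against the history
 * and applies the two tap pairs with DPADD_SB2_SH on top of the const_vec
 * bias seed. */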
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v8i16 in0, in1, dst10;
    v8i16 filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);

    dst10 = const_vec;
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);

    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);

    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
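
/* Width-4 vertical variant for heights that are multiples of 8
 * (hevc_vt_bi_4t_4x8multiple_msa): eight rows per loop iteration; the last
 * interleaved row pair is recycled as context for the next iteration. */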
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);

        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);

        LD_SB2(src0_ptr, src_stride, src9, src2);
        src0_ptr += (2 * src_stride);
        ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);

        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST4x8_UB(dst10, dst54, dst, dst_stride);
        dst += (8 * dst_stride);
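
/* Width-4 vertical dispatcher (hevc_vt_bi_4t_4w_msa). The height == 2
 * condition and the callee heads below are implied by the specialized
 * helpers above and mirror the hv width-4 dispatcher further down. */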
static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    if (2 == height) {
        hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (4 == height) {
        hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (0 == (height % 8)) {
        hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}
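
/* Width-6 vertical 4-tap bi-prediction (hevc_vt_bi_4t_6w_msa): computed as
 * an 8-wide filter, but ST6x4_UB stores only the leftmost 6 bytes per row. */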
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src5, src6);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src7, src8);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src9, src10);
    src0_ptr += (2 * src_stride);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);

    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);

    DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);

    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
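
/* Width-8 vertical bi-prediction, height 2 (hevc_vt_bi_4t_8x2_msa): a
 * single unrolled step, no loop. */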
    v16i8 src0, src1, src2, src3, src4;
    v8i16 in0, in1, dst0_r, dst1_r;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);

    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
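
/* Width-8 vertical bi-prediction, height 6 (hevc_vt_bi_4t_8x6_msa): six
 * output rows from nine input rows, fully unrolled. */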
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);

    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);

    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
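
/* Width-8 vertical bi-prediction for heights that are multiples of 4
 * (hevc_vt_bi_4t_8x4multiple_msa). */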
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
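
/* Width-8 vertical dispatcher (hevc_vt_bi_4t_8w_msa); the height == 2
 * branch and the final else are implied by the helpers above. */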
static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    if (2 == height) {
        hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else if (6 == height) {
        hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter, height);
    } else {
        hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride, filter, height);
    }
}
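
/* Width-12 vertical bi-prediction (hevc_vt_bi_4t_12w_msa): the left 8
 * columns use the right-interleaved row pairs; the remaining 4 columns come
 * from the upper interleave halves packed two rows per vector
 * (src2110/src4332/src6554). */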
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
    v16i8 src2110, src4332, src6554;
    v8i16 dst0_l, dst1_l, filt0, filt1;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (1 * src_stride);

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SB2(src0_ptr, src_stride, src5, src6);
        src0_ptr += (2 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
        ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
        ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
        src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);

        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
        dst += (4 * dst_stride);
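
/* Width-16 vertical bi-prediction (hevc_vt_bi_4t_16w_msa): right and left
 * byte-interleaves of each row pair feed two parallel 8-wide filters. */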
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        src1_ptr += (2 * src2_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
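
/* Width-24 vertical bi-prediction (hevc_vt_bi_4t_24w_msa): a 16-wide part
 * (both interleave halves) plus an 8-wide part loaded from column 16. */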
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    /* 16-wide part */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* 8-wide part, columns 16..23 */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr, src_stride, src5, src2);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        src1_ptr += (2 * src2_stride);

        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        LD_SB2(src0_ptr + 16, src_stride, src11, src8);
        src0_ptr += (2 * src_stride);

        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst2_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);
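
/* Width-32 vertical bi-prediction (hevc_vt_bi_4t_32w_msa): two independent
 * 16-wide halves; the right half is written through dst_tmp. */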
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;

    const_vec = __msa_ldi_h(128);

    filter_vec = LD_SH(filter);

    /* left 16 columns */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* right 16 columns */
    LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
    src0_ptr += (3 * src_stride);

    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, src_stride, src3, src4);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
        LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
        LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
        src1_ptr += (2 * src2_stride);

        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst0_l, dst1_l, 7,
                          dst0_r, dst1_r, dst0_l, dst1_l);

        PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
        ST_SH2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);

        LD_SB2(src0_ptr + 16, src_stride, src9, src10);
        src0_ptr += (2 * src_stride);

        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst2_r, dst3_r, dst2_l, dst3_l, 7,
                          dst2_r, dst3_r, dst2_l, dst3_l);

        PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
        ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
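
/* The hv variants below run a 4-tap horizontal filter into 16-bit
 * intermediates, filter those vertically with filt_h0/filt_h1, normalize
 * with >> 6, then do the usual bi blend: add the 16-bit reference rows,
 * round by 7, and clip to 0..255. First the 4x2 case. */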
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);

    LD2(src1_ptr, src2_stride, tp0, tp1);

    in0 = __msa_adds_s_h(in0, const_vec);

    VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);

    tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
    tmp = __msa_adds_s_h(tmp, in0);
    tmp = __msa_srari_h(tmp, 7);
    out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
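
/* hv 4x4 (hevc_hv_bi_4t_4x4_msa): seven input rows yield the four filtered
 * rows; pairs of rows share shuffle vectors (src0/src3 through src3/src6). */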
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 dst30, dst41, dst52, dst63;
    v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);

    const_vec = __msa_ldi_h(128);

    LD2(src1_ptr, src2_stride, tp0, tp1);
    src1_ptr += 2 * src2_stride;
    LD2(src1_ptr, src2_stride, tp0, tp1);

    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);

    VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);

    SRA_4V(dst0, dst1, dst2, dst3, 6);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
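
/* hv width-4 variant for heights that are multiples of 8
 * (hevc_hv_bi_4t_4multx8mult_msa). */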
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst98_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
                    tmp2, tmp3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
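
/* Width-4 hv dispatcher (hevc_hv_bi_4t_4w_msa); the height == 2 branch is
 * implied by the 4x2 helper above. */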
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y, int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (0 == (height % 8)) {
        hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height);
    }
}
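
/* Width-6 hv bi-prediction (hevc_hv_bi_4t_6w_msa): an 8-wide filter pass
 * whose left 4 columns are stored with ST4x8 and whose middle 2 columns are
 * stored with ST2x4 at dst + 4. */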
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    uint32_t tpw0, tpw1, tpw2, tpw3;
    uint64_t tp0, tp1;
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
    v8i16 dsth10, tmp4, tmp5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
    v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
    v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 in4 = { 0 }, in5 = { 0 };

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    LD_SB8(src0_ptr, src_stride,
           src3, src4, src5, src6, src7, src8, src9, src10);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);

    PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
    PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
    dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);

    dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
    SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
    SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
    SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
    PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
    PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);

    LD2(src1_ptr, src2_stride, tp0, tp1);

    LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);

    LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);

    LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);

    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
                in0, in1, in2, in3);
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
                tmp3);

    ST4x8_UB(out0, out1, dst, dst_stride);

    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
    src1_ptr += (4 * src2_stride);

    LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);

    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);

    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST2x4_UB(out2, 0, dst + 4, dst_stride);
    dst += 4 * dst_stride;
    ST2x4_UB(out2, 4, dst + 4, dst_stride);
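
/* Width-8 hv bi-prediction, height 2 (hevc_hv_bi_4t_8x2_msa). */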
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);

    LD_SH2(src1_ptr, src2_stride, in0, in1);
    ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);

    out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
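
/* hevc_hv_bi_4t_8multx4_msa: processes width8mult strips of 8 columns, 4
 * rows each; called with width8mult = 1 for 8x4 and 2 for 16x4. */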
static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
                                      int32_t src_stride,
                                      int16_t *src1_ptr,
                                      int32_t src2_stride,
                                      uint8_t *dst,
                                      int32_t dst_stride,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
                                      int32_t width8mult)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v8i16 in0, in1, in2, in3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);

        ST8x4_UB(out0, out1, dst, dst_stride);
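
/* hv 8x6 (hevc_hv_bi_4t_8x6_msa): nine input rows (LD_SB5 + LD_SB4) give
 * six output rows; fully unrolled, using 18 shuffle vectors. */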
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
    src0_ptr += (5 * src_stride);
    LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);

    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
                in0, in1, in2, in3);
    ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
                tmp0, tmp1, tmp2, tmp3);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                tmp0, tmp1, tmp2, tmp3);

    out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    ST8x4_UB(out0, out1, dst, dst_stride);
    dst += (4 * dst_stride);
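
/* hevc_hv_bi_4t_8multx4mult_msa: the generic strip loop behind the 8-, 16-,
 * 24-, and 32-wide hv wrappers; the outer loop walks 8-column strips
 * (width >> 3), the inner loop produces 4 rows per pass. */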
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
                                          int32_t src_stride,
                                          int16_t *src1_ptr,
                                          int32_t src2_stride,
                                          uint8_t *dst,
                                          int32_t dst_stride,
                                          const int8_t *filter_x,
                                          const int8_t *filter_y,
                                          int32_t height,
                                          int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 in0, in1, in2, in3;
    v8i16 filt_h0, filt_h1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        dst_tmp = dst;
        src1_ptr_tmp = src1_ptr;

        LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
        src0_ptr_tmp += (3 * src_stride);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
            src0_ptr_tmp += (4 * src_stride);
            LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
            src1_ptr_tmp += (4 * src2_stride);

            ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                        const_vec, in0, in1, in2, in3);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                        dst3_r, tmp0, tmp1, tmp2, tmp3);
            ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                        tmp0, tmp1, tmp2, tmp3);

            ST8x4_UB(out0, out1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
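
/* Width-8 hv dispatcher (hevc_hv_bi_4t_8w_msa); the height == 2 branch and
 * the final else are inferred from the helpers above. */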
static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                 int16_t *src1_ptr, int32_t src2_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y, int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride, dst, dst_stride,
                                      filter_x, filter_y, height, 8);
    }
}
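
/* Width-12 hv bi-prediction (hevc_hv_bi_4t_12w_msa): the 8-wide left half
 * runs four 4-row passes (mask0/mask1), then the 4-wide right columns run
 * two 8-row passes with the paired-row shuffles (mask2/mask3). */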
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    uint64_t tp0, tp1;
    uint8_t *src0_ptr_tmp, *dst_tmp;
    int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src0_ptr -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);

    filter_vec = LD_SH(filter_y);

    const_vec = __msa_ldi_h(128);

    /* 8-wide left half */
    src0_ptr_tmp = src0_ptr;
    dst_tmp = dst;
    src1_ptr_tmp = src1_ptr;

    LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
    src0_ptr_tmp += (3 * src_stride);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
        src0_ptr_tmp += (4 * src_stride);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);

        ST8x4_UB(out0, out1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
    }

    /* 4-wide right columns */
    LD_SB3(src0_ptr, src_stride, src0, src1, src2);
    src0_ptr += (3 * src_stride);

    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src0_ptr += (8 * src_stride);

        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst32_r = __msa_ilvr_h(dst73, dst22);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += 2 * src2_stride;

        ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
                    const_vec, in0, in1, in2, in3);

        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
                    tmp0, tmp1, tmp2, tmp3);

        ST4x8_UB(out0, out1, dst, dst_stride);
        dst += (8 * dst_stride);

        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
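
/* Thin wrappers mapping the 16-, 24-, and 32-wide hv cases onto the generic
 * 8-column-strip worker above. */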
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y, int32_t height)
{
    if (4 == height) {
        hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y, 2);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
                                      src2_stride, dst, dst_stride, filter_x,
                                      filter_y, height, 16);
    }
}

static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y, int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}

static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride,
                                  int16_t *src1_ptr, int32_t src2_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y, int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}
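
/* Glue macros binding the ff_hevc_put_hevc_bi_* entry points used by
 * hevcdsp to the width-specialized workers above; MAX_PB_SIZE is the stride
 * of the 16-bit intermediate buffer. */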
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}
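
/* For example, BI_MC(qpel, h, 4, 8, hz, mx) expands to
 * ff_hevc_put_hevc_bi_qpel_h4_8_msa(), which calls hevc_hz_bi_8t_4w_msa()
 * with the mx-selected 8-tap filter. */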
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int16_t *src_16bit,    \
                                                      int height,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,     \
                                             MAX_PB_SIZE, dst, dst_stride,   \
                                             filter, height);                \
}
BI_MC(qpel, h, 4, 8, hz, mx);
BI_MC(qpel, h, 8, 8, hz, mx);
BI_MC(qpel, h, 12, 8, hz, mx);
BI_MC(qpel, h, 16, 8, hz, mx);
BI_MC(qpel, h, 24, 8, hz, mx);
BI_MC(qpel, h, 32, 8, hz, mx);
BI_MC(qpel, h, 48, 8, hz, mx);
BI_MC(qpel, h, 64, 8, hz, mx);

BI_MC(qpel, v, 4, 8, vt, my);
BI_MC(qpel, v, 8, 8, vt, my);
BI_MC(qpel, v, 12, 8, vt, my);
BI_MC(qpel, v, 16, 8, vt, my);
BI_MC(qpel, v, 24, 8, vt, my);
BI_MC(qpel, v, 32, 8, vt, my);
BI_MC(qpel, v, 48, 8, vt, my);
BI_MC(qpel, v, 64, 8, vt, my);

BI_MC(epel, h, 4, 4, hz, mx);
BI_MC(epel, h, 8, 4, hz, mx);
BI_MC(epel, h, 6, 4, hz, mx);
BI_MC(epel, h, 12, 4, hz, mx);
BI_MC(epel, h, 16, 4, hz, mx);
BI_MC(epel, h, 24, 4, hz, mx);
BI_MC(epel, h, 32, 4, hz, mx);

BI_MC(epel, v, 4, 4, vt, my);
BI_MC(epel, v, 8, 4, vt, my);
BI_MC(epel, v, 6, 4, vt, my);
BI_MC(epel, v, 12, 4, vt, my);
BI_MC(epel, v, 16, 4, vt, my);
BI_MC(epel, v, 24, 4, vt, my);
BI_MC(epel, v, 32, 4, vt, my);
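
/* The hv entry points select two filters, one by mx and one by my. */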
#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
                                       MAX_PB_SIZE, dst, dst_stride,      \
                                       filter_x, filter_y, height);       \
}