    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
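/* Bi-prediction helpers.  'inN' holds the 16-bit inter prediction read from
 * src1_ptr, 'vecN' the locally computed (copied or filtered) prediction.
 * The pair is added with saturation, rounded by rnd_val bits and clipped to
 * the 8-bit pixel range. */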
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
{                                                                     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
}
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                       \
                          vec0, vec1, vec2, vec3, rnd_val,           \
                          out0, out1, out2, out3)                    \
{                                                                    \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);    \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);    \
}
#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,    \
                                   out0, out1)                       \
{                                                                    \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                   \
    SRARI_H2_SH(out0, out1, rnd_val);                                \
    CLIP_SH2_0_255(out0, out1);                                      \
}
#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,    \
                                   vec3, rnd_val, out0, out1, out2, out3)   \
{                                                                           \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1);  \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3);  \
}
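/* A scalar sketch of the per-pixel math above (assuming the local prediction
 * was shifted left by 6 beforehand, as the SLLI_4V calls below do):
 *
 *     out = av_clip_uint8((in + vec + (1 << (rnd_val - 1))) >> rnd_val);
 *
 * with rnd_val == 7 for all of the 8-bit paths in this file. */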
                                const int16_t *src1_ptr,
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        dst0 = (v8i16) __msa_ilvr_b(zero, src0);
        dst0 = __msa_srari_h(dst0, 7);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST_W2(dst0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
            dst += (8 * dst_stride);
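/* Width-6 bi-directional copy: each row is stored as one 4-byte word plus
 * one 2-byte halfword (the ST_W2/ST_H2 pairs below). */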
                                const int16_t *src1_ptr,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        ST_W2(out0, 0, 2, dst, dst_stride);
        ST_H2(out0, 2, 6, dst + 4, dst_stride);
        ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        ST_W2(out2, 0, 2, dst, dst_stride);
        ST_H2(out2, 2, 6, dst + 4, dst_stride);
        ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        if (res == 2) {
            ST_W2(out0, 0, 2, dst, dst_stride);
            ST_H2(out0, 2, 6, dst + 4, dst_stride);
        } else if (res == 4) {
            ST_W2(out0, 0, 2, dst, dst_stride);
            ST_H2(out0, 2, 6, dst + 4, dst_stride);
            ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
            ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        } else {
            ST_W2(out0, 0, 2, dst, dst_stride);
            ST_H2(out0, 2, 6, dst + 4, dst_stride);
            ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
            ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
            dst += (4 * dst_stride);
            ST_W2(out2, 0, 2, dst, dst_stride);
            ST_H2(out2, 2, 6, dst + 4, dst_stride);
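/* Width-8 bi-directional copy: rows are moved as 64-bit loads/stores, with
 * unrolled special cases for heights 2, 4 and 6. */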
                                const int16_t *src1_ptr,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_D2(out0, 0, 1, dst, dst_stride);
    } else if (4 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        LD2(src0_ptr, src_stride, tp0, tp1);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
    } else if (0 == height % 8) {
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
                   in7);
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
                                       dst7, 7, dst4, dst5, dst6, dst7);
            ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
            dst += (8 * dst_stride);
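/* Width-12 bi-directional copy: an 8-pixel store plus a 4-pixel store per
 * row. */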
                                 const int16_t *src1_ptr,
    v16u8 out0, out1, out2;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
                   dst2, dst3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
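/* The wider copy variants (16 through 64 columns) all follow the same
 * scheme: load, zero-extend to 16 bits, shift left by 6, add the coded
 * prediction, round/clip by 7 and pack back to bytes. */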
                                 const int16_t *src1_ptr,
    v16u8 out0, out1, out2, out3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v16u8 out0, out1, out2, out3, out4, out5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
        dst += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0_ptr += src_stride;
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        ST_UB2(out0, out1, dst, 16);
        ST_UB2(out2, out3, dst, 16);
                                 const int16_t *src1_ptr,
    v16u8 out0, out1, out2, out3, out4, out5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;
        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
                                 const int16_t *src1_ptr,
    v16u8 out0, out1, out2, out3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        ST_UB4(out0, out1, out2, out3, dst, 16);
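/* 8-tap (luma) horizontal bi-prediction.  Pixel neighbourhoods are gathered
 * with VSHF byte shuffles driven by the mask arrays, then filtered with one
 * DPADD_SB (signed-byte dot product accumulating into halfwords) step per
 * filter-tap pair. */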
                                const int16_t *src1_ptr,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;

    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        if (res == 2) {
            ST_W2(dst0, 0, 1, dst, dst_stride);
        } else if (res == 4) {
            ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
        } else {
            ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
            dst += (4 * dst_stride);
            ST_W2(dst1, 0, 1, dst, dst_stride);
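/* 8-tap horizontal, width 8: four rows per iteration, one shuffle/dot
 * product pass per filter-tap pair. */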
                                const int16_t *src1_ptr,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 8; loop_cnt--;) {
        src0_ptr += src_stride;
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
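/* 8-tap horizontal, width 16: two 8-pixel halves per row are filtered and
 * combined with the coded prediction exactly as in the width-8 case. */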
                                 const int16_t *src1_ptr,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0_ptr += src_stride;
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 filter_vec, const_vec;

    src0_ptr = src0_ptr - 3;
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
                     dst1, dst2, dst0);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
                     dst2, dst0, dst1);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
                     dst0, dst1, dst2);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        SD(dst_val0, dst + 16);
                                 const int16_t *src1_ptr,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SB2(tmp0, tmp1, dst, 16);
                                 const int16_t *src1_ptr,
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        src3 = LD_SB(src0_ptr + 40);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        ST_SB(tmp1, dst + 16);
        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
                     dst5, dst4, dst5);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
                     dst5, dst4, dst5);
        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
                                 const int16_t *src1_ptr,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr + 32, 16, src3, src4);
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);
        ST_SB2(tmp0, tmp1, dst, 16);
        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3, dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
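/* 8-tap (luma) vertical bi-prediction.  Consecutive rows are interleaved
 * with ILVR/ILVL so that each DPADD step covers two taps across the row
 * pairs; the interleave registers are rotated at the bottom of each loop. */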
                                const int16_t *src1_ptr,
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);
        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
        src4332 = src12111110;
        src6554 = src14131312;
        LD_SB8(src0_ptr, src_stride,
               src7, src8, src9, src10, src11, src12, src13, src14);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
                   src1413_r, src1312_r,
                   src8776, src10998, src12111110, src14131312);
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
        DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
                     filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
        DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
                     filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);
        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        if (res == 2) {
            ST_W2(dst10, 0, 1, dst, dst_stride);
        } else if (res == 4) {
            ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
        } else {
            ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
            dst += 4 * dst_stride;
            ST_W2(dst54, 0, 1, dst, dst_stride);
                                const int16_t *src1_ptr,
    v16i8 src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    src0_ptr += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_r, src32_r, src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
               src10_l, src32_l, src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
    ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
               src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
                   src76_l, src87_l, src98_l, src109_l);
        ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
        DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                     filt0, filt1, filt2, filt3,
                     dst0_r, dst0_r, dst0_r, dst0_r);
        DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                     filt0, filt1, filt2, filt3,
                     dst1_r, dst1_r, dst1_r, dst1_r);
        DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
                     filt0, filt1, filt2, filt3,
                     dst2_r, dst2_r, dst2_r, dst2_r);
        DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
                     filt0, filt1, filt2, filt3,
                     dst3_r, dst3_r, dst3_r, dst3_r);
        DPADD_SB4_SH(src2110, src4332, src6554, src8776,
                     filt0, filt1, filt2, filt3,
                     dst0_l, dst0_l, dst0_l, dst0_l);
        DPADD_SB4_SH(src4332, src6554, src8776, src10998,
                     filt0, filt1, filt2, filt3,
                     dst1_l, dst1_l, dst1_l, dst1_l);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0_r, dst1_r, dst2_r, dst3_r, 7,
                          dst0_r, dst1_r, dst2_r, dst3_r);
        PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
        ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
                                      const int16_t *src1_ptr,
    const uint8_t *src0_ptr_tmp;
    const int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r;
    v16i8 src10_l, src32_l, src54_l, src76_l;
    v16i8 src21_l, src43_l, src65_l, src87_l;
    v8i16 dst0_l, dst1_l;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filter_vec, const_vec;

    src0_ptr -= (3 * src_stride);
    const_vec = __msa_ldi_h(128);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_r, src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
                   src10_l, src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
            src0_ptr_tmp += (2 * src_stride);
            LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
            LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
            src1_ptr_tmp += (2 * src2_stride);
            ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
            ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
            DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
                         filt0, filt1, filt2, filt3,
                         dst0_r, dst0_r, dst0_r, dst0_r);
            DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
                         filt0, filt1, filt2, filt3,
                         dst1_r, dst1_r, dst1_r, dst1_r);
            DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
                         filt0, filt1, filt2, filt3,
                         dst0_l, dst0_l, dst0_l, dst0_l);
            DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
                         filt0, filt1, filt2, filt3,
                         dst1_l, dst1_l, dst1_l, dst1_l);
            HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                              dst0_r, dst1_r, dst0_l, dst1_l, 7,
                              dst0_r, dst1_r, dst0_l, dst1_l);
            PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
            ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);
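/* The wrappers below appear to route widths 16/24/32/48/64 through the
 * 16-column helper above with the matching width argument. */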
                                const int16_t *src1_ptr,
                                const int16_t *src1_ptr,
                                const int16_t *src1_ptr,
                                const int16_t *src1_ptr,
                                const int16_t *src1_ptr,
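/* 8-tap HV (2-D) bi-prediction.  A horizontal 8-tap pass produces 16-bit
 * intermediates, a vertical 8-tap pass filters those, and the result is
 * shifted right by 6 before being combined with the coded prediction as in
 * the 1-D cases. */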
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0 = { 0 }, in1 = { 0 };
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v4i32 dst0, dst1, dst2, dst3;

    src0_ptr -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
    const_vec = __msa_ldi_h(128);
    src0_ptr += (7 * src_stride);
    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        dst76 = __msa_ilvr_h(dst97, dst66);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);
        dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
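/* HV helper for widths that are multiples of 8; 'width >> 3' selects how
 * many 8-column stripes are processed. */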
                                     const int16_t *src1_ptr,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
    const uint8_t *src0_ptr_tmp;
    const int16_t *src1_ptr_tmp;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v4i32 dst0_r, dst0_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;

    src0_ptr -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    for (cnt = width >> 3; cnt--;) {
        src0_ptr_tmp = src0_ptr;
        src1_ptr_tmp = src1_ptr;
        LD_SB7(src0_ptr_tmp, src_stride,
               src0, src1, src2, src3, src4, src5, src6);
        src0_ptr_tmp += (7 * src_stride);
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);

        for (loop_cnt = height; loop_cnt--;) {
            src7 = LD_SB(src0_ptr_tmp);
            src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
            src0_ptr_tmp += src_stride;
            in0 = LD_SH(src1_ptr_tmp);
            src1_ptr_tmp += src2_stride;
            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
            tmp = __msa_srari_h(tmp, 7);
            out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
            dst_tmp += dst_stride;
                               const int16_t *src1_ptr,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               dst, dst_stride, filter_x, filter_y,
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
    const uint8_t *src0_ptr_tmp;
    const int16_t *src1_ptr_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
    v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;

    src0_ptr -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    filter_vec = LD_SH(filter_y);
    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
    src0_ptr_tmp = src0_ptr;
    src1_ptr_tmp = src1_ptr;
    src0_ptr_tmp += (7 * src_stride);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
               vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
               vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);

    for (loop_cnt = 16; loop_cnt--;) {
        src7 = LD_SB(src0_ptr_tmp);
        src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
        src0_ptr_tmp += src_stride;
        in0 = LD_SH(src1_ptr_tmp);
        src1_ptr_tmp += src2_stride;
        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
        tmp = __msa_srari_h(tmp, 7);
        out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
        dst_tmp += dst_stride;
    src0_ptr += (7 * src_stride);
    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
               vec12, vec13, vec14, vec15);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
        src0_ptr += (4 * src_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        LD2(src1_ptr, src2_stride, tp0, tp1);
        src1_ptr += (2 * src2_stride);
        VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        dst76 = __msa_ilvr_h(dst97, dst66);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
        dst98 = __msa_ilvr_h(dst66, dst108);
        tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
                              filt_h2, filt_h3);
        SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
        ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
        out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
        ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
                               dst, dst_stride, filter_x, filter_y,
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
                               dst, dst_stride, filter_x, filter_y,
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
                               dst, dst_stride, filter_x, filter_y,
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
                               dst, dst_stride, filter_x, filter_y,
                                const int16_t *src1_ptr,
                                const int8_t *filter_x,
                                const int8_t *filter_y,
                               dst, dst_stride, filter_x, filter_y,
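/* 4-tap (chroma) bi-prediction, horizontal filtering first.  The structure
 * mirrors the 8-tap paths, with only two filter-tap pairs per pixel. */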
                                 const int16_t *src1_ptr,
    v16i8 src0, src1, dst0, vec0, vec1;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    tmp0 = __msa_adds_s_h(tmp0, in0);
    tmp0 = __msa_srari_h(tmp0, 7);
    dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
    ST_W2(dst0, 0, 1, dst, dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
                 tmp0, tmp1);
    dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
    ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
                                       const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride,
               src0, src1, src2, src3, src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
                     tmp1, tmp2, tmp3);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
                     tmp1, tmp2, tmp3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
        ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
                               const int16_t *src1_ptr,
    } else if (4 == height) {
                                 src1_ptr, src2_stride,
                               const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_W2(dst0, 0, 2, dst, dst_stride);
        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
        ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
        ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
        dst += (4 * dst_stride);
    }
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST_W2(dst0, 0, 2, dst, dst_stride);
        ST_H2(dst0, 2, 6, dst + 4, dst_stride);
                                 const int16_t *src1_ptr,
    v16i8 mask1, vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
                 dst0, dst1);
    dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
    ST_D2(dst0, 0, 1, dst, dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    LD_SH2(src1_ptr, src2_stride, in4, in5);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
                 dst2, dst3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
                 dst2, dst3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
                 dst4, dst5);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
    dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
    ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
                                       const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
                               const int16_t *src1_ptr,
    } else if (6 == height) {
    } else if (0 == (height % 4)) {
                                     src1_ptr, src2_stride,
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12,
                    24, 25, 25, 26, 26, 27, 27, 28 };
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr + 8, src_stride, src1, src3);
        src0_ptr += (2 * src_stride);
        LD_SH2(src1_ptr, src2_stride, in0, in2);
        LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
        src1_ptr += (2 * src2_stride);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
                                 const int16_t *src1_ptr,
    const int16_t *src1_ptr_tmp;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);
    src1_ptr_tmp = src1_ptr + 16;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
        LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
        src1_ptr += (4 * src2_stride);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
                     dst5, dst6, dst7);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
                     dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
                          dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
        PCKEV_B4_SH(dst1, dst0, dst3, dst2,
                    dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
        ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
        src1_ptr_tmp += (4 * src2_stride);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 mask1, mask2, mask3;
    v8i16 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);

    for (loop_cnt = height; loop_cnt--;) {
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
                     dst1, dst2, dst3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
                     dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
        ST_SH2(dst0, dst1, dst, 16);
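/* 4-tap (chroma) vertical bi-prediction. */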
                                 const int16_t *src1_ptr,
    v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;
    const_vec = __msa_ldi_h(128);
    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    dst10 = __msa_adds_s_h(dst10, in0);
    dst10 = __msa_srari_h(dst10, 7);
    dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
    ST_W2(dst10, 0, 1, dst, dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, src6554;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;
    const_vec = __msa_ldi_h(128);
    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
    DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
    DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
    dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
    ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
                                       const int16_t *src1_ptr,
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16i8 src6, src7, src8, src9;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src2110, src4332, src6554, src8776;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;
    const_vec = __msa_ldi_h(128);
    src0_ptr += (3 * src_stride);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
        src0_ptr += (6 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
                   src4332, src6554, src8776);
        DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
        DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
        DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
        src0_ptr += (2 * src_stride);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst10, dst32, dst54, dst76, 7,
                          dst10, dst32, dst54, dst76);
        PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
        ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);
                               const int16_t *src1_ptr,
    } else if (4 == height) {
                                 src1_ptr, src2_stride,
                               const int16_t *src1_ptr,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v8i16 in0, in1, in2, in3;
    v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;
    const_vec = __msa_ldi_h(128);
    src0_ptr += (3 * src_stride);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src5, src6);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src7, src8);
    src0_ptr += (2 * src_stride);
    LD_SB2(src0_ptr, src_stride, src9, src10);
    src0_ptr += (2 * src_stride);
    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);

    LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
    src1_ptr += (4 * src2_stride);
    ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
    DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    ST_W2(dst0_r, 0, 2, dst, dst_stride);
    ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
    ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
    ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
    dst += (4 * dst_stride);
                                 const int16_t *src1_ptr,
    v8i16 in0, in1, dst0_r, dst1_r;
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;
    const_vec = __msa_ldi_h(128);
    src0_ptr += (3 * src_stride);
    LD_SB2(src0_ptr, src_stride, src3, src4);
    LD_SH2(src1_ptr, src2_stride, in0, in1);
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
    ST_D2(dst0_r, 0, 1, dst, dst_stride);
                                 const int16_t *src1_ptr,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 in0, in1, in2, in3, in4, in5;
    v16i8 src10_r, src32_r, src54_r, src76_r;
    v16i8 src21_r, src43_r, src65_r, src87_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v8i16 filter_vec, const_vec;

    src0_ptr -= src_stride;
    const_vec = __msa_ldi_h(128);
    src0_ptr += (3 * src_stride);
    LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
    LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
    ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
               src32_r, src43_r, src54_r, src65_r);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
    DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
    DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
    DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
    DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
    DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
    HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                      dst0_r, dst1_r, dst2_r, dst3_r, 7,
                      dst0_r, dst1_r, dst2_r, dst3_r);
    PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
    dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
    ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
3450 const int16_t *src1_ptr,
3459 v8i16 in0, in1, in2, in3;
3460 v16i8 src10_r, src32_r, src21_r, src43_r;
3461 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3463 v8i16 filter_vec, const_vec;
3465 src0_ptr -= src_stride;
3467 const_vec = __msa_ldi_h(128);
3474 src0_ptr += (3 * src_stride);
3478 for (loop_cnt = (height >> 2); loop_cnt--;) {
3479 LD_SB2(src0_ptr, src_stride, src3, src4);
3480 src0_ptr += (2 * src_stride);
3481 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3482 src1_ptr += (4 * src2_stride);
3487 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3489 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3492 src0_ptr += (2 * src_stride);
3497 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3499 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3501 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3502 dst0_r, dst1_r, dst2_r, dst3_r);
3504 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3505 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3506 dst += (4 * dst_stride);
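/* The 8w dispatcher below routes to the unrolled 8x2/8x6 kernels or to
 * the 8x4-multiple loop, depending on height. */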
3512 const int16_t *src1_ptr,
3522 } else if (6 == height) {
3527 src1_ptr, src2_stride,
3534 const int16_t *src1_ptr,
3543 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3544 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3545 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3546 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3547 v16i8 src2110, src4332, src6554;
3548 v8i16 dst0_l, dst1_l, filt0, filt1;
3549 v8i16 filter_vec, const_vec;
3551 src0_ptr -= (1 * src_stride);
3553 const_vec = __msa_ldi_h(128);
3560 src0_ptr += (3 * src_stride);
3564 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3566 for (loop_cnt = (height >> 2); loop_cnt--;) {
3567 LD_SB2(src0_ptr, src_stride, src3, src4);
3568 src0_ptr += (2 * src_stride);
3569 LD_SB2(src0_ptr, src_stride, src5, src6);
3570 src0_ptr += (2 * src_stride);
3571 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3572 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3573 src1_ptr += (4 * src2_stride);
3580 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3581 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3582 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
3583 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3586 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3588 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3590 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
3592 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3594 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3596 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
3598 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3599 dst0_r, dst1_r, dst2_r, dst3_r);
3602 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3603 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3604 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3605 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
3606 dst += (4 * dst_stride);
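/* For 16 columns, the right and left byte halves of each row pair are
 * interleaved separately (ILVR_B2_SB / ILVL_B2_SB), giving two
 * independent 8-lane accumulators per row; ST_SH2 then stores full
 * 16-byte rows. */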
3617 const int16_t *src1_ptr,
3626 v8i16 in0, in1, in2, in3;
3627 v16i8 src10_r, src32_r, src21_r, src43_r;
3628 v16i8 src10_l, src32_l, src21_l, src43_l;
3629 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3631 v8i16 filter_vec, const_vec;
3633 src0_ptr -= src_stride;
3635 const_vec = __msa_ldi_h(128);
3642 src0_ptr += (3 * src_stride);
3647 for (loop_cnt = (height >> 2); loop_cnt--;) {
3648 LD_SB2(src0_ptr, src_stride, src3, src4);
3649 src0_ptr += (2 * src_stride);
3650 LD_SH2(src1_ptr, src2_stride, in0, in1);
3651 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3652 src1_ptr += (2 * src2_stride);
3658 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3660 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3662 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3664 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3666 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3667 dst0_r, dst1_r, dst0_l, dst1_l);
3669 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3670 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3671 dst += (2 * dst_stride);
3674 src0_ptr += (2 * src_stride);
3675 LD_SH2(src1_ptr, src2_stride, in0, in1);
3676 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3677 src1_ptr += (2 * src2_stride);
3683 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3685 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3687 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3689 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3691 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3692 dst0_r, dst1_r, dst0_l, dst1_l);
3694 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3695 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3696 dst += (2 * dst_stride);
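/* 24 columns are handled as a 16-column block (right/left halves, as in
 * the 16w kernel) plus an extra 8-column strip loaded from
 * src0_ptr + 16, which only needs the right-half interleave chain. */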
3702 const int16_t *src1_ptr,
3711 v16i8 src6, src7, src8, src9, src10, src11;
3712 v8i16 in0, in1, in2, in3, in4, in5;
3713 v16i8 src10_r, src32_r, src76_r, src98_r;
3714 v16i8 src21_r, src43_r, src87_r, src109_r;
3715 v16i8 src10_l, src32_l, src21_l, src43_l;
3716 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3717 v8i16 dst0_l, dst1_l;
3719 v8i16 filter_vec, const_vec;
3721 src0_ptr -= src_stride;
3723 const_vec = __msa_ldi_h(128);
3735 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3736 src0_ptr += (3 * src_stride);
3738 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3740 for (loop_cnt = (height >> 2); loop_cnt--;) {
3742 LD_SB2(src0_ptr, src_stride, src3, src4);
3743 LD_SH2(src1_ptr, src2_stride, in0, in1);
3744 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3745 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3746 src1_ptr += (2 * src2_stride);
3751 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3752 src0_ptr += (2 * src_stride);
3754 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3759 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3761 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3763 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3766 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3768 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3771 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3772 dst0_r, dst1_r, dst0_l, dst1_l);
3776 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3777 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3778 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3779 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3780 dst += (2 * dst_stride);
3784 LD_SH2(src1_ptr, src2_stride, in0, in1);
3785 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3786 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3787 src1_ptr += (2 * src2_stride);
3792 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3793 src0_ptr += (2 * src_stride);
3795 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3798 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3800 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3802 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3804 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3807 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3809 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3812 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3813 dst0_r, dst1_r, dst0_l, dst1_l);
3816 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3817 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3818 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3819 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3820 dst += (2 * dst_stride);
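/* 32 columns reuse the 16-column recipe twice per row: the second
 * 16-byte half is loaded from src0_ptr + 16 and stored through
 * dst_tmp = dst + 16. */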
3826 const int16_t *src1_ptr,
3834 uint8_t *dst_tmp = dst + 16;
3835 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3836 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3837 v16i8 src10_r, src32_r, src76_r, src98_r;
3838 v16i8 src21_r, src43_r, src87_r, src109_r;
3839 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3840 v16i8 src10_l, src32_l, src76_l, src98_l;
3841 v16i8 src21_l, src43_l, src87_l, src109_l;
3842 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3844 v8i16 filter_vec, const_vec;
3846 src0_ptr -= src_stride;
3848 const_vec = __msa_ldi_h(128);
3861 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3862 src0_ptr += (3 * src_stride);
3864 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3865 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3867 for (loop_cnt = (height >> 1); loop_cnt--;) {
3869 LD_SB2(src0_ptr, src_stride, src3, src4);
3870 LD_SH2(src1_ptr, src2_stride, in0, in1);
3871 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3872 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3873 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3874 src1_ptr += (2 * src2_stride);
3880 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3882 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3884 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3886 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3889 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3890 dst0_r, dst1_r, dst0_l, dst1_l);
3898 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3899 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3900 dst += (2 * dst_stride);
3903 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3904 src0_ptr += (2 * src_stride);
3906 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3907 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3910 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3912 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3914 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3916 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3919 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3920 dst2_r, dst3_r, dst2_l, dst3_l);
3922 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3923 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3924 dst_tmp += (2 * dst_stride);
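/* The hv kernels below run the 4-tap filter in both directions:
 * VSHF_B2_SB gathers the horizontal taps for signed-byte dot products,
 * HEVC_FILT_4TAP filters the 16-bit row sums vertically in 32-bit
 * precision, and SRA_4V(..., 6) rescales before the bi blend.  Since
 * the source bytes are XORed with 128 before filtering, the first pass
 * carries a -8192 bias (128 scaled by the 64-unit filter gain); adding
 * const_vec (128 << 6) to the src1_ptr samples cancels it.  One lane,
 * sketched in scalar form under the same 8-bit assumptions as above:
 *
 *     hv     = vfilt4(hfilt4(src0)) >> 6;     // unbiased, 14-bit scale
 *     dst[x] = av_clip_uint8((hv + src1[x] + 64) >> 7);
 */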
3936 const int16_t *src1_ptr,
3940 const int8_t *filter_x,
3941 const int8_t *filter_y)
3948 v8i16 filt_h0, filt_h1;
3951 v8i16 filter_vec, const_vec;
3952 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3953 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
3956 src0_ptr -= (src_stride + 1);
3958 filter_vec = LD_SH(filter_x);
3961 filter_vec = LD_SH(filter_y);
3968 const_vec = __msa_ldi_h(128);
3974 LD2(src1_ptr, src2_stride, tp0, tp1);
3976 in0 = __msa_adds_s_h(in0, const_vec);
3993 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3994 tmp = __msa_adds_s_h(tmp, in0);
3995 tmp = __msa_srari_h(tmp, 7);
3997 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4003 const int16_t *src1_ptr,
4007 const int8_t *filter_x,
4008 const int8_t *filter_y)
4014 v8i16 filt_h0, filt_h1;
4017 v8i16 filter_vec, const_vec;
4018 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4020 v8i16 in0 = { 0 }, in1 = { 0 };
4021 v8i16 dst30, dst41, dst52, dst63;
4022 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4023 v4i32 dst0, dst1, dst2, dst3;
4025 src0_ptr -= (src_stride + 1);
4027 filter_vec = LD_SH(filter_x);
4030 filter_vec = LD_SH(filter_y);
4040 const_vec = __msa_ldi_h(128);
4043 LD2(src1_ptr, src2_stride, tp0, tp1);
4044 src1_ptr += 2 * src2_stride;
4046 LD2(src1_ptr, src2_stride, tp0, tp1);
4049 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
4054 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4068 SRA_4V(dst0, dst1, dst2, dst3, 6);
4073 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4074 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
4079 const int16_t *src1_ptr,
4083 const int8_t *filter_x,
4084 const int8_t *filter_y,
4090 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4092 v8i16 filt_h0, filt_h1;
4095 v8i16 filter_vec, const_vec;
4096 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4097 v8i16 tmp0, tmp1, tmp2, tmp3;
4098 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4099 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4100 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4101 v8i16 dst98_r, dst109_r;
4102 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4103 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4105 src0_ptr -= (src_stride + 1);
4107 filter_vec = LD_SH(filter_x);
4110 filter_vec = LD_SH(filter_y);
4117 const_vec = __msa_ldi_h(128);
4121 src0_ptr += (3 * src_stride);
4129 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4132 for (loop_cnt = height >> 3; loop_cnt--;) {
4133 LD_SB8(src0_ptr, src_stride,
4134 src3, src4, src5, src6, src7, src8, src9, src10);
4135 src0_ptr += (8 * src_stride);
4137 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4138 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4139 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4140 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4147 dst32_r = __msa_ilvr_h(dst73, dst22);
4151 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4152 dst76_r = __msa_ilvr_h(dst22, dst106);
4154 LD2(src1_ptr, src2_stride, tp0, tp1);
4155 src1_ptr += 2 * src2_stride;
4157 LD2(src1_ptr, src2_stride, tp0, tp1);
4158 src1_ptr += 2 * src2_stride;
4161 LD2(src1_ptr, src2_stride, tp0, tp1);
4162 src1_ptr += 2 * src2_stride;
4164 LD2(src1_ptr, src2_stride, tp0, tp1);
4165 src1_ptr += 2 * src2_stride;
4168 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4169 const_vec, in0, in1, in2, in3);
4178 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4179 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4181 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4182 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
4187 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4188 dst += (8 * dst_stride);
4192 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
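/* The 4w hv dispatcher below routes to the 4x2/4x4 one-shot kernels, or
 * to the x8-multiple loop above when height is a multiple of 8. */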
4198 const int16_t *src1_ptr,
4202 const int8_t *filter_x,
4203 const int8_t *filter_y,
4208 dst, dst_stride, filter_x, filter_y);
4209 } else if (4 == height) {
4211 dst, dst_stride, filter_x, filter_y);
4212 } else if (0 == (height % 8)) {
4214 src1_ptr, src2_stride,
4216 filter_x, filter_y, height);
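/* The 6w hv kernel computes full 8-wide rows, stores the left four
 * columns with ST_W8, and runs the high halves through a separate
 * packed chain (dst1021_l and friends) to produce columns 4..5 for
 * ST_H8. */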
4222 const int16_t *src1_ptr,
4226 const int8_t *filter_x,
4227 const int8_t *filter_y,
4230 uint32_t tpw0, tpw1, tpw2, tpw3;
4232 v16u8 out0, out1, out2;
4233 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4234 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4236 v8i16 filt_h0, filt_h1;
4239 v8i16 filter_vec, const_vec;
4240 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4241 v8i16 dsth10, tmp4, tmp5;
4242 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4243 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4244 v8i16 tmp0, tmp1, tmp2, tmp3;
4245 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4246 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4247 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
4248 v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
4249 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4250 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4251 v8i16 in4 = { 0 }, in5 = { 0 };
4253 src0_ptr -= (src_stride + 1);
4255 filter_vec = LD_SH(filter_x);
4258 filter_vec = LD_SH(filter_y);
4265 const_vec = __msa_ldi_h(128);
4269 src0_ptr += (3 * src_stride);
4283 LD_SB8(src0_ptr, src_stride,
4284 src3, src4, src5, src6, src7, src8, src9, src10);
4287 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4288 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4289 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4290 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4297 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4298 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4299 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4300 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4315 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4316 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4317 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4330 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4331 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4332 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4333 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4334 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4335 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4336 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4338 LD2(src1_ptr, src2_stride, tp0, tp1);
4340 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4343 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4345 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4348 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4349 in0, in1, in2, in3);
4350 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
4355 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4357 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4358 src1_ptr += (4 * src2_stride);
4360 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4362 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4366 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4367 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
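/* 8-wide hv variants follow: a fully unrolled 8x2, a multiple-of-4
 * helper that the 8w and 16w dispatchers call with width8mult strip
 * counts of 1 and 2, an unrolled 8x6, and the generic 8multx4mult loop
 * for the remaining sizes. */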
4372 const int16_t *src1_ptr,
4376 const int8_t *filter_x,
4377 const int8_t *filter_y)
4382 v8i16 filt_h0, filt_h1;
4385 v8i16 filter_vec, const_vec;
4386 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4387 v8i16 dst0, dst1, dst2, dst3, dst4;
4388 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4389 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4390 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4394 src0_ptr -= (src_stride + 1);
4396 filter_vec = LD_SH(filter_x);
4399 filter_vec = LD_SH(filter_y);
4406 const_vec = __msa_ldi_h(128);
4412 LD_SH2(src1_ptr, src2_stride, in0, in1);
4413 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
4418 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4419 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4435 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4436 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4440 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4446 const int16_t *src1_ptr,
4450 const int8_t *filter_x,
4451 const int8_t *filter_y,
4456 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4457 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4458 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4459 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4460 v8i16 in0, in1, in2, in3;
4461 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4462 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4463 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4465 src0_ptr -= (src_stride + 1);
4467 filter_vec = LD_SH(filter_x);
4470 filter_vec = LD_SH(filter_y);
4478 const_vec = __msa_ldi_h(128);
4481 for (cnt = width8mult; cnt--;) {
4486 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4488 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4489 const_vec, in0, in1, in2, in3);
4502 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4503 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4504 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4505 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4526 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4527 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4528 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4529 dst3_r, tmp0, tmp1, tmp2, tmp3);
4530 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4531 tmp0, tmp1, tmp2, tmp3);
4535 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4542 const int16_t *src1_ptr,
4546 const int8_t *filter_x,
4547 const int8_t *filter_y)
4549 v16u8 out0, out1, out2;
4550 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4551 v8i16 in0, in1, in2, in3, in4, in5;
4553 v8i16 filt_h0, filt_h1;
4556 v8i16 filter_vec, const_vec;
4557 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4558 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4559 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4560 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4561 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4562 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4563 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4564 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4565 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4566 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4568 src0_ptr -= (src_stride + 1);
4570 filter_vec = LD_SH(filter_x);
4573 filter_vec = LD_SH(filter_y);
4580 const_vec = __msa_ldi_h(128);
4584 src0_ptr += (5 * src_stride);
4585 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
4590 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4591 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4592 in0, in1, in2, in3);
4593 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4598 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4599 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4600 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4601 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4602 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4603 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4637 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4638 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4639 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4640 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4641 tmp0, tmp1, tmp2, tmp3);
4642 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
4643 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4644 tmp0, tmp1, tmp2, tmp3);
4651 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4652 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4653 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
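/* The generic 8multx4mult kernel below walks the block in 8-column
 * strips (cnt = width >> 3), resetting src0_ptr_tmp, src1_ptr_tmp and
 * dst_tmp for each strip and covering the height four rows at a time. */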
4658 const int16_t *src1_ptr,
4662 const int8_t *filter_x,
4663 const int8_t *filter_y,
4667 uint32_t loop_cnt, cnt;
4668 const uint8_t *src0_ptr_tmp;
4669 const int16_t *src1_ptr_tmp;
4673 v8i16 in0, in1, in2, in3;
4675 v8i16 filt_h0, filt_h1;
4678 v8i16 filter_vec, const_vec;
4679 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4680 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4681 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4682 v8i16 tmp0, tmp1, tmp2, tmp3;
4683 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4684 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4685 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
4687 src0_ptr -= (src_stride + 1);
4689 filter_vec = LD_SH(filter_x);
4692 filter_vec = LD_SH(filter_y);
4699 const_vec = __msa_ldi_h(128);
4702 for (cnt = width >> 3; cnt--;) {
4703 src0_ptr_tmp = src0_ptr;
4705 src1_ptr_tmp = src1_ptr;
4708 src0_ptr_tmp += (3 * src_stride);
4722 for (loop_cnt = height >> 2; loop_cnt--;) {
4723 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4724 src0_ptr_tmp += (4 * src_stride);
4725 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4726 src1_ptr_tmp += (4 * src2_stride);
4729 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4730 const_vec, in0, in1, in2, in3);
4732 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4733 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4734 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4735 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4756 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4757 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4758 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4759 dst3_r, tmp0, tmp1, tmp2, tmp3);
4760 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4761 tmp0, tmp1, tmp2, tmp3);
4765 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4766 dst_tmp += (4 * dst_stride);
4783 const int16_t *src1_ptr,
4787 const int8_t *filter_x,
4788 const int8_t *filter_y,
4793 dst, dst_stride, filter_x, filter_y);
4794 } else if (4 == height) {
4796 dst, dst_stride, filter_x, filter_y, 1);
4797 } else if (6 == height) {
4799 dst, dst_stride, filter_x, filter_y);
4802 src1_ptr, src2_stride,
4804 filter_x, filter_y, height, 8);
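/* 12w hv: the left eight columns go through the usual 8-column strip
 * logic (for (loop_cnt = 4; ...), four rows per pass), after which the
 * remaining four columns are filtered with the mask2/mask3 variant of
 * the 4-wide x8-multiple flow (two passes of eight rows). */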
4810 const int16_t *src1_ptr,
4814 const int8_t *filter_x,
4815 const int8_t *filter_y,
4820 const uint8_t *src0_ptr_tmp;
4822 const int16_t *src1_ptr_tmp;
4824 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4825 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4826 v16i8 mask0, mask1, mask2, mask3;
4827 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4828 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
4829 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4830 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
4831 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4832 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4833 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4834 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4835 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4837 src0_ptr -= (src_stride + 1);
4839 filter_vec = LD_SH(filter_x);
4842 filter_vec = LD_SH(filter_y);
4850 const_vec = __msa_ldi_h(128);
4853 src0_ptr_tmp = src0_ptr;
4855 src1_ptr_tmp = src1_ptr;
4858 src0_ptr_tmp += (3 * src_stride);
4873 for (loop_cnt = 4; loop_cnt--;) {
4874 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4875 src0_ptr_tmp += (4 * src_stride);
4878 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4879 src1_ptr_tmp += (4 * src2_stride);
4880 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4881 const_vec, in0, in1, in2, in3);
4883 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4884 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4885 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4886 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4907 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4908 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4909 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4910 dst3_r, tmp0, tmp1, tmp2, tmp3);
4911 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4912 tmp0, tmp1, tmp2, tmp3);
4916 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4917 dst_tmp += (4 * dst_stride);
4934 src0_ptr += (3 * src_stride);
4943 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4945 for (loop_cnt = 2; loop_cnt--;) {
4946 LD_SB8(src0_ptr, src_stride,
4947 src3, src4, src5, src6, src7, src8, src9, src10);
4948 src0_ptr += (8 * src_stride);
4950 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4951 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4952 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4953 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4960 dst32_r = __msa_ilvr_h(dst73, dst22);
4964 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4965 dst76_r = __msa_ilvr_h(dst22, dst106);
4967 LD2(src1_ptr, src2_stride, tp0, tp1);
4968 src1_ptr += 2 * src2_stride;
4970 LD2(src1_ptr, src2_stride, tp0, tp1);
4971 src1_ptr += 2 * src2_stride;
4974 LD2(src1_ptr, src2_stride, tp0, tp1);
4975 src1_ptr += 2 * src2_stride;
4977 LD2(src1_ptr, src2_stride, tp0, tp1);
4978 src1_ptr += 2 * src2_stride;
4981 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4982 const_vec, in0, in1, in2, in3);
4993 SRA_4V(dst0, dst1, dst2, dst3, 6);
4994 SRA_4V(dst4, dst5, dst6, dst7, 6);
4995 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4996 tmp0, tmp1, tmp2, tmp3);
4997 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4998 tmp0, tmp1, tmp2, tmp3);
5002 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
5003 dst += (8 * dst_stride);
5007 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
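/* The 16/24/32-wide hv wrappers below forward to the 8-column strip
 * kernels with the matching width or strip count. */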
5013 const int16_t *src1_ptr,
5017 const int8_t *filter_x,
5018 const int8_t *filter_y,
5023 dst, dst_stride, filter_x, filter_y, 2);
5026 src2_stride, dst, dst_stride, filter_x,
5033 const int16_t *src1_ptr,
5037 const int8_t *filter_x,
5038 const int8_t *filter_y,
5042 dst, dst_stride, filter_x, filter_y,
5048 const int16_t *src1_ptr,
5052 const int8_t *filter_x,
5053 const int8_t *filter_y,
5057 dst, dst_stride, filter_x, filter_y,
5061 #define BI_MC_COPY(WIDTH) \
5062 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5063 ptrdiff_t dst_stride, \
5064 const uint8_t *src, \
5065 ptrdiff_t src_stride, \
5066 const int16_t *src_16bit, \
5072 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5073 dst, dst_stride, height); \
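/* Illustrative expansion (a sketch, not generated output): BI_MC_COPY(4)
 * defines ff_hevc_put_hevc_bi_pel_pixels4_8_msa(), whose body is a
 * single call to hevc_bi_copy_4w_msa(src, src_stride, src_16bit,
 * MAX_PB_SIZE, dst, dst_stride, height). */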
5088 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5089 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5090 ptrdiff_t dst_stride, \
5091 const uint8_t *src, \
5092 ptrdiff_t src_stride, \
5093 const int16_t *src_16bit, \
5099 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5101 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5102 MAX_PB_SIZE, dst, dst_stride, \
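/* In BI_MC, DIR names the public entry point (h or v), DIR1 the
 * internal implementation prefix (hz or vt), and FILT_DIR whichever of
 * mx/my indexes ff_hevc_##PEL##_filters[] for that direction. */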
5106 BI_MC(qpel, h, 4, 8, hz, mx);
5107 BI_MC(qpel, h, 8, 8, hz, mx);
5108 BI_MC(qpel, h, 12, 8, hz, mx);
5109 BI_MC(qpel, h, 16, 8, hz, mx);
5110 BI_MC(qpel, h, 24, 8, hz, mx);
5111 BI_MC(qpel, h, 32, 8, hz, mx);
5112 BI_MC(qpel, h, 48, 8, hz, mx);
5113 BI_MC(qpel, h, 64, 8, hz, mx);
5115 BI_MC(qpel, v, 4, 8, vt, my);
5116 BI_MC(qpel, v, 8, 8, vt, my);
5117 BI_MC(qpel, v, 12, 8, vt, my);
5118 BI_MC(qpel, v, 16, 8, vt, my);
5119 BI_MC(qpel, v, 24, 8, vt, my);
5120 BI_MC(qpel, v, 32, 8, vt, my);
5121 BI_MC(qpel, v, 48, 8, vt, my);
5122 BI_MC(qpel, v, 64, 8, vt, my);
5124 BI_MC(epel, h, 4, 4, hz, mx);
5125 BI_MC(epel, h, 8, 4, hz, mx);
5126 BI_MC(epel, h, 6, 4, hz, mx);
5127 BI_MC(epel, h, 12, 4, hz, mx);
5128 BI_MC(epel, h, 16, 4, hz, mx);
5129 BI_MC(epel, h, 24, 4, hz, mx);
5130 BI_MC(epel, h, 32, 4, hz, mx);
5132 BI_MC(epel, v, 4, 4, vt, my);
5133 BI_MC(epel, v, 8, 4, vt, my);
5134 BI_MC(epel, v, 6, 4, vt, my);
5135 BI_MC(epel, v, 12, 4, vt, my);
5136 BI_MC(epel, v, 16, 4, vt, my);
5137 BI_MC(epel, v, 24, 4, vt, my);
5138 BI_MC(epel, v, 32, 4, vt, my);
5142 #define BI_MC_HV(PEL, WIDTH, TAP) \
5143 void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
5144 ptrdiff_t dst_stride, \
5145 const uint8_t *src, \
5146 ptrdiff_t src_stride, \
5147 const int16_t *src_16bit, \
5153 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
5154 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
5156 hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5157 MAX_PB_SIZE, dst, dst_stride, \
5158 filter_x, filter_y, height); \