27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
31 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \ 33 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ 34 SRARI_H2_SH(out0, out1, rnd_val); \ 35 CLIP_SH2_0_255(out0, out1); \ 38 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \ 39 vec0, vec1, vec2, vec3, rnd_val, \ 40 out0, out1, out2, out3) \ 42 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \ 43 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \ 46 #define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \ 49 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \ 50 SRARI_H2_SH(out0, out1, rnd_val); \ 51 CLIP_SH2_0_255(out0, out1); \ 54 #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \ 55 vec3, rnd_val, out0, out1, out2, out3) \ 57 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \ 58 HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \ 69 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
70 uint64_t tpd0, tpd1, tpd2, tpd3;
73 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
74 v8i16 dst0, dst1, dst2, dst3;
77 LW2(src0_ptr, src_stride, tp0, tp1);
79 LD2(src1_ptr, src2_stride, tpd0, tpd1);
82 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
85 dst0 = __msa_srari_h(dst0, 7);
88 dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
89 ST_W2(dst0, 0, 1, dst, dst_stride);
90 }
else if (4 == height) {
91 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
93 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
99 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
100 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
101 }
else if (0 == height % 8) {
102 for (loop_cnt = (height >> 3); loop_cnt--;) {
103 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
104 src0_ptr += 4 * src_stride;
106 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
107 src0_ptr += 4 * src_stride;
109 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
110 src1_ptr += (4 * src2_stride);
113 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
114 src1_ptr += (4 * src2_stride);
119 SLLI_4V(dst0, dst1, dst2, dst3, 6);
121 dst3, 7, dst0, dst1, dst2, dst3);
123 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
124 dst += (8 * dst_stride);
138 uint64_t tp0, tp1, tp2, tp3;
139 v16u8 out0, out1, out2, out3;
141 v16i8
src0 = { 0 },
src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
142 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
143 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
145 for (loop_cnt = (height >> 3); loop_cnt--;) {
146 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
147 src0_ptr += (4 * src_stride);
150 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
151 src0_ptr += (4 * src_stride);
154 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
155 src1_ptr += (8 * src2_stride);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
161 SLLI_4V(dst4, dst5, dst6, dst7, 6);
163 7, dst0, dst1, dst2, dst3);
165 7, dst4, dst5, dst6, dst7);
168 ST_W2(out0, 0, 2, dst, dst_stride);
169 ST_H2(out0, 2, 6, dst + 4, dst_stride);
170 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
171 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
172 dst += (4 * dst_stride);
173 ST_W2(out2, 0, 2, dst, dst_stride);
174 ST_H2(out2, 2, 6, dst + 4, dst_stride);
175 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
176 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
177 dst += (4 * dst_stride);
189 uint64_t tp0, tp1, tp2, tp3;
190 v16u8 out0, out1, out2, out3;
191 v16i8
src0 = { 0 },
src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
193 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
194 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
197 LD2(src0_ptr, src_stride, tp0, tp1);
199 LD_SH2(src1_ptr, src2_stride, in0, in1);
203 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
204 ST_D2(out0, 0, 1, dst, dst_stride);
205 }
else if (4 == height) {
206 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
211 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
212 SLLI_4V(dst0, dst1, dst2, dst3, 6);
214 7, dst0, dst1, dst2, dst3);
216 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
217 }
else if (6 == height) {
218 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
219 src0_ptr += 4 * src_stride;
222 LD2(src0_ptr, src_stride, tp0, tp1);
227 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
228 SLLI_4V(dst0, dst1, dst2, dst3, 6);
231 7, dst0, dst1, dst2, dst3);
233 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
234 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
235 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
236 }
else if (0 == height % 8) {
239 for (loop_cnt = (height >> 3); loop_cnt--;) {
240 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
241 src0_ptr += 4 * src_stride;
244 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
245 src0_ptr += 4 * src_stride;
252 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
254 src1_ptr += (8 * src2_stride);
255 SLLI_4V(dst0, dst1, dst2, dst3, 6);
256 SLLI_4V(dst4, dst5, dst6, dst7, 6);
258 dst3, 7, dst0, dst1, dst2, dst3);
260 dst7, 7, dst4, dst5, dst6, dst7);
263 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
264 dst += (8 * dst_stride);
279 v16u8 out0, out1, out2;
281 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
282 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
284 for (loop_cnt = 4; loop_cnt--;) {
285 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
286 src0_ptr += (4 * src_stride);
288 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
289 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
290 src1_ptr += (4 * src2_stride);
292 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
294 SLLI_4V(dst0, dst1, dst2, dst3, 6);
295 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
296 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
299 7, dst0, dst1, dst2, dst3);
301 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
302 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
303 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
304 dst += (4 * dst_stride);
317 v16u8 out0, out1, out2, out3;
319 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
320 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
323 for (loop_cnt = (height >> 2); loop_cnt--;) {
324 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
325 src0_ptr += (4 * src_stride);
326 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
327 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
328 src1_ptr += (4 * src2_stride);
333 SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
334 SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
336 dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
338 dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
339 PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
340 PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
341 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
342 dst += (4 * dst_stride);
355 v16u8 out0, out1, out2, out3, out4, out5;
356 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7,
zero = { 0 };
357 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
358 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
360 for (loop_cnt = 8; loop_cnt--;) {
361 LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
362 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
363 src0_ptr += (4 * src_stride);
364 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
365 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
366 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
367 src1_ptr += (4 * src2_stride);
371 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
374 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
375 SLLI_4V(dst0, dst1, dst2, dst3, 6);
376 SLLI_4V(dst4, dst5, dst6, dst7, 6);
377 SLLI_4V(dst8, dst9, dst10, dst11, 6);
379 7, dst0, dst1, dst2, dst3);
381 7, dst4, dst5, dst6, dst7);
383 dst11, 7, dst8, dst9, dst10, dst11);
384 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
385 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
386 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
387 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
388 dst += (4 * dst_stride);
401 v16u8 out0, out1, out2, out3;
404 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
405 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
407 for (loop_cnt = (height >> 1); loop_cnt--;) {
408 LD_SB2(src0_ptr, 16, src0, src1);
409 src0_ptr += src_stride;
410 LD_SB2(src0_ptr, 16, src2, src3);
411 src0_ptr += src_stride;
412 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
413 src1_ptr += src2_stride;
414 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
415 src1_ptr += src2_stride;
421 SLLI_4V(dst0, dst1, dst2, dst3, 6);
422 SLLI_4V(dst4, dst5, dst6, dst7, 6);
424 7, dst0, dst1, dst2, dst3);
426 7, dst4, dst5, dst6, dst7);
429 ST_UB2(out0, out1, dst, 16);
431 ST_UB2(out2, out3, dst, 16);
445 v16u8 out0, out1, out2, out3, out4, out5;
446 v16i8
src0,
src1, src2, src3, src4, src5;
448 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
449 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
451 for (loop_cnt = (height >> 1); loop_cnt--;) {
452 LD_SB3(src0_ptr, 16, src0, src1, src2);
453 src0_ptr += src_stride;
454 LD_SB3(src0_ptr, 16, src3, src4, src5);
455 src0_ptr += src_stride;
457 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
458 src1_ptr += src2_stride;
459 LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
460 src1_ptr += src2_stride;
469 SLLI_4V(dst0, dst1, dst2, dst3, 6);
470 SLLI_4V(dst4, dst5, dst6, dst7, 6);
471 SLLI_4V(dst8, dst9, dst10, dst11, 6);
474 7, dst0, dst1, dst2, dst3);
476 7, dst4, dst5, dst6, dst7);
478 dst11, 7, dst8, dst9, dst10, dst11);
479 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
480 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
481 ST_UB2(out0, out1, dst, 16);
482 ST_UB(out2, dst + 32);
484 ST_UB2(out3, out4, dst, 16);
485 ST_UB(out5, dst + 32);
499 v16u8 out0, out1, out2, out3;
502 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
503 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
505 for (loop_cnt = height; loop_cnt--;) {
506 LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
507 src0_ptr += src_stride;
508 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
509 src1_ptr += src2_stride;
515 SLLI_4V(dst0, dst1, dst2, dst3, 6);
516 SLLI_4V(dst4, dst5, dst6, dst7, 6);
518 7, dst0, dst1, dst2, dst3);
520 7, dst4, dst5, dst6, dst7);
524 ST_UB4(out0, out1, out2, out3, dst, 16);
539 v8i16 filt0, filt1, filt2, filt3;
540 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
541 v16i8 mask1, mask2, mask3;
542 v16i8 vec0, vec1, vec2, vec3;
543 v8i16 dst0, dst1, dst2, dst3;
544 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
545 v8i16 filter_vec, const_vec;
551 filter_vec =
LD_SH(filter);
552 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
558 const_vec = __msa_ldi_h(128);
561 for (loop_cnt = (height >> 3); loop_cnt--;) {
562 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
563 src4, src5, src6, src7);
564 src0_ptr += (8 * src_stride);
565 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
566 src1_ptr += (8 * src2_stride);
576 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
577 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
578 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
580 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
581 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
582 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
584 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
585 VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
586 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
588 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
589 VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
590 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
594 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
597 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
598 dst += (8 * dst_stride);
612 v8i16 filt0, filt1, filt2, filt3;
614 v16i8 mask1, mask2, mask3;
615 v16i8 vec0, vec1, vec2, vec3;
616 v8i16 dst0, dst1, dst2, dst3;
617 v8i16 in0, in1, in2, in3;
618 v8i16 filter_vec, const_vec;
623 const_vec = __msa_ldi_h(128);
626 filter_vec =
LD_SH(filter);
627 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
633 for (loop_cnt = (height >> 2); loop_cnt--;) {
634 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
635 src0_ptr += (4 * src_stride);
636 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
637 src1_ptr += (4 * src2_stride);
644 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
645 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
646 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
648 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
649 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
650 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
652 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
653 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
654 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
656 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
657 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
658 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
662 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
665 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
666 dst += (4 * dst_stride);
683 v16i8 vec0, vec1, vec2;
684 v8i16 filt0, filt1, filt2, filt3;
685 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
686 v8i16 dst0, dst1, dst2;
687 v8i16 in0, in1, in2, in3;
688 v8i16 filter_vec, const_vec;
691 const_vec = __msa_ldi_h(128);
694 filter_vec =
LD_SH(filter);
695 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
706 for (loop_cnt = 8; loop_cnt--;) {
707 LD_SB2(src0_ptr, 8, src0, src1);
708 src0_ptr += src_stride;
709 LD_SB2(src0_ptr, 8, src2, src3);
710 src0_ptr += src_stride;
711 LD_SH2(src1_ptr, 8, in0, in1);
712 src1_ptr += src2_stride;
713 LD_SH2(src1_ptr, 8, in2, in3);
714 src1_ptr += src2_stride;
721 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
724 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
725 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
728 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
729 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
732 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
733 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
736 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
738 in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
740 dst2 = __msa_adds_s_h(in2, dst2);
741 dst2 = __msa_srari_h(dst2, 7);
745 tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
746 tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
747 tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
748 tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
769 v8i16 filt0, filt1, filt2, filt3;
770 v16i8 mask1, mask2, mask3;
771 v16i8 vec0, vec1, vec2, vec3;
772 v8i16 dst0, dst1, dst2, dst3;
773 v8i16 in0, in1, in2, in3;
774 v8i16 filter_vec, const_vec;
778 const_vec = __msa_ldi_h(128);
781 filter_vec =
LD_SH(filter);
782 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
788 for (loop_cnt = (height >> 1); loop_cnt--;) {
789 LD_SB2(src0_ptr, 8, src0, src1);
790 src0_ptr += src_stride;
791 LD_SB2(src0_ptr, 8, src2, src3);
792 src0_ptr += src_stride;
793 LD_SH2(src1_ptr, 8, in0, in1);
794 src1_ptr += src2_stride;
795 LD_SH2(src1_ptr, 8, in2, in3);
796 src1_ptr += src2_stride;
803 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
804 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
805 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
807 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
808 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
809 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
811 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
812 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
813 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
815 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
816 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
817 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
821 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
824 ST_SH2(dst0, dst1, dst, dst_stride);
825 dst += (2 * dst_stride);
841 v8i16 filt0, filt1, filt2, filt3;
842 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
843 v16i8 vec0, vec1, vec2, vec3;
844 v8i16 dst0, dst1, dst2;
846 v8i16 filter_vec, const_vec;
849 src0_ptr = src0_ptr - 3;
850 const_vec = __msa_ldi_h(128);
853 filter_vec =
LD_SH(filter);
854 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
864 for (loop_cnt = height; loop_cnt--;) {
865 LD_SB2(src0_ptr, 16, src0, src1);
866 src0_ptr += src_stride;
867 LD_SH2(src1_ptr, 8, in0, in1);
868 in2 =
LD_SH(src1_ptr + 16);
869 src1_ptr += src2_stride;
875 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
876 VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
877 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
879 VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
880 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
881 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
883 VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
884 VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
885 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
889 dst2 = __msa_adds_s_h(dst2, in2);
890 dst2 = __msa_srari_h(dst2, 7);
894 dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
896 SD(dst_val0, dst + 16);
912 v8i16 filt0, filt1, filt2, filt3;
913 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
914 v16i8 vec0, vec1, vec2, vec3;
915 v8i16 dst0, dst1, dst2, dst3;
916 v8i16 in0, in1, in2, in3;
917 v8i16 filter_vec, const_vec;
921 const_vec = __msa_ldi_h(128);
924 filter_vec =
LD_SH(filter);
925 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
935 for (loop_cnt = height; loop_cnt--;) {
936 LD_SB2(src0_ptr, 16, src0, src1);
937 src2 =
LD_SB(src0_ptr + 24);
938 src0_ptr += src_stride;
939 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
940 src1_ptr += src2_stride;
947 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
948 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
949 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
951 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
952 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
953 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
955 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
956 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
957 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
959 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
960 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
961 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
965 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
968 ST_SB2(tmp0, tmp1, dst, 16);
984 v16i8 tmp0, tmp1, tmp2;
985 v8i16 filt0, filt1, filt2, filt3;
986 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
987 v16i8 vec0, vec1, vec2, vec3;
988 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
989 v8i16 in0, in1, in2, in3, in4, in5;
990 v8i16 filter_vec, const_vec;
995 const_vec = __msa_ldi_h(128);
998 filter_vec =
LD_SH(filter);
999 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1009 for (loop_cnt = 64; loop_cnt--;) {
1010 LD_SB3(src0_ptr, 16, src0, src1, src2);
1011 src3 =
LD_SB(src0_ptr + 40);
1012 src0_ptr += src_stride;
1013 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1021 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1022 VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
1023 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1025 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1026 VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
1027 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1029 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1030 VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
1031 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1033 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1034 VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
1035 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1041 ST_SB(tmp1, dst + 16);
1043 LD_SH2(src1_ptr + 32, 8, in4, in5);
1044 src1_ptr += src2_stride;
1048 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
1049 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
1050 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
1052 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
1053 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
1054 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
1059 tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
1060 ST_SB(tmp2, dst + 32);
1075 v16i8
src0,
src1, src2, src3, src4, src5, tmp0, tmp1;
1076 v8i16 filt0, filt1, filt2, filt3;
1078 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1079 v16i8 vec0, vec1, vec2, vec3;
1080 v8i16 dst0, dst1, dst2, dst3;
1081 v8i16 in0, in1, in2, in3;
1082 v8i16 filter_vec, const_vec;
1086 const_vec = __msa_ldi_h(128);
1089 filter_vec =
LD_SH(filter);
1090 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1100 for (loop_cnt = height; loop_cnt--;) {
1101 LD_SB2(src0_ptr, 16, src0, src1);
1102 src2 =
LD_SB(src0_ptr + 24);
1103 LD_SB2(src0_ptr + 32, 16, src3, src4);
1104 src5 =
LD_SB(src0_ptr + 56);
1105 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1113 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1114 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1115 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1117 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1118 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1119 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1121 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1122 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1123 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1125 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1126 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1127 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1131 dst0, dst1, dst2, dst3, 7,
1132 dst0, dst1, dst2, dst3);
1135 ST_SB2(tmp0, tmp1, dst, 16);
1141 LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
1148 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1149 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1150 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1152 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1153 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1154 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1156 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1157 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1158 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1160 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1161 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1162 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1165 dst0, dst1, dst2, dst3, 7,
1166 dst0, dst1, dst2, dst3);
1168 ST_SB2(tmp0, tmp1, dst + 32, 16);
1169 src1_ptr += src2_stride;
1170 src0_ptr += src_stride;
1185 v16i8
src0,
src1, src2, src3, src4, src5;
1186 v16i8 src6, src7, src8, src9, src10;
1187 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1188 v16i8 src11, src12, src13, src14;
1189 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1190 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1191 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1192 v16i8 src2110, src4332, src6554, src8776, src10998;
1193 v16i8 src12111110, src14131312;
1194 v8i16 dst10, dst32, dst54, dst76;
1195 v8i16 filt0, filt1, filt2, filt3;
1196 v8i16 filter_vec, const_vec;
1198 src0_ptr -= (3 * src_stride);
1200 const_vec = __msa_ldi_h(128);
1203 filter_vec =
LD_SH(filter);
1204 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1206 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1207 src0_ptr += (7 * src_stride);
1208 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1209 src10_r, src32_r, src54_r, src21_r);
1210 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1211 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1212 src2110, src4332, src6554);
1215 for (loop_cnt = (height >> 3); loop_cnt--;) {
1216 LD_SB8(src0_ptr, src_stride,
1217 src7, src8, src9, src10, src11, src12, src13, src14);
1218 src0_ptr += (8 * src_stride);
1219 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1220 src1_ptr += (8 * src2_stride);
1224 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1225 src76_r, src87_r, src98_r, src109_r);
1226 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1227 src1110_r, src1211_r, src1312_r, src1413_r);
1228 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1229 src1413_r, src1312_r,
1230 src8776, src10998, src12111110, src14131312);
1235 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1238 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1241 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1243 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1244 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1247 dst10, dst32, dst54, dst76, 7,
1248 dst10, dst32, dst54, dst76);
1250 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1251 ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1252 dst += (8 * dst_stride);
1255 src4332 = src12111110;
1256 src6554 = src14131312;
1271 v16i8
src0,
src1, src2, src3, src4, src5;
1272 v16i8 src6, src7, src8, src9, src10;
1273 v8i16 in0, in1, in2, in3;
1274 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1275 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1276 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1277 v8i16 filt0, filt1, filt2, filt3;
1278 v8i16 filter_vec, const_vec;
1280 src0_ptr -= (3 * src_stride);
1281 const_vec = __msa_ldi_h(128);
1284 filter_vec =
LD_SH(filter);
1285 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1287 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1288 src0_ptr += (7 * src_stride);
1290 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1291 src10_r, src32_r, src54_r, src21_r);
1292 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1294 for (loop_cnt = (height >> 2); loop_cnt--;) {
1295 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1296 src0_ptr += (4 * src_stride);
1297 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1298 src1_ptr += (4 * src2_stride);
1300 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1301 src76_r, src87_r, src98_r, src109_r);
1305 filt0, filt1, filt2, filt3,
1306 dst0_r, dst0_r, dst0_r, dst0_r);
1309 filt0, filt1, filt2, filt3,
1310 dst1_r, dst1_r, dst1_r, dst1_r);
1313 filt0, filt1, filt2, filt3,
1314 dst2_r, dst2_r, dst2_r, dst2_r);
1317 filt0, filt1, filt2, filt3,
1318 dst3_r, dst3_r, dst3_r, dst3_r);
1321 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1322 dst0_r, dst1_r, dst2_r, dst3_r);
1324 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1325 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
1326 dst += (4 * dst_stride);
1349 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1350 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1351 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1352 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1353 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1354 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1355 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1356 v16i8 src2110, src4332, src6554, src8776, src10998;
1357 v8i16 dst0_l, dst1_l;
1358 v8i16 filt0, filt1, filt2, filt3;
1359 v8i16 filter_vec, const_vec;
1361 src0_ptr -= (3 * src_stride);
1362 const_vec = __msa_ldi_h(128);
1365 filter_vec =
LD_SH(filter);
1366 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1368 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1369 src0_ptr += (7 * src_stride);
1372 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1373 src10_r, src32_r, src54_r, src21_r);
1374 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1375 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1376 src10_l, src32_l, src54_l, src21_l);
1377 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1378 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1379 src2110, src4332, src6554);
1381 for (loop_cnt = (height >> 2); loop_cnt--;) {
1382 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1383 src0_ptr += (4 * src_stride);
1384 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1385 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1386 src1_ptr += (4 * src2_stride);
1390 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1391 src76_r, src87_r, src98_r, src109_r);
1392 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1393 src76_l, src87_l, src98_l, src109_l);
1394 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
1398 filt0, filt1, filt2, filt3,
1399 dst0_r, dst0_r, dst0_r, dst0_r);
1402 filt0, filt1, filt2, filt3,
1403 dst1_r, dst1_r, dst1_r, dst1_r);
1406 filt0, filt1, filt2, filt3,
1407 dst2_r, dst2_r, dst2_r, dst2_r);
1410 filt0, filt1, filt2, filt3,
1411 dst3_r, dst3_r, dst3_r, dst3_r);
1414 filt0, filt1, filt2, filt3,
1415 dst0_l, dst0_l, dst0_l, dst0_l);
1418 filt0, filt1, filt2, filt3,
1419 dst1_l, dst1_l, dst1_l, dst1_l);
1422 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1423 dst0_r, dst1_r, dst2_r, dst3_r);
1427 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1428 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1429 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
1430 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
1431 dst += (4 * dst_stride);
1456 int16_t *src1_ptr_tmp;
1460 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
1461 v8i16 in0, in1, in2, in3;
1462 v16i8 src10_r, src32_r, src54_r, src76_r;
1463 v16i8 src21_r, src43_r, src65_r, src87_r;
1464 v8i16 dst0_r, dst1_r;
1465 v16i8 src10_l, src32_l, src54_l, src76_l;
1466 v16i8 src21_l, src43_l, src65_l, src87_l;
1467 v8i16 dst0_l, dst1_l;
1468 v8i16 filt0, filt1, filt2, filt3;
1469 v8i16 filter_vec, const_vec;
1471 src0_ptr -= (3 * src_stride);
1472 const_vec = __msa_ldi_h(128);
1475 filter_vec =
LD_SH(filter);
1476 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1478 for (cnt = (width >> 4); cnt--;) {
1479 src0_ptr_tmp = src0_ptr;
1480 src1_ptr_tmp = src1_ptr;
1483 LD_SB7(src0_ptr_tmp, src_stride,
1484 src0, src1, src2, src3, src4, src5, src6);
1485 src0_ptr_tmp += (7 * src_stride);
1488 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1489 src10_r, src32_r, src54_r, src21_r);
1490 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1491 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1492 src10_l, src32_l, src54_l, src21_l);
1493 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1495 for (loop_cnt = (height >> 1); loop_cnt--;) {
1496 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1497 src0_ptr_tmp += (2 * src_stride);
1498 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1499 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1500 src1_ptr_tmp += (2 * src2_stride);
1503 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1504 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1508 filt0, filt1, filt2, filt3,
1509 dst0_r, dst0_r, dst0_r, dst0_r);
1512 filt0, filt1, filt2, filt3,
1513 dst1_r, dst1_r, dst1_r, dst1_r);
1516 filt0, filt1, filt2, filt3,
1517 dst0_l, dst0_l, dst0_l, dst0_l);
1520 filt0, filt1, filt2, filt3,
1521 dst1_l, dst1_l, dst1_l, dst1_l);
1524 dst0_r, dst1_r, dst0_l, dst1_l, 7,
1525 dst0_r, dst1_r, dst0_l, dst1_l);
1527 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1528 ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1529 dst_tmp += (2 * dst_stride);
1562 dst, dst_stride, filter, height, 16);
1575 dst, dst_stride, filter, height, 16);
1577 dst + 16, dst_stride, filter, height);
1590 dst, dst_stride, filter, height, 32);
1603 dst, dst_stride, filter, height, 48);
1616 dst, dst_stride, filter, height, 64);
1625 const int8_t *filter_x,
1626 const int8_t *filter_y,
1632 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1633 v8i16 in0 = { 0 }, in1 = { 0 };
1634 v8i16 filt0, filt1, filt2, filt3;
1635 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1636 v16i8 mask1, mask2, mask3;
1637 v8i16 filter_vec, const_vec;
1638 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1639 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1641 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1642 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1643 v4i32 dst0, dst1, dst2, dst3;
1646 src0_ptr -= ((3 * src_stride) + 3);
1647 filter_vec =
LD_SH(filter_x);
1648 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1650 filter_vec =
LD_SH(filter_y);
1653 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1659 const_vec = __msa_ldi_h(128);
1662 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1663 src0_ptr += (7 * src_stride);
1667 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1668 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1669 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1670 vec8, vec9, vec10, vec11);
1671 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1672 vec12, vec13, vec14, vec15);
1687 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1689 for (loop_cnt = height >> 2; loop_cnt--;) {
1690 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1691 src0_ptr += (4 * src_stride);
1694 LD2(src1_ptr, src2_stride, tp0, tp1);
1696 src1_ptr += (2 * src2_stride);
1697 LD2(src1_ptr, src2_stride, tp0, tp1);
1699 src1_ptr += (2 * src2_stride);
1701 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1702 vec0, vec1, vec2, vec3);
1703 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1704 vec4, vec5, vec6, vec7);
1710 dst76 = __msa_ilvr_h(dst97, dst66);
1712 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1713 dst98 = __msa_ilvr_h(dst66, dst108);
1715 dst0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
1717 dst1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
1719 dst2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
1721 dst3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
1724 SRA_4V(dst0, dst1, dst2, dst3, 6);
1727 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
1730 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1731 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1732 dst += (4 * dst_stride);
1740 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
1750 const int8_t *filter_x,
1751 const int8_t *filter_y,
1757 int16_t *src1_ptr_tmp;
1760 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
1762 v8i16 filt0, filt1, filt2, filt3;
1763 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1765 v16i8 mask1, mask2, mask3;
1766 v8i16 filter_vec, const_vec;
1767 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1768 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1769 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1770 v4i32 dst0_r, dst0_l;
1771 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1772 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1774 src0_ptr -= ((3 * src_stride) + 3);
1775 const_vec = __msa_ldi_h(128);
1778 filter_vec =
LD_SH(filter_x);
1779 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1781 filter_vec =
LD_SH(filter_y);
1784 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1790 for (cnt = width >> 3; cnt--;) {
1791 src0_ptr_tmp = src0_ptr;
1793 src1_ptr_tmp = src1_ptr;
1795 LD_SB7(src0_ptr_tmp, src_stride,
1796 src0, src1, src2, src3, src4, src5, src6);
1797 src0_ptr_tmp += (7 * src_stride);
1801 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1802 vec0, vec1, vec2, vec3);
1803 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1804 vec4, vec5, vec6, vec7);
1805 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1806 vec8, vec9, vec10, vec11);
1807 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1808 vec12, vec13, vec14, vec15);
1818 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1819 vec0, vec1, vec2, vec3);
1820 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1821 vec4, vec5, vec6, vec7);
1822 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1823 vec8, vec9, vec10, vec11);
1831 for (loop_cnt = height; loop_cnt--;) {
1832 src7 =
LD_SB(src0_ptr_tmp);
1833 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1834 src0_ptr_tmp += src_stride;
1836 in0 =
LD_SH(src1_ptr_tmp);
1837 src1_ptr_tmp += src2_stride;
1839 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1840 vec0, vec1, vec2, vec3);
1848 filt_h0, filt_h1, filt_h2, filt_h3);
1850 filt_h0, filt_h1, filt_h2, filt_h3);
1854 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1856 tmp = __msa_srari_h(tmp, 7);
1858 out = (v16u8) __msa_pckev_b((v16i8)
tmp, (v16i8) tmp);
1859 ST_D1(out, 0, dst_tmp);
1860 dst_tmp += dst_stride;
1883 const int8_t *filter_x,
1884 const int8_t *filter_y,
1888 dst, dst_stride, filter_x, filter_y,
1898 const int8_t *filter_x,
1899 const int8_t *filter_y,
1903 uint8_t *src0_ptr_tmp, *dst_tmp;
1904 int16_t *src1_ptr_tmp;
1907 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1908 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1909 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1910 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1911 v8i16 in0, in1 = { 0 }, out0, out1,
tmp, filter_vec, const_vec;
1912 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1913 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1914 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1915 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1916 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1917 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1918 v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
1920 src0_ptr -= ((3 * src_stride) + 3);
1922 const_vec = __msa_ldi_h(128);
1925 filter_vec =
LD_SH(filter_x);
1926 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1928 filter_vec =
LD_SH(filter_y);
1931 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1938 src0_ptr_tmp = src0_ptr;
1940 src1_ptr_tmp = src1_ptr;
1942 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
1944 src0_ptr_tmp += (7 * src_stride);
1948 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1950 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1952 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1954 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1964 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1966 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1968 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1977 for (loop_cnt = 16; loop_cnt--;) {
1978 src7 =
LD_SB(src0_ptr_tmp);
1979 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1980 src0_ptr_tmp += src_stride;
1982 in0 =
LD_SH(src1_ptr_tmp);
1983 src1_ptr_tmp += src2_stride;
1985 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1993 dst0_r =
HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1994 filt_h1, filt_h2, filt_h3);
1995 dst0_l =
HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1996 filt_h1, filt_h2, filt_h3);
2000 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2002 tmp = __msa_srari_h(tmp, 7);
2004 out = (v16u8) __msa_pckev_b((v16i8)
tmp, (v16i8) tmp);
2005 ST_D1(out, 0, dst_tmp);
2006 dst_tmp += dst_stride;
2026 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2027 src0_ptr += (7 * src_stride);
2031 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2032 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2033 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
2034 vec8, vec9, vec10, vec11);
2035 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
2036 vec12, vec13, vec14, vec15);
2050 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2052 for (loop_cnt = 4; loop_cnt--;) {
2053 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2054 src0_ptr += (4 * src_stride);
2057 LD2(src1_ptr, src2_stride, tp0, tp1);
2059 src1_ptr += (2 * src2_stride);
2060 LD2(src1_ptr, src2_stride, tp0, tp1);
2062 src1_ptr += (2 * src2_stride);
2064 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2066 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2073 dst76 = __msa_ilvr_h(dst97, dst66);
2075 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2076 dst98 = __msa_ilvr_h(dst66, dst108);
2078 tmp0 =
HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2080 tmp1 =
HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2082 tmp2 =
HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2084 tmp3 =
HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2086 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
2089 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
2092 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
2093 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2094 dst += (4 * dst_stride);
2102 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2112 const int8_t *filter_x,
2113 const int8_t *filter_y,
2117 dst, dst_stride, filter_x, filter_y,
2127 const int8_t *filter_x,
2128 const int8_t *filter_y,
2132 dst, dst_stride, filter_x, filter_y,
2142 const int8_t *filter_x,
2143 const int8_t *filter_y,
2147 dst, dst_stride, filter_x, filter_y,
2157 const int8_t *filter_x,
2158 const int8_t *filter_y,
2162 dst, dst_stride, filter_x, filter_y,
2172 const int8_t *filter_x,
2173 const int8_t *filter_y,
2177 dst, dst_stride, filter_x, filter_y,
2191 v16i8
src0,
src1, dst0, vec0, vec1;
2196 v8i16 filter_vec, const_vec;
2200 const_vec = __msa_ldi_h(128);
2203 filter_vec =
LD_SH(filter);
2208 LD_SB2(src0_ptr, src_stride, src0, src1);
2209 LD_SH2(src1_ptr, src2_stride, in0, in1);
2210 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2212 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2216 tmp0 = __msa_adds_s_h(tmp0, in0);
2217 tmp0 = __msa_srari_h(tmp0, 7);
2219 dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2221 ST_W2(dst0, 0, 1, dst, dst_stride);
2234 v16i8
src0,
src1, src2, src3, dst0, vec0, vec1;
2235 v8i16 in0, in1, in2, in3;
2240 v8i16 filter_vec, const_vec;
2244 const_vec = __msa_ldi_h(128);
2247 filter_vec =
LD_SH(filter);
2252 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2253 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2260 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2261 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2262 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
2265 dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2267 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
2281 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2283 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2285 v16i8 mask1, vec0, vec1, vec2, vec3;
2286 v8i16 tmp0, tmp1, tmp2, tmp3;
2287 v8i16 filter_vec, const_vec;
2291 const_vec = __msa_ldi_h(128);
2294 filter_vec =
LD_SH(filter);
2299 for (loop_cnt = (height >> 3); loop_cnt--;) {
2300 LD_SB8(src0_ptr, src_stride,
2301 src0, src1, src2, src3, src4, src5, src6, src7);
2302 src0_ptr += (8 * src_stride);
2303 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2304 src1_ptr += (4 * src2_stride);
2305 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2306 src1_ptr += (4 * src2_stride);
2315 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2316 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
2317 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
2319 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
2320 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
2321 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
2325 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2328 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2329 dst += (8 * dst_stride);
2344 dst, dst_stride, filter, height);
2345 }
else if (4 == height) {
2347 dst, dst_stride, filter, height);
2348 }
else if (8 == height || 16 == height) {
2350 src1_ptr, src2_stride,
2351 dst, dst_stride, filter, height);
2367 v8i16 in0, in1, in2, in3;
2370 v16i8 vec0, vec1, vec2, vec3;
2371 v8i16 dst0, dst1, dst2, dst3;
2372 v8i16 filter_vec, const_vec;
2376 const_vec = __msa_ldi_h(128);
2379 filter_vec =
LD_SH(filter);
2384 for (loop_cnt = (height >> 2); loop_cnt--;) {
2385 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2386 src0_ptr += (4 * src_stride);
2387 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2388 src1_ptr += (4 * src2_stride);
2395 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2396 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2397 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2399 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2400 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2401 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2405 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2408 ST_W2(dst0, 0, 2, dst, dst_stride);
2409 ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2410 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2411 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2412 dst += (4 * dst_stride);
2429 v16i8 mask1, vec0, vec1, vec2, vec3;
2431 v8i16 filter_vec, const_vec;
2435 const_vec = __msa_ldi_h(128);
2438 filter_vec =
LD_SH(filter);
2443 LD_SB2(src0_ptr, src_stride, src0, src1);
2444 LD_SH2(src1_ptr, src2_stride, in0, in1);
2449 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2450 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2451 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
2455 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2456 ST_D2(dst0, 0, 1, dst, dst_stride);
2469 v16i8
src0,
src1, src2, src3, src4, src5;
2470 v8i16 in0, in1, in2, in3, in4, in5;
2473 v16i8 vec0, vec1, vec2, vec3;
2474 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2475 v8i16 filter_vec, const_vec;
2479 const_vec = __msa_ldi_h(128);
2482 filter_vec =
LD_SH(filter);
2487 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2488 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2489 src1_ptr += (4 * src2_stride);
2490 LD_SH2(src1_ptr, src2_stride, in4, in5);
2497 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2498 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2499 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
2501 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2502 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2503 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
2508 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2509 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
2510 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
2514 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2518 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2519 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2520 ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
2535 v8i16 in0, in1, in2, in3;
2538 v16i8 vec0, vec1, vec2, vec3;
2539 v8i16 dst0, dst1, dst2, dst3;
2540 v8i16 filter_vec, const_vec;
2544 const_vec = __msa_ldi_h(128);
2547 filter_vec =
LD_SH(filter);
2552 for (loop_cnt = (height >> 2); loop_cnt--;) {
2553 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2554 src0_ptr += (4 * src_stride);
2555 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2556 src1_ptr += (4 * src2_stride);
2563 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2564 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2565 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2567 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2568 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2569 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2573 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2576 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2577 dst += (4 * dst_stride);
2592 dst, dst_stride, filter, height);
2593 }
else if (6 == height) {
2595 dst, dst_stride, filter, height);
2596 }
else if (0 == (height % 4)) {
2598 src1_ptr, src2_stride,
2599 dst, dst_stride, filter, height);
2615 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2618 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2621 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2622 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2623 v8i16 filter_vec, const_vec;
2627 const_vec = __msa_ldi_h(128);
2630 filter_vec =
LD_SH(filter);
2636 for (loop_cnt = (height >> 2); loop_cnt--;) {
2637 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2638 src0_ptr += (4 * src_stride);
2639 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2640 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2641 src1_ptr += (4 * src2_stride);
2652 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2653 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2654 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2655 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2658 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2659 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2660 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
2661 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2666 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2670 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2671 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2672 ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
2673 dst += (4 * dst_stride);
2687 v16i8
src0,
src1, src2, src3, vec0, vec1, vec2, vec3;
2688 v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
2692 v8i16 filter_vec, const_vec;
2696 const_vec = __msa_ldi_h(128);
2699 filter_vec =
LD_SH(filter);
2704 for (loop_cnt = (height >> 1); loop_cnt--;) {
2705 LD_SB2(src0_ptr, src_stride, src0, src2);
2706 LD_SB2(src0_ptr + 8, src_stride, src1, src3);
2707 src0_ptr += (2 * src_stride);
2708 LD_SH2(src1_ptr, src2_stride, in0, in2);
2709 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
2710 src1_ptr += (2 * src2_stride);
2719 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2720 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2721 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2723 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2724 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2725 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2729 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2732 ST_SH2(dst0, dst1, dst, dst_stride);
2733 dst += (2 * dst_stride);
2746 int16_t *src1_ptr_tmp;
2749 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7;
2750 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2753 v16i8 mask1, mask2, mask3;
2754 v16i8 vec0, vec1, vec2, vec3;
2755 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2756 v8i16 filter_vec, const_vec;
2760 const_vec = __msa_ldi_h(128);
2763 filter_vec =
LD_SH(filter);
2771 src1_ptr_tmp = src1_ptr + 16;
2773 for (loop_cnt = (height >> 2); loop_cnt--;) {
2774 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2775 LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2776 src0_ptr += (4 * src_stride);
2777 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2778 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2779 src1_ptr += (4 * src2_stride);
2786 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2787 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
2788 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2790 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2791 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
2792 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2799 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
2800 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
2801 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
2803 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
2804 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
2805 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
2809 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2811 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2814 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2815 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2816 dst += (4 * dst_stride);
2818 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2819 src1_ptr_tmp += (4 * src2_stride);
2825 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2826 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2827 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2829 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
2830 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
2831 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2835 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2838 ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
2839 dst_tmp += (4 * dst_stride);
2854 v8i16 in0, in1, in2, in3;
2857 v16i8 mask1, mask2, mask3;
2858 v8i16 dst0, dst1, dst2, dst3;
2859 v16i8 vec0, vec1, vec2, vec3;
2860 v8i16 filter_vec, const_vec;
2864 const_vec = __msa_ldi_h(128);
2867 filter_vec =
LD_SH(filter);
2874 for (loop_cnt = height; loop_cnt--;) {
2875 LD_SB2(src0_ptr, 16, src0, src1);
2876 src2 =
LD_SB(src0_ptr + 24);
2877 src0_ptr += src_stride;
2878 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2879 src1_ptr += src2_stride;
2886 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2887 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
2888 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2890 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2891 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
2892 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2896 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2899 ST_SH2(dst0, dst1, dst, 16);
2913 v16i8
src0,
src1, src2, src3, src4;
2915 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2918 v8i16 filter_vec, const_vec;
2920 src0_ptr -= src_stride;
2922 const_vec = __msa_ldi_h(128);
2925 filter_vec =
LD_SH(filter);
2928 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2929 src0_ptr += (3 * src_stride);
2931 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2932 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2933 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2935 LD_SB2(src0_ptr, src_stride, src3, src4);
2936 LD_SH2(src1_ptr, src2_stride, in0, in1);
2937 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2938 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2939 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2940 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2943 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2944 dst10 = __msa_adds_s_h(dst10, in0);
2945 dst10 = __msa_srari_h(dst10, 7);
2948 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2949 ST_W2(dst10, 0, 1, dst, dst_stride);
2961 v16i8
src0,
src1, src2, src3, src4, src5, src6;
2962 v8i16 in0, in1, in2, in3;
2963 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2964 v16i8 src2110, src4332, src6554;
2967 v8i16 filter_vec, const_vec;
2969 src0_ptr -= src_stride;
2971 const_vec = __msa_ldi_h(128);
2974 filter_vec =
LD_SH(filter);
2977 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2978 src0_ptr += (3 * src_stride);
2979 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2980 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2981 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2983 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
2984 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2986 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2987 src32_r, src43_r, src54_r, src65_r);
2988 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
2992 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2994 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2997 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2998 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
3011 v16i8
src0,
src1, src2, src3, src4, src5;
3012 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3013 v16i8 src6, src7, src8, src9;
3014 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3015 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3016 v16i8 src2110, src4332, src6554, src8776;
3017 v8i16 dst10, dst32, dst54, dst76;
3019 v8i16 filter_vec, const_vec;
3021 src0_ptr -= src_stride;
3023 const_vec = __msa_ldi_h(128);
3026 filter_vec =
LD_SH(filter);
3029 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3030 src0_ptr += (3 * src_stride);
3031 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3032 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3033 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3035 for (loop_cnt = (height >> 3); loop_cnt--;) {
3036 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3037 src0_ptr += (6 * src_stride);
3038 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3039 src1_ptr += (8 * src2_stride);
3042 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3043 src32_r, src43_r, src54_r, src65_r);
3044 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3045 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3046 src4332, src6554, src8776);
3050 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3052 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3054 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3056 LD_SB2(src0_ptr, src_stride, src9, src2);
3057 src0_ptr += (2 * src_stride);
3058 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3059 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3060 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3062 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3065 dst10, dst32, dst54, dst76, 7,
3066 dst10, dst32, dst54, dst76);
3068 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3069 ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3070 dst += (8 * dst_stride);
3085 dst, dst_stride, filter, height);
3086 }
else if (4 == height) {
3088 dst, dst_stride, filter, height);
3091 src1_ptr, src2_stride,
3092 dst, dst_stride, filter, height);
3105 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3106 v8i16 in0, in1, in2, in3;
3107 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3108 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3110 v8i16 filter_vec, const_vec;
3112 src0_ptr -= src_stride;
3114 const_vec = __msa_ldi_h(128);
3117 filter_vec =
LD_SH(filter);
3120 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3121 src0_ptr += (3 * src_stride);
3122 LD_SB2(src0_ptr, src_stride, src3, src4);
3123 src0_ptr += (2 * src_stride);
3124 LD_SB2(src0_ptr, src_stride, src5, src6);
3125 src0_ptr += (2 * src_stride);
3126 LD_SB2(src0_ptr, src_stride, src7, src8);
3127 src0_ptr += (2 * src_stride);
3128 LD_SB2(src0_ptr, src_stride, src9, src10);
3129 src0_ptr += (2 * src_stride);
3131 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3132 src1_ptr += (4 * src2_stride);
3140 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3141 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3144 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3146 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3148 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3151 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3153 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3156 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3157 dst0_r, dst1_r, dst2_r, dst3_r);
3159 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3160 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3161 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3162 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3163 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3164 dst += (4 * dst_stride);
3166 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3167 src1_ptr += (4 * src2_stride);
3168 ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
3171 DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3173 DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3175 ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
3178 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3180 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3183 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3184 dst0_r, dst1_r, dst2_r, dst3_r);
3186 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3187 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3188 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3189 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3190 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3191 dst += (4 * dst_stride);
3203 v16i8
src0,
src1, src2, src3, src4;
3204 v8i16 in0, in1, dst0_r, dst1_r;
3205 v16i8 src10_r, src32_r, src21_r, src43_r;
3207 v8i16 filter_vec, const_vec;
3209 src0_ptr -= src_stride;
3211 const_vec = __msa_ldi_h(128);
3214 filter_vec =
LD_SH(filter);
3217 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3218 src0_ptr += (3 * src_stride);
3220 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3222 LD_SB2(src0_ptr, src_stride, src3, src4);
3223 LD_SH2(src1_ptr, src2_stride, in0, in1);
3225 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3228 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3230 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3233 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3235 ST_D2(dst0_r, 0, 1, dst, dst_stride);
3247 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
3248 v8i16 in0, in1, in2, in3, in4, in5;
3249 v16i8 src10_r, src32_r, src54_r, src76_r;
3250 v16i8 src21_r, src43_r, src65_r, src87_r;
3251 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3253 v8i16 filter_vec, const_vec;
3255 src0_ptr -= src_stride;
3257 const_vec = __msa_ldi_h(128);
3260 filter_vec =
LD_SH(filter);
3263 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3264 src0_ptr += (3 * src_stride);
3266 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3268 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3269 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3271 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3272 src32_r, src43_r, src54_r, src65_r);
3273 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3276 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3278 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3280 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3282 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3284 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3286 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
3288 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3289 dst0_r, dst1_r, dst2_r, dst3_r);
3292 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3293 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3294 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3295 ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
3308 v16i8
src0,
src1, src2, src3, src4, src5;
3309 v8i16 in0, in1, in2, in3;
3310 v16i8 src10_r, src32_r, src21_r, src43_r;
3311 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3313 v8i16 filter_vec, const_vec;
3315 src0_ptr -= src_stride;
3317 const_vec = __msa_ldi_h(128);
3320 filter_vec =
LD_SH(filter);
3323 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3324 src0_ptr += (3 * src_stride);
3326 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3328 for (loop_cnt = (height >> 2); loop_cnt--;) {
3329 LD_SB2(src0_ptr, src_stride, src3, src4);
3330 src0_ptr += (2 * src_stride);
3331 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3332 src1_ptr += (4 * src2_stride);
3334 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3337 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3339 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3341 LD_SB2(src0_ptr, src_stride, src5, src2);
3342 src0_ptr += (2 * src_stride);
3344 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3347 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3349 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3351 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3352 dst0_r, dst1_r, dst2_r, dst3_r);
3354 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3355 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3356 dst += (4 * dst_stride);
3371 dst, dst_stride, filter, height);
3372 }
else if (6 == height) {
3374 dst, dst_stride, filter, height);
3377 src1_ptr, src2_stride,
3378 dst, dst_stride, filter, height);
3392 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3393 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3394 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3395 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3396 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3397 v16i8 src2110, src4332, src6554;
3398 v8i16 dst0_l, dst1_l, filt0, filt1;
3399 v8i16 filter_vec, const_vec;
3401 src0_ptr -= (1 * src_stride);
3403 const_vec = __msa_ldi_h(128);
3406 filter_vec =
LD_SH(filter);
3409 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3410 src0_ptr += (3 * src_stride);
3412 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3413 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3414 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3416 for (loop_cnt = (height >> 2); loop_cnt--;) {
3417 LD_SB2(src0_ptr, src_stride, src3, src4);
3418 src0_ptr += (2 * src_stride);
3419 LD_SB2(src0_ptr, src_stride, src5, src6);
3420 src0_ptr += (2 * src_stride);
3421 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3422 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3423 src1_ptr += (4 * src2_stride);
3428 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3429 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3430 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3431 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3432 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
3433 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3436 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3438 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3440 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
3442 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3444 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3446 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
3448 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3449 dst0_r, dst1_r, dst2_r, dst3_r);
3452 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3453 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3454 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3455 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
3456 dst += (4 * dst_stride);
3475 v16i8
src0,
src1, src2, src3, src4, src5;
3476 v8i16 in0, in1, in2, in3;
3477 v16i8 src10_r, src32_r, src21_r, src43_r;
3478 v16i8 src10_l, src32_l, src21_l, src43_l;
3479 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3481 v8i16 filter_vec, const_vec;
3483 src0_ptr -= src_stride;
3485 const_vec = __msa_ldi_h(128);
3488 filter_vec =
LD_SH(filter);
3491 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3492 src0_ptr += (3 * src_stride);
3494 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3495 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3497 for (loop_cnt = (height >> 2); loop_cnt--;) {
3498 LD_SB2(src0_ptr, src_stride, src3, src4);
3499 src0_ptr += (2 * src_stride);
3500 LD_SH2(src1_ptr, src2_stride, in0, in1);
3501 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3502 src1_ptr += (2 * src2_stride);
3504 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3505 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3508 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3510 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3512 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3514 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3516 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3517 dst0_r, dst1_r, dst0_l, dst1_l);
3519 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3520 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3521 dst += (2 * dst_stride);
3523 LD_SB2(src0_ptr, src_stride, src5, src2);
3524 src0_ptr += (2 * src_stride);
3525 LD_SH2(src1_ptr, src2_stride, in0, in1);
3526 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3527 src1_ptr += (2 * src2_stride);
3529 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3530 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3533 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3535 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3537 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3539 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3541 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3542 dst0_r, dst1_r, dst0_l, dst1_l);
3544 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3545 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3546 dst += (2 * dst_stride);
3560 v16i8
src0,
src1, src2, src3, src4, src5;
3561 v16i8 src6, src7, src8, src9, src10, src11;
3562 v8i16 in0, in1, in2, in3, in4, in5;
3563 v16i8 src10_r, src32_r, src76_r, src98_r;
3564 v16i8 src21_r, src43_r, src87_r, src109_r;
3565 v16i8 src10_l, src32_l, src21_l, src43_l;
3566 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3567 v8i16 dst0_l, dst1_l;
3569 v8i16 filter_vec, const_vec;
3571 src0_ptr -= src_stride;
3573 const_vec = __msa_ldi_h(128);
3576 filter_vec =
LD_SH(filter);
3580 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3582 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3583 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3585 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3586 src0_ptr += (3 * src_stride);
3588 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3590 for (loop_cnt = (height >> 2); loop_cnt--;) {
3592 LD_SB2(src0_ptr, src_stride, src3, src4);
3593 LD_SH2(src1_ptr, src2_stride, in0, in1);
3594 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3595 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3596 src1_ptr += (2 * src2_stride);
3598 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3599 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3601 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3602 src0_ptr += (2 * src_stride);
3604 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3607 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3609 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3611 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3613 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3616 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3618 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3621 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3622 dst0_r, dst1_r, dst0_l, dst1_l);
3626 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3627 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3628 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3629 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3630 dst += (2 * dst_stride);
3633 LD_SB2(src0_ptr, src_stride, src5, src2);
3634 LD_SH2(src1_ptr, src2_stride, in0, in1);
3635 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3636 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3637 src1_ptr += (2 * src2_stride);
3639 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3640 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3642 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3643 src0_ptr += (2 * src_stride);
3645 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3648 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3650 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3652 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3654 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3657 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3659 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3662 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3663 dst0_r, dst1_r, dst0_l, dst1_l);
3666 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3667 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3668 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3669 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3670 dst += (2 * dst_stride);
3685 v16i8
src0,
src1, src2, src3, src4, src6, src7, src8, src9, src10;
3686 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3687 v16i8 src10_r, src32_r, src76_r, src98_r;
3688 v16i8 src21_r, src43_r, src87_r, src109_r;
3689 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3690 v16i8 src10_l, src32_l, src76_l, src98_l;
3691 v16i8 src21_l, src43_l, src87_l, src109_l;
3692 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3694 v8i16 filter_vec, const_vec;
3696 src0_ptr -= src_stride;
3698 const_vec = __msa_ldi_h(128);
3701 filter_vec =
LD_SH(filter);
3705 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3707 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3708 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3711 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3712 src0_ptr += (3 * src_stride);
3714 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3715 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3717 for (loop_cnt = (height >> 1); loop_cnt--;) {
3719 LD_SB2(src0_ptr, src_stride, src3, src4);
3720 LD_SH2(src1_ptr, src2_stride, in0, in1);
3721 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3722 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3723 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3724 src1_ptr += (2 * src2_stride);
3726 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3727 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3730 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3732 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3734 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3736 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3739 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3740 dst0_r, dst1_r, dst0_l, dst1_l);
3748 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3749 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3750 dst += (2 * dst_stride);
3753 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3754 src0_ptr += (2 * src_stride);
3756 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3760 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3762 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3764 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3766 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3769 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3770 dst2_r, dst3_r, dst2_l, dst3_l);
3772 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3773 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3774 dst_tmp += (2 * dst_stride);
3790 const int8_t *filter_x,
3791 const int8_t *filter_y)
3796 v16i8
src0,
src1, src2, src3, src4;
3798 v8i16 filt_h0, filt_h1;
3801 v8i16 filter_vec, const_vec;
3802 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3803 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43,
tmp;
3806 src0_ptr -= (src_stride + 1);
3808 filter_vec =
LD_SH(filter_x);
3811 filter_vec =
LD_SH(filter_y);
3818 const_vec = __msa_ldi_h(128);
3821 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
3824 LD2(src1_ptr, src2_stride, tp0, tp1);
3826 in0 = __msa_adds_s_h(in0, const_vec);
3828 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3829 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3830 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3843 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3844 tmp = __msa_adds_s_h(tmp, in0);
3845 tmp = __msa_srari_h(tmp, 7);
3847 out = (v16u8) __msa_pckev_b((v16i8)
tmp, (v16i8) tmp);
3848 ST_W2(out, 0, 1, dst, dst_stride);
3857 const int8_t *filter_x,
3858 const int8_t *filter_y)
3862 v16i8
src0,
src1, src2, src3, src4, src5, src6;
3864 v8i16 filt_h0, filt_h1;
3867 v8i16 filter_vec, const_vec;
3868 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3870 v8i16 in0 = { 0 }, in1 = { 0 };
3871 v8i16 dst30, dst41, dst52, dst63;
3872 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3873 v4i32 dst0, dst1, dst2, dst3;
3875 src0_ptr -= (src_stride + 1);
3877 filter_vec =
LD_SH(filter_x);
3880 filter_vec =
LD_SH(filter_y);
3887 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
3890 const_vec = __msa_ldi_h(128);
3893 LD2(src1_ptr, src2_stride, tp0, tp1);
3894 src1_ptr += 2 * src2_stride;
3896 LD2(src1_ptr, src2_stride, tp0, tp1);
3899 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
3901 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3902 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3903 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3904 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3918 SRA_4V(dst0, dst1, dst2, dst3, 6);
3923 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3924 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3933 const int8_t *filter_x,
3934 const int8_t *filter_y,
3940 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3942 v8i16 filt_h0, filt_h1;
3945 v8i16 filter_vec, const_vec;
3946 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3947 v8i16 tmp0, tmp1, tmp2, tmp3;
3948 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3949 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3950 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3951 v8i16 dst98_r, dst109_r;
3952 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
3953 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3955 src0_ptr -= (src_stride + 1);
3957 filter_vec =
LD_SH(filter_x);
3960 filter_vec =
LD_SH(filter_y);
3967 const_vec = __msa_ldi_h(128);
3970 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3971 src0_ptr += (3 * src_stride);
3974 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3975 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3979 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3982 for (loop_cnt = height >> 3; loop_cnt--;) {
3983 LD_SB8(src0_ptr, src_stride,
3984 src3, src4, src5, src6, src7, src8, src9, src10);
3985 src0_ptr += (8 * src_stride);
3987 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3988 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3989 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3990 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3997 dst32_r = __msa_ilvr_h(dst73, dst22);
4001 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4002 dst76_r = __msa_ilvr_h(dst22, dst106);
4004 LD2(src1_ptr, src2_stride, tp0, tp1);
4005 src1_ptr += 2 * src2_stride;
4007 LD2(src1_ptr, src2_stride, tp0, tp1);
4008 src1_ptr += 2 * src2_stride;
4011 LD2(src1_ptr, src2_stride, tp0, tp1);
4012 src1_ptr += 2 * src2_stride;
4014 LD2(src1_ptr, src2_stride, tp0, tp1);
4015 src1_ptr += 2 * src2_stride;
4018 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4019 const_vec, in0, in1, in2, in3);
4028 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4029 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4031 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4032 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
4037 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4038 dst += (8 * dst_stride);
4042 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4052 const int8_t *filter_x,
4053 const int8_t *filter_y,
4058 dst, dst_stride, filter_x, filter_y);
4059 }
else if (4 == height) {
4061 dst, dst_stride, filter_x, filter_y);
4062 }
else if (0 == (height % 8)) {
4064 src1_ptr, src2_stride,
4066 filter_x, filter_y, height);
4076 const int8_t *filter_x,
4077 const int8_t *filter_y,
4080 uint32_t tpw0, tpw1, tpw2, tpw3;
4082 v16u8 out0, out1, out2;
4083 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4084 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4086 v8i16 filt_h0, filt_h1;
4089 v8i16 filter_vec, const_vec;
4090 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4091 v8i16 dsth10, tmp4, tmp5;
4092 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4093 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4094 v8i16 tmp0, tmp1, tmp2, tmp3;
4095 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4096 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4097 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
4098 v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
4099 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4100 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4101 v8i16 in4 = { 0 }, in5 = { 0 };
4103 src0_ptr -= (src_stride + 1);
4105 filter_vec =
LD_SH(filter_x);
4108 filter_vec =
LD_SH(filter_y);
4115 const_vec = __msa_ldi_h(128);
4118 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4119 src0_ptr += (3 * src_stride);
4122 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4123 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4124 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4133 LD_SB8(src0_ptr, src_stride,
4134 src3, src4, src5, src6, src7, src8, src9, src10);
4137 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4138 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4139 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4140 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4147 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4148 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4149 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4150 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4165 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4166 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4167 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4180 dst3_l =
HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4181 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4182 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4183 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4184 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4185 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4186 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
4188 LD2(src1_ptr, src2_stride, tp0, tp1);
4190 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4193 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4195 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4198 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4199 in0, in1, in2, in3);
4200 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
4205 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4207 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4208 src1_ptr += (4 * src2_stride);
4210 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4212 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4216 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4217 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
4226 const int8_t *filter_x,
4227 const int8_t *filter_y)
4230 v16i8
src0,
src1, src2, src3, src4;
4232 v8i16 filt_h0, filt_h1;
4235 v8i16 filter_vec, const_vec;
4236 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4237 v8i16 dst0, dst1, dst2, dst3, dst4;
4238 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4239 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4240 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4244 src0_ptr -= (src_stride + 1);
4246 filter_vec =
LD_SH(filter_x);
4249 filter_vec =
LD_SH(filter_y);
4256 const_vec = __msa_ldi_h(128);
4259 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4262 LD_SH2(src1_ptr, src2_stride, in0, in1);
4263 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
4265 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4266 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4267 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4268 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4269 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4285 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4286 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4290 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4291 ST_D2(out, 0, 1, dst, dst_stride);
4300 const int8_t *filter_x,
4301 const int8_t *filter_y,
4306 v16i8
src0,
src1, src2, src3, src4, src5, src6, mask0, mask1;
4307 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4308 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4309 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4310 v8i16 in0, in1, in2, in3;
4311 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4312 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4313 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4315 src0_ptr -= (src_stride + 1);
4317 filter_vec =
LD_SH(filter_x);
4320 filter_vec =
LD_SH(filter_y);
4328 const_vec = __msa_ldi_h(128);
4331 for (cnt = width8mult; cnt--;) {
4332 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4336 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4338 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4339 const_vec, in0, in1, in2, in3);
4341 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4342 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4343 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4352 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4353 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4354 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4355 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4376 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4377 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4378 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4379 dst3_r, tmp0, tmp1, tmp2, tmp3);
4380 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4381 tmp0, tmp1, tmp2, tmp3);
4385 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4396 const int8_t *filter_x,
4397 const int8_t *filter_y)
4399 v16u8 out0, out1, out2;
4400 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8;
4401 v8i16 in0, in1, in2, in3, in4, in5;
4403 v8i16 filt_h0, filt_h1;
4406 v8i16 filter_vec, const_vec;
4407 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4408 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4409 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4410 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4411 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4412 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4413 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4414 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4415 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4416 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4418 src0_ptr -= (src_stride + 1);
4420 filter_vec =
LD_SH(filter_x);
4423 filter_vec =
LD_SH(filter_y);
4430 const_vec = __msa_ldi_h(128);
4433 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4434 src0_ptr += (5 * src_stride);
4435 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
4440 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4441 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4442 in0, in1, in2, in3);
4443 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4445 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4446 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4447 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4448 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4449 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4450 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4451 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4452 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4453 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4487 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4488 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4489 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4490 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4491 tmp0, tmp1, tmp2, tmp3);
4492 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
4493 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4494 tmp0, tmp1, tmp2, tmp3);
4501 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4502 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4503 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
4512 const int8_t *filter_x,
4513 const int8_t *filter_y,
4517 uint32_t loop_cnt, cnt;
4519 int16_t *src1_ptr_tmp;
4522 v16i8
src0,
src1, src2, src3, src4, src5, src6;
4523 v8i16 in0, in1, in2, in3;
4525 v8i16 filt_h0, filt_h1;
4528 v8i16 filter_vec, const_vec;
4529 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4530 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4531 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4532 v8i16 tmp0, tmp1, tmp2, tmp3;
4533 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4534 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4535 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
4537 src0_ptr -= (src_stride + 1);
4539 filter_vec =
LD_SH(filter_x);
4542 filter_vec =
LD_SH(filter_y);
4549 const_vec = __msa_ldi_h(128);
4552 for (cnt = width >> 3; cnt--;) {
4553 src0_ptr_tmp = src0_ptr;
4555 src1_ptr_tmp = src1_ptr;
4557 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4558 src0_ptr_tmp += (3 * src_stride);
4561 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4562 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4563 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4572 for (loop_cnt = height >> 2; loop_cnt--;) {
4573 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4574 src0_ptr_tmp += (4 * src_stride);
4575 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4576 src1_ptr_tmp += (4 * src2_stride);
4579 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4580 const_vec, in0, in1, in2, in3);
4582 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4583 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4584 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4585 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4606 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4607 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4608 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4609 dst3_r, tmp0, tmp1, tmp2, tmp3);
4610 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4611 tmp0, tmp1, tmp2, tmp3);
4615 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4616 dst_tmp += (4 * dst_stride);
4637 const int8_t *filter_x,
4638 const int8_t *filter_y,
4643 dst, dst_stride, filter_x, filter_y);
4644 }
else if (4 == height) {
4646 dst, dst_stride, filter_x, filter_y, 1);
4647 }
else if (6 == height) {
4649 dst, dst_stride, filter_x, filter_y);
4652 src1_ptr, src2_stride,
4654 filter_x, filter_y, height, 8);
4664 const int8_t *filter_x,
4665 const int8_t *filter_y,
4670 uint8_t *src0_ptr_tmp, *dst_tmp;
4671 int16_t *src1_ptr_tmp;
4673 v16i8
src0,
src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4674 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4675 v16i8 mask0, mask1, mask2, mask3;
4676 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4677 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
4678 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4679 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
4680 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4681 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4682 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4683 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4684 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4686 src0_ptr -= (src_stride + 1);
4688 filter_vec =
LD_SH(filter_x);
4691 filter_vec =
LD_SH(filter_y);
4699 const_vec = __msa_ldi_h(128);
4702 src0_ptr_tmp = src0_ptr;
4704 src1_ptr_tmp = src1_ptr;
4706 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4707 src0_ptr_tmp += (3 * src_stride);
4711 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4712 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4713 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4722 for (loop_cnt = 4; loop_cnt--;) {
4723 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4724 src0_ptr_tmp += (4 * src_stride);
4727 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4728 src1_ptr_tmp += (4 * src2_stride);
4729 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4730 const_vec, in0, in1, in2, in3);
4732 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4733 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4734 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4735 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4756 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4757 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4758 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4759 dst3_r, tmp0, tmp1, tmp2, tmp3);
4760 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4761 tmp0, tmp1, tmp2, tmp3);
4765 ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4766 dst_tmp += (4 * dst_stride);
4782 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4783 src0_ptr += (3 * src_stride);
4785 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4786 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4792 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4794 for (loop_cnt = 2; loop_cnt--;) {
4795 LD_SB8(src0_ptr, src_stride,
4796 src3, src4, src5, src6, src7, src8, src9, src10);
4797 src0_ptr += (8 * src_stride);
4799 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4800 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4801 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4802 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4809 dst32_r = __msa_ilvr_h(dst73, dst22);
4813 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4814 dst76_r = __msa_ilvr_h(dst22, dst106);
4816 LD2(src1_ptr, src2_stride, tp0, tp1);
4817 src1_ptr += 2 * src2_stride;
4819 LD2(src1_ptr, src2_stride, tp0, tp1);
4820 src1_ptr += 2 * src2_stride;
4823 LD2(src1_ptr, src2_stride, tp0, tp1);
4824 src1_ptr += 2 * src2_stride;
4826 LD2(src1_ptr, src2_stride, tp0, tp1);
4827 src1_ptr += 2 * src2_stride;
4830 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4831 const_vec, in0, in1, in2, in3);
4842 SRA_4V(dst0, dst1, dst2, dst3, 6);
4843 SRA_4V(dst4, dst5, dst6, dst7, 6);
4844 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4845 tmp0, tmp1, tmp2, tmp3);
4846 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4847 tmp0, tmp1, tmp2, tmp3);
4851 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4852 dst += (8 * dst_stride);
4856 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
4866 const int8_t *filter_x,
4867 const int8_t *filter_y,
4872 dst, dst_stride, filter_x, filter_y, 2);
4875 src2_stride, dst, dst_stride, filter_x,
4876 filter_y, height, 16);
4886 const int8_t *filter_x,
4887 const int8_t *filter_y,
4891 dst, dst_stride, filter_x, filter_y,
4901 const int8_t *filter_x,
4902 const int8_t *filter_y,
4906 dst, dst_stride, filter_x, filter_y,
4910 #define BI_MC_COPY(WIDTH) \ 4911 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \ 4912 ptrdiff_t dst_stride, \ 4914 ptrdiff_t src_stride, \ 4915 int16_t *src_16bit, \ 4921 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \ 4922 dst, dst_stride, height); \ 4937 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \ 4938 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \ 4939 ptrdiff_t dst_stride, \ 4941 ptrdiff_t src_stride, \ 4942 int16_t *src_16bit, \ 4948 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \ 4950 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 4951 MAX_PB_SIZE, dst, dst_stride, \ 4955 BI_MC(qpel,
h, 4, 8, hz, mx);
/* Instantiate ff_hevc_put_hevc_bi_qpel_h{8..64}_8_msa: 8-tap horizontal
 * quarter-pel bi-prediction MC wrappers; BI_MC picks the filter via
 * ff_hevc_qpel_filters[mx - 1] and dispatches to hevc_hz_bi_8t_*w_msa.
 * NOTE(review): the leading "49xx" tokens are extraction residue (original
 * file line numbers fused into the text), not valid C — preserved as-is. */
4956 BI_MC(qpel,
h, 8, 8, hz, mx);
4957 BI_MC(qpel,
h, 12, 8, hz, mx);
4958 BI_MC(qpel,
h, 16, 8, hz, mx);
4959 BI_MC(qpel,
h, 24, 8, hz, mx);
4960 BI_MC(qpel,
h, 32, 8, hz, mx);
4961 BI_MC(qpel,
h, 48, 8, hz, mx);
4962 BI_MC(qpel,
h, 64, 8, hz, mx);
/* Instantiate ff_hevc_put_hevc_bi_qpel_v{4..64}_8_msa: 8-tap vertical
 * quarter-pel bi-prediction MC wrappers; filter selected by my and
 * dispatched to hevc_vt_bi_8t_*w_msa. */
4964 BI_MC(qpel, v, 4, 8, vt, my);
4965 BI_MC(qpel, v, 8, 8, vt, my);
4966 BI_MC(qpel, v, 12, 8, vt, my);
4967 BI_MC(qpel, v, 16, 8, vt, my);
4968 BI_MC(qpel, v, 24, 8, vt, my);
4969 BI_MC(qpel, v, 32, 8, vt, my);
4970 BI_MC(qpel, v, 48, 8, vt, my);
4971 BI_MC(qpel, v, 64, 8, vt, my);
/* Instantiate ff_hevc_put_hevc_bi_epel_h{4..32}_8_msa: 4-tap horizontal
 * eighth-pel (chroma) bi-prediction MC wrappers; BI_MC picks the filter via
 * ff_hevc_epel_filters[mx - 1] and dispatches to hevc_hz_bi_4t_*w_msa.
 * NOTE(review): the leading "49xx" tokens are extraction residue (original
 * file line numbers fused into the text), not valid C — preserved as-is. */
4973 BI_MC(epel,
h, 4, 4, hz, mx);
4974 BI_MC(epel,
h, 8, 4, hz, mx);
4975 BI_MC(epel,
h, 6, 4, hz, mx);
4976 BI_MC(epel,
h, 12, 4, hz, mx);
4977 BI_MC(epel,
h, 16, 4, hz, mx);
4978 BI_MC(epel,
h, 24, 4, hz, mx);
4979 BI_MC(epel,
h, 32, 4, hz, mx);
/* Instantiate ff_hevc_put_hevc_bi_epel_v{4..32}_8_msa: 4-tap vertical
 * chroma bi-prediction MC wrappers; filter selected by my and dispatched
 * to hevc_vt_bi_4t_*w_msa. */
4981 BI_MC(epel, v, 4, 4, vt, my);
4982 BI_MC(epel, v, 8, 4, vt, my);
4983 BI_MC(epel, v, 6, 4, vt, my);
4984 BI_MC(epel, v, 12, 4, vt, my);
4985 BI_MC(epel, v, 16, 4, vt, my);
4986 BI_MC(epel, v, 24, 4, vt, my);
4987 BI_MC(epel, v, 32, 4, vt, my);
4991 #define BI_MC_HV(PEL, WIDTH, TAP) \ 4992 void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \ 4993 ptrdiff_t dst_stride, \ 4995 ptrdiff_t src_stride, \ 4996 int16_t *src_16bit, \ 5002 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \ 5003 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \ 5005 hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \ 5006 MAX_PB_SIZE, dst, dst_stride, \ 5007 filter_x, filter_y, height); \
static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)
#define XORI_B5_128_SB(...)
#define XORI_B8_128_SB(...)
static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define ST_D2(in, idx0, idx1, pdst, stride)
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
#define XORI_B2_128_SB(...)
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B3_128_SB(...)
static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define UNPCK_R_SB_SH(in, out)
static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define DPADD_SB4_SH(...)
#define SPLATI_H2_SH(...)
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height, int32_t width)
#define SRA_4V(in0, in1, in2, in3, shift)
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t width8mult)
static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define LD4(psrc, stride, out0, out1, out2, out3)
#define BI_MC_HV(PEL, WIDTH, TAP)
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SPLATI_W2_SH(...)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
#define CLIP_SH_0_255(in)
#define SPLATI_H4_SH(...)
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static const uint8_t ff_hevc_mask_arr[16 *2]
#define ST_D1(in, idx, pdst)
static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ST_H2(in, idx0, idx1, pdst, stride)
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define CLIP_SH2_0_255(in0, in1)
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,idx6, idx7, pdst, stride)
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,vec0, vec1, vec2, vec3, rnd_val,out0, out1, out2, out3)
static int aligned(int val)
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter needs to output only at least one frame per link. The additional frames can be left buffered in the filter.
static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B7_128_SB(...)
#define LW2(psrc, stride, out0, out1)
static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define XORI_B4_128_SB(...)
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define SPLATI_W4_SH(...)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define DPADD_SB2_SH(...)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define INSERT_D2_SH(...)
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,idx4, idx5, idx6, idx7, pdst, stride)
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)
#define INSERT_W4_SH(...)
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
#define HEVC_FILT_8TAP(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define BI_MC_COPY(WIDTH)
static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,vec3,rnd_val, out0, out1, out2, out3)
#define HEVC_FILT_8TAP_SH(in0, in1, in2, in3,filt0, filt1, filt2, filt3)
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define INSERT_W2_SB(...)
#define SLLI_4V(in0, in1, in2, in3, shift)
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_FILT_4TAP(in0, in1, filt0, filt1)
#define LW4(psrc, stride, out0, out1, out2, out3)
static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define INSERT_D2_SB(...)
static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define XORI_B6_128_SB(...)
static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)
#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,out0, out1)
static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height)
#define ST_W2(in, idx0, idx1, pdst, stride)
#define SLLI_2V(in0, in1, shift)
#define INSERT_W4_SB(...)
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y)
#define LD2(psrc, stride, out0, out1)
#define HEVC_FILT_4TAP_SH(in0, in1, filt0, filt1)
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter_x, const int8_t *filter_y, int32_t height, int32_t width)
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, int32_t height)
static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr, int32_t src_stride, int16_t *src1_ptr, int32_t src2_stride, uint8_t *dst, int32_t dst_stride, const int8_t *filter, int32_t height)