25 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
26 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
33 uint32_t coeff0, uint32_t coeff1)
40 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
41 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
42 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
49 res_r = __msa_dotp_u_h((v16u8)
src0, coeff_vec);
51 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
52 res_r = __msa_sat_u_h(res_r, 7);
53 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
55 out0 = __msa_copy_u_h(res, 0);
56 out1 = __msa_copy_u_h(res, 2);
64 uint32_t coeff0, uint32_t coeff1)
70 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
71 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
72 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
82 res_r = __msa_dotp_u_h(
src0, coeff_vec);
84 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
85 res_r = __msa_sat_u_h(res_r, 7);
86 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
92 uint32_t coeff0, uint32_t coeff1,
103 uint32_t coeff0, uint32_t coeff1)
109 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
110 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
111 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
118 res_r = __msa_dotp_u_h((v16u8)
src0, coeff_vec);
120 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
121 res_r = __msa_sat_u_h(res_r, 7);
122 res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
128 uint32_t coeff0, uint32_t coeff1)
131 v8u16 res0_r, res1_r;
133 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
134 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
135 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
146 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
151 uint32_t coeff0, uint32_t coeff1)
153 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, out0, out1;
155 v8u16 res0, res1, res2, res3;
156 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
157 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
158 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
166 DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
167 SLLI_4V(res0, res1, res2, res3, 3);
171 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
175 uint32_t coeff0, uint32_t coeff1,
188 uint32_t coeff0, uint32_t coeff1)
191 v8u16 res0, res1, res2, res3;
193 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
194 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
195 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
202 coeff_vec, res0, res1, res2, res3);
203 SLLI_4V(res0, res1, res2, res3, 3);
211 uint32_t coeff0, uint32_t coeff1)
214 v16u8 out0, out1, out2, out3;
215 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
217 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
218 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
219 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
229 coeff_vec, res0, res1, res2, res3);
230 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
231 coeff_vec, res4, res5, res6, res7);
232 SLLI_4V(res0, res1, res2, res3, 3);
233 SLLI_4V(res4, res5, res6, res7, 3);
240 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
249 v8u16 res0, res1, res2, res3;
251 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
252 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
253 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
257 for (row =
height >> 2; row--;) {
264 coeff_vec, res0, res1, res2, res3);
265 SLLI_4V(res0, res1, res2, res3, 3);
274 for (row = (
height % 4); row--;) {
280 res0 = __msa_dotp_u_h(
src0, coeff_vec);
282 res0 = (v8u16) __msa_srari_h((v8i16) res0, 6);
283 res0 = __msa_sat_u_h(res0, 7);
284 res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
293 uint32_t coeff0, uint32_t coeff1,
306 uint32_t coeff0, uint32_t coeff1)
313 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
314 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
315 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
321 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
323 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
325 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
326 res_r = __msa_sat_u_h(res_r, 7);
327 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
329 out0 = __msa_copy_u_h(res, 0);
330 out1 = __msa_copy_u_h(res, 2);
338 uint32_t coeff0, uint32_t coeff1)
341 v16u8 tmp0, tmp1, tmp2, tmp3;
344 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
345 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
346 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
350 tmp0, tmp1, tmp2, tmp3);
351 ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
353 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
355 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
357 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
358 res_r = __msa_sat_u_h(res_r, 7);
360 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
366 uint32_t coeff0, uint32_t coeff1,
377 uint32_t coeff0, uint32_t coeff1)
383 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
384 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
385 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
390 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
391 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
393 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
394 res_r = __msa_sat_u_h(res_r, 7);
395 res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
401 uint32_t coeff0, uint32_t coeff1)
404 v16u8 tmp0, tmp1, tmp2, tmp3;
406 v8u16 res0_r, res1_r;
407 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
408 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
409 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
412 ILVR_B4_UB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3, tmp0, tmp1, tmp2,
414 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
415 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
420 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
425 uint32_t coeff0, uint32_t coeff1)
427 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
428 v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
429 v8u16 res0, res1, res2, res3;
430 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
431 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
432 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
437 ILVR_B4_UB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3, tmp0, tmp1, tmp2,
439 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
441 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
442 ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
443 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
444 DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
445 SLLI_4V(res0, res1, res2, res3, 3);
449 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
453 uint32_t coeff0, uint32_t coeff1,
466 uint32_t coeff0, uint32_t coeff1)
469 v8u16 res0, res1, res2, res3;
470 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
471 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
472 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
475 ILVR_B4_UB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3,
src0,
src1,
src2,
478 coeff_vec, res0, res1, res2, res3);
479 SLLI_4V(res0, res1, res2, res3, 3);
487 uint32_t coeff0, uint32_t coeff1)
489 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
490 v16u8 out0, out1, out2, out3;
491 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
492 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
493 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
494 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
499 ILVR_B4_UB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3,
src0,
src1,
src2,
501 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, src4, src5, src6,
504 coeff_vec, res0, res1, res2, res3);
505 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
506 coeff_vec, res4, res5, res6, res7);
507 SLLI_4V(res0, res1, res2, res3, 3);
508 SLLI_4V(res4, res5, res6, res7, 3);
515 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
519 uint32_t coeff0, uint32_t coeff1,
530 uint32_t coef_hor0, uint32_t coef_hor1,
531 uint32_t coef_ver0, uint32_t coef_ver1)
535 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
538 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
539 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
540 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
541 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
542 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
549 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
552 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
553 res_vt0 = __msa_sat_u_h(res_vt0, 7);
554 res_vert = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
556 out0 = __msa_copy_u_h(res_vert, 0);
557 out1 = __msa_copy_u_h(res_vert, 1);
565 uint32_t coef_hor0, uint32_t coef_hor1,
566 uint32_t coef_ver0, uint32_t coef_ver1)
569 v16u8 tmp0, tmp1, tmp2, tmp3;
570 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
573 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
574 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
575 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
576 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
577 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
587 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
590 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
591 res_vt0 = __msa_sat_u_h(res_vt0, 7);
593 res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
599 uint32_t coef_hor0, uint32_t coef_hor1,
600 uint32_t coef_ver0, uint32_t coef_ver1,
613 uint32_t coef_hor0, uint32_t coef_hor1,
614 uint32_t coef_ver0, uint32_t coef_ver1)
617 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
620 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
621 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
622 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
623 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
624 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
630 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
633 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
634 res_vt0 = __msa_sat_u_h(res_vt0, 7);
635 res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
641 uint32_t coef_hor0, uint32_t coef_hor1,
642 uint32_t coef_ver0, uint32_t coef_ver1)
645 v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
648 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
649 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
650 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
651 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
652 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
661 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
663 MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
664 res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
665 ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
668 PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
674 uint32_t coef_hor0, uint32_t coef_hor1,
675 uint32_t coef_ver0, uint32_t coef_ver1)
677 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, res0, res1;
678 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
679 v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
681 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
682 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
683 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
684 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
685 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
698 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
699 DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
700 coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
701 MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
702 res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
703 MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
704 res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
705 ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
706 ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
707 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
708 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
709 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
710 ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
714 uint32_t coef_hor0, uint32_t coef_hor1,
715 uint32_t coef_ver0, uint32_t coef_ver1,
731 uint32_t coef_hor0, uint32_t coef_hor1,
732 uint32_t coef_ver0, uint32_t coef_ver1)
735 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
736 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
738 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
739 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
740 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
741 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
742 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
750 res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
758 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
759 MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
760 res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
762 res_vt0 += (res_hz0 * coeff_vt_vec1);
763 res_vt1 += (res_hz1 * coeff_vt_vec1);
764 res_vt2 += (res_hz2 * coeff_vt_vec1);
765 res_vt3 += (res_hz3 * coeff_vt_vec1);
767 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
768 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
769 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
774 uint32_t coef_hor0, uint32_t coef_hor1,
775 uint32_t coef_ver0, uint32_t coef_ver1)
777 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
778 v16u8 out0, out1, out2, out3;
779 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
780 v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
781 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
782 v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
784 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
785 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
786 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
787 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
788 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
800 res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
802 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
804 DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
805 coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
806 MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
807 coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
809 MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
810 coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
812 res_vt0 += (res_hz0 * coeff_vt_vec1);
813 res_vt1 += (res_hz1 * coeff_vt_vec1);
814 res_vt2 += (res_hz2 * coeff_vt_vec1);
815 res_vt3 += (res_hz3 * coeff_vt_vec1);
816 res_vt4 += (res_hz4 * coeff_vt_vec1);
817 res_vt5 += (res_hz5 * coeff_vt_vec1);
818 res_vt6 += (res_hz6 * coeff_vt_vec1);
819 res_vt7 += (res_hz7 * coeff_vt_vec1);
820 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
821 SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
822 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
823 SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
824 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
825 PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
826 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
830 uint32_t coef_hor0, uint32_t coef_hor1,
831 uint32_t coef_ver0, uint32_t coef_ver1,
849 v16u8 dst_data = { 0 };
853 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
854 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
855 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
864 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
865 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
869 res_r = __msa_dotp_u_h((v16u8)
src0, coeff_vec);
871 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
872 res_r = __msa_sat_u_h(res_r, 7);
874 res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
875 dst_data = __msa_aver_u_b(res, dst_data);
877 out0 = __msa_copy_u_h((v8i16) dst_data, 0);
878 out1 = __msa_copy_u_h((v8i16) dst_data, 2);
889 uint16_t tp0, tp1, tp2, tp3;
891 v16u8 dst0, dst_data = { 0 };
894 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
895 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
896 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
905 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
906 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
907 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
908 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
912 src0 = (v16u8) __msa_ilvr_d((v2i64)
src2, (v2i64)
src0);
914 res_r = __msa_dotp_u_h(
src0, coeff_vec);
916 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
917 res_r = __msa_sat_u_h(res_r, 7);
919 dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
920 dst0 = __msa_aver_u_b(dst0, dst_data);
940 uint32_t load0, load1;
942 v16u8 dst_data = { 0 };
945 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
946 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
947 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
959 res_r = __msa_dotp_u_h((v16u8)
src0, coeff_vec);
961 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
962 res_r = __msa_sat_u_h(res_r, 7);
963 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
964 dst_data = __msa_aver_u_b((v16u8) res, dst_data);
973 uint32_t tp0, tp1, tp2, tp3;
975 v16u8
out, dst_data = { 0 };
977 v8u16 res0_r, res1_r;
978 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
979 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
980 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
993 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
994 out = __msa_aver_u_b(
out, dst_data);
1002 uint32_t tp0, tp1, tp2, tp3;
1003 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, out0, out1;
1004 v16u8 dst0 = { 0 }, dst1 = { 0 };
1006 v8u16 res0, res1, res2, res3;
1007 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1008 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1009 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1021 DOTP_UB2_UH(src4, src6, coeff_vec, coeff_vec, res2, res3);
1022 SLLI_4V(res0, res1, res2, res3, 3);
1027 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
1036 }
else if (4 ==
height) {
1038 }
else if (8 ==
height) {
1047 uint64_t tp0, tp1, tp2, tp3;
1049 v16u8 dst0 = { 0 }, dst1 = { 0 };
1050 v8u16 res0, res1, res2, res3;
1052 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1053 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1054 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1064 coeff_vec, res0, res1, res2, res3);
1065 SLLI_4V(res0, res1, res2, res3, 3);
1077 uint64_t tp0, tp1, tp2, tp3;
1079 v16u8 out0, out1, out2, out3;
1080 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1081 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1083 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1084 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1085 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1101 coeff_vec, res0, res1, res2, res3);
1102 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1103 coeff_vec, res4, res5, res6, res7);
1104 SLLI_4V(res0, res1, res2, res3, 3);
1105 SLLI_4V(res4, res5, res6, res7, 3);
1114 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
1123 }
else if (8 ==
height) {
1132 uint16_t out0, out1;
1134 v16u8 dst_data = { 0 };
1137 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1138 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1139 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1145 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, out0);
1146 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, out1);
1150 tmp0 = (v16i8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1151 res_r = __msa_dotp_u_h((v16u8) tmp0, coeff_vec);
1153 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1154 res_r = __msa_sat_u_h(res_r, 7);
1155 res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1156 out = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1157 out0 = __msa_copy_u_h(
out, 0);
1158 out1 = __msa_copy_u_h(
out, 2);
1169 uint16_t tp0, tp1, tp2, tp3;
1171 v16u8 tmp0, tmp1, tmp2, tmp3;
1174 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1175 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1176 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1177 v16u8 dst_data = { 0 };
1185 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 0, tp0);
1186 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 1, tp1);
1187 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 2, tp2);
1188 dst_data = (v16u8) __msa_insert_h((v8i16) dst_data, 3, tp3);
1191 tmp0, tmp1, tmp2, tmp3);
1192 ILVR_W2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1194 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp2, (v2i64) tmp0);
1196 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1198 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1199 res_r = __msa_sat_u_h(res_r, 7);
1201 res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1202 res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
1213 }
else if (4 ==
height) {
1222 uint32_t load0, load1;
1224 v16u8 dst_data = { 0 };
1227 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1228 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1229 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1238 tmp0 = (v16u8) __msa_ilvr_d((v2i64) tmp1, (v2i64) tmp0);
1240 res_r = __msa_dotp_u_h(tmp0, coeff_vec);
1242 res_r = (v8u16) __msa_srari_h((v8i16) res_r, 6);
1243 res_r = __msa_sat_u_h(res_r, 7);
1244 res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
1245 res = __msa_aver_u_b(res, dst_data);
1254 uint32_t tp0, tp1, tp2, tp3;
1256 v16u8 tmp0, tmp1, tmp2, tmp3;
1258 v8u16 res0_r, res1_r;
1260 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1261 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1262 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1267 ILVR_B4_UB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3, tmp0, tmp1, tmp2,
1269 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1270 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0_r, res1_r);
1275 out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
1276 out = __msa_aver_u_b(
out, dst0);
1284 uint32_t tp0, tp1, tp2, tp3;
1285 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1286 v16u8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, out0, out1;
1287 v16u8 dst0 = { 0 }, dst1 = { 0 };
1288 v8u16 res0, res1, res2, res3;
1289 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1290 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1291 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1300 ILVR_B4_UB(
src1,
src0,
src2,
src1, src3,
src2, src4, src3, tmp0, tmp1, tmp2,
1302 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7, tmp4, tmp5, tmp6,
1304 ILVR_D2_UB(tmp1, tmp0, tmp3, tmp2, tmp0, tmp2);
1305 ILVR_D2_UB(tmp5, tmp4, tmp7, tmp6, tmp4, tmp6);
1306 DOTP_UB2_UH(tmp0, tmp2, coeff_vec, coeff_vec, res0, res1);
1307 DOTP_UB2_UH(tmp4, tmp6, coeff_vec, coeff_vec, res2, res3);
1308 SLLI_4V(res0, res1, res2, res3, 3);
1313 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
1322 }
else if (4 ==
height) {
1324 }
else if (8 ==
height) {
1333 uint64_t tp0, tp1, tp2, tp3;
1336 v8u16 res0, res1, res2, res3;
1337 v16u8 dst0 = { 0 }, dst1 = { 0 };
1338 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1339 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1340 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1349 coeff_vec, res0, res1, res2, res3);
1350 SLLI_4V(res0, res1, res2, res3, 3);
1362 uint64_t tp0, tp1, tp2, tp3;
1363 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1364 v16u8 out0, out1, out2, out3;
1365 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1366 v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1367 v16i8 coeff_vec0 = __msa_fill_b(coeff0);
1368 v16i8 coeff_vec1 = __msa_fill_b(coeff1);
1369 v16u8 coeff_vec = (v16u8) __msa_ilvr_b(coeff_vec0, coeff_vec1);
1382 ILVR_B4_UB(src5, src4, src6, src5, src7, src6, src8, src7,
1383 src4, src5, src6, src7);
1385 coeff_vec, res0, res1, res2, res3);
1386 DOTP_UB4_UH(src4, src5, src6, src7, coeff_vec, coeff_vec, coeff_vec,
1387 coeff_vec, res4, res5, res6, res7);
1388 SLLI_4V(res0, res1, res2, res3, 3);
1389 SLLI_4V(res4, res5, res6, res7, 3);
1398 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
1407 }
else if (8 ==
height) {
1419 uint16_t out0, out1;
1422 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1424 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1425 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1426 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1427 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1428 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1435 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, out0);
1436 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, out1);
1439 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1442 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1443 res_vt0 = __msa_sat_u_h(res_vt0, 7);
1444 res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1445 dst0 = __msa_aver_u_b((v16u8) res, dst0);
1446 out0 = __msa_copy_u_h((v8i16) dst0, 0);
1447 out1 = __msa_copy_u_h((v8i16) dst0, 1);
1461 uint16_t tp0, tp1, tp2, tp3;
1463 v16u8 tmp0, tmp1, tmp2, tmp3;
1465 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1467 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1468 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1469 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1470 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1471 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1480 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 0, tp0);
1481 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 1, tp1);
1482 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 2, tp2);
1483 dst0 = (v16u8) __msa_insert_h((v8i16) dst0, 3, tp3);
1488 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1491 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1492 res_vt0 = __msa_sat_u_h(res_vt0, 7);
1493 res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1494 dst0 = __msa_aver_u_b((v16u8) res, dst0);
1509 coef_hor1, coef_ver0, coef_ver1);
1510 }
else if (4 ==
height) {
1512 coef_hor1, coef_ver0, coef_ver1);
1525 v16u8 dst0, dst_data = { 0 };
1526 v8u16 res_hz0, res_hz1, res_vt0, res_vt1;
1528 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1529 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1530 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1531 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1532 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1541 MUL2(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_vt0, res_vt1);
1544 res_vt0 = (v8u16) __msa_srari_h((v8i16) res_vt0, 6);
1545 res_vt0 = __msa_sat_u_h(res_vt0, 7);
1546 dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
1547 dst0 = __msa_aver_u_b(dst0, dst_data);
1559 uint32_t tp0, tp1, tp2, tp3;
1561 v16u8
out, dst_data = { 0 };
1562 v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
1563 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1565 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1566 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1567 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1568 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1569 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1579 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
1581 MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1582 res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1583 ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1586 out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
1587 out = __msa_aver_u_b(
out, dst_data);
1598 uint32_t tp0, tp1, tp2, tp3;
1599 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8, res0, res1;
1600 v16u8 dst0 = { 0 }, dst1 = { 0 };
1601 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5, res_hz6, res_hz7;
1602 v8u16 res_vt0, res_vt1, res_vt2, res_vt3, res_vt4, res_vt5, res_vt6, res_vt7;
1604 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1605 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1606 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1607 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1608 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1624 coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2, res_hz3);
1625 DOTP_UB4_UH(src4, src5, src6, src7, coeff_hz_vec, coeff_hz_vec,
1626 coeff_hz_vec, coeff_hz_vec, res_hz4, res_hz5, res_hz6, res_hz7);
1627 MUL4(res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec1,
1628 res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1629 MUL4(res_hz4, coeff_vt_vec1, res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec1,
1630 res_hz7, coeff_vt_vec0, res_vt4, res_vt5, res_vt6, res_vt7);
1631 ADD2(res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1);
1632 ADD2(res_vt4, res_vt5, res_vt6, res_vt7, res_vt2, res_vt3);
1633 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1634 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1635 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
1637 ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
1650 coef_hor1, coef_ver0, coef_ver1);
1651 }
else if (4 ==
height) {
1653 coef_hor1, coef_ver0, coef_ver1);
1654 }
else if (8 ==
height) {
1656 coef_hor1, coef_ver0, coef_ver1);
1667 uint64_t tp0, tp1, tp2, tp3;
1669 v8u16 res_hz0, res_hz1, res_hz2;
1670 v8u16 res_hz3, res_hz4;
1671 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1672 v16u8 dst0 = { 0 }, dst1 = { 0 };
1674 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1675 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1676 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1677 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1678 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1685 res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
1694 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3, res_hz4);
1695 MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3, coeff_vt_vec0,
1696 res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2, res_vt3);
1697 res_vt0 += (res_hz0 * coeff_vt_vec1);
1698 res_vt1 += (res_hz1 * coeff_vt_vec1);
1699 res_vt2 += (res_hz2 * coeff_vt_vec1);
1700 res_vt3 += (res_hz3 * coeff_vt_vec1);
1701 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1702 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1703 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1715 uint64_t tp0, tp1, tp2, tp3;
1716 v16u8
src0,
src1,
src2, src3, src4, src5, src6, src7, src8;
1717 v16u8 out0, out1, out2, out3;
1718 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1719 v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4;
1720 v8u16 res_hz5, res_hz6, res_hz7, res_hz8;
1721 v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
1722 v8u16 res_vt4, res_vt5, res_vt6, res_vt7;
1724 v16i8 coeff_hz_vec0 = __msa_fill_b(coef_hor0);
1725 v16i8 coeff_hz_vec1 = __msa_fill_b(coef_hor1);
1726 v16u8 coeff_hz_vec = (v16u8) __msa_ilvr_b(coeff_hz_vec0, coeff_hz_vec1);
1727 v8u16 coeff_vt_vec0 = (v8u16) __msa_fill_h(coef_ver0);
1728 v8u16 coeff_vt_vec1 = (v8u16) __msa_fill_h(coef_ver1);
1740 res_hz0 = __msa_dotp_u_h(
src0, coeff_hz_vec);
1742 coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
1744 DOTP_UB4_UH(src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
1745 coeff_hz_vec, coeff_hz_vec, res_hz5, res_hz6, res_hz7, res_hz8);
1746 MUL4(res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
1747 coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
1749 MUL4(res_hz5, coeff_vt_vec0, res_hz6, coeff_vt_vec0, res_hz7,
1750 coeff_vt_vec0, res_hz8, coeff_vt_vec0, res_vt4, res_vt5, res_vt6,
1758 res_vt0 += (res_hz0 * coeff_vt_vec1);
1759 res_vt1 += (res_hz1 * coeff_vt_vec1);
1760 res_vt2 += (res_hz2 * coeff_vt_vec1);
1761 res_vt3 += (res_hz3 * coeff_vt_vec1);
1762 res_vt4 += (res_hz4 * coeff_vt_vec1);
1763 res_vt5 += (res_hz5 * coeff_vt_vec1);
1764 res_vt6 += (res_hz6 * coeff_vt_vec1);
1765 res_vt7 += (res_hz7 * coeff_vt_vec1);
1766 SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
1767 SRARI_H4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 6);
1768 SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
1769 SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
1770 PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
1771 PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
1774 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
1787 coef_hor1, coef_ver0, coef_ver1);
1788 }
else if (8 ==
height) {
1790 coef_hor1, coef_ver0, coef_ver1);
1797 uint32_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1806 }
else if (4 ==
height) {
1809 }
else if (2 ==
height) {
1820 uint64_t
src0,
src1,
src2, src3, src4, src5, src6, src7;
1829 }
else if (4 ==
height) {
1838 uint32_t tp0, tp1, tp2, tp3;
1839 v16u8
src0 = { 0 },
src1 = { 0 }, dst0 = { 0 }, dst1 = { 0 };
1852 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3,
dst,
stride);
1853 }
else if (4 ==
height) {
1858 dst0 = __msa_aver_u_b(
src0, dst0);
1860 }
else if (2 ==
height) {
1865 dst0 = __msa_aver_u_b(
src0, dst0);
1873 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1874 v16u8
src0 = { 0 },
src1 = { 0 },
src2 = { 0 }, src3 = { 0 };
1875 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
1891 AVER_UB4_UB(
src0, dst0,
src1, dst1,
src2, dst2, src3, dst3, dst0, dst1,
1893 ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1,
dst,
stride);
1894 }
else if (4 ==
height) {
1909 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1925 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1943 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1952 for (cnt =
height; cnt--;) {
1953 *((uint16_t *)
dst) = *((uint16_t *)
src);
1964 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
1982 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2001 av_assert2(x < 8 && y < 8 && x >= 0 && y >= 0);
2011 for (cnt =
height; cnt--;) {