26 #ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
27 #define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
40 #ifndef LOONGSON_INTRINSICS_H
41 #define LOONGSON_INTRINSICS_H
48 #define LSOM_VERSION_MAJOR 1
49 #define LSOM_VERSION_MINOR 0
50 #define LSOM_VERSION_MICRO 3
/* DUPx_ARGy helpers: apply instruction _INS (taking y arguments) to x
 * independent argument tuples, producing x outputs.  Wrapped in
 * do { } while (0) so each expansion is a single statement that is safe
 * inside an unbraced if/else. */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
do {                                              \
    _OUT0 = _INS(_IN0);                           \
    _OUT1 = _INS(_IN1);                           \
} while (0)

#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
do {                                                          \
    _OUT0 = _INS(_IN0, _IN1);                                 \
    _OUT1 = _INS(_IN2, _IN3);                                 \
} while (0)

#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
do {                                                                      \
    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
} while (0)

#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
do {                                                                        \
    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
} while (0)

#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
                  _OUT0, _OUT1, _OUT2, _OUT3)                           \
do {                                                                    \
    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);              \
    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);              \
} while (0)

#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7,   \
                  _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)   \
do {                                                                      \
    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);    \
    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);  \
} while (0)
91 #include <lsxintrin.h>
109 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h, __m128i in_l)
113 out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
114 out = __lsx_vmaddwod_h_b(
out, in_h, in_l);
135 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h, __m128i in_l)
139 out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
140 out = __lsx_vmaddwod_h_bu(
out, in_h, in_l);
161 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h, __m128i in_l)
165 out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
166 out = __lsx_vmaddwod_w_h(
out, in_h, in_l);
185 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l)
189 out = __lsx_vmulwev_h_b(in_h, in_l);
190 out = __lsx_vmaddwod_h_b(
out, in_h, in_l);
209 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l)
213 out = __lsx_vmulwev_h_bu(in_h, in_l);
214 out = __lsx_vmaddwod_h_bu(
out, in_h, in_l);
233 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l)
237 out = __lsx_vmulwev_h_bu_b(in_h, in_l);
238 out = __lsx_vmaddwod_h_bu_b(
out, in_h, in_l);
257 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l)
261 out = __lsx_vmulwev_w_h(in_h, in_l);
262 out = __lsx_vmaddwod_w_h(
out, in_h, in_l);
282 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i
min, __m128i
max)
286 out = __lsx_vmax_h(
min, _in);
303 static inline __m128i __lsx_vclip255_h(__m128i _in)
307 out = __lsx_vmaxi_h(_in, 0);
308 out = __lsx_vsat_hu(
out, 7);
324 static inline __m128i __lsx_vclip255_w(__m128i _in)
328 out = __lsx_vmaxi_w(_in, 0);
329 out = __lsx_vsat_wu(
out, 7);
/* Swap the contents of vector variables _in0 and _in1 via the XOR trick
 * (no temporary).  NOTE: must not be invoked with the same variable for
 * both arguments, which would zero it. */
#define LSX_SWAP(_in0, _in1)              \
do {                                      \
    _in0 = __lsx_vxor_v(_in0, _in1);      \
    _in1 = __lsx_vxor_v(_in0, _in1);      \
    _in0 = __lsx_vxor_v(_in0, _in1);      \
} while (0)
/* Transpose a 4x4 block of 32-bit words held in _in0.._in3 into _out0.._out3. */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                           \
    __m128i _t0, _t1, _t2, _t3;                                                \
                                                                               \
    _t0   = __lsx_vilvl_w(_in1, _in0);                                         \
    _t1   = __lsx_vilvh_w(_in1, _in0);                                         \
    _t2   = __lsx_vilvl_w(_in3, _in2);                                         \
    _t3   = __lsx_vilvh_w(_in3, _in2);                                         \
    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
} while (0)
/* Transpose an 8x8 block of bytes (rows in the low halves of _in0.._in7).
 * Odd output rows are the upper 64 bits of the even rows, extracted with a
 * byte shuffle against the shuf8 index vector. */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
do {                                                                         \
    __m128i zero  = {0};                                                     \
    __m128i shuf8 = {0x0F0E0D0C0B0A0908, 0x1716151413121110};                \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
                                                                             \
    _t0   = __lsx_vilvl_b(_in2, _in0);                                       \
    _t1   = __lsx_vilvl_b(_in3, _in1);                                       \
    _t2   = __lsx_vilvl_b(_in6, _in4);                                       \
    _t3   = __lsx_vilvl_b(_in7, _in5);                                       \
    _t4   = __lsx_vilvl_b(_t1, _t0);                                         \
    _t5   = __lsx_vilvh_b(_t1, _t0);                                         \
    _t6   = __lsx_vilvl_b(_t3, _t2);                                         \
    _t7   = __lsx_vilvh_b(_t3, _t2);                                         \
    _out0 = __lsx_vilvl_w(_t6, _t4);                                         \
    _out2 = __lsx_vilvh_w(_t6, _t4);                                         \
    _out4 = __lsx_vilvl_w(_t7, _t5);                                         \
    _out6 = __lsx_vilvh_w(_t7, _t5);                                         \
    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                               \
    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                               \
    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                               \
    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                               \
} while (0)
/* Transpose an 8x8 block of 16-bit halfwords held in _in0.._in7. */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
do {                                                                         \
    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                \
                                                                             \
    _s0 = __lsx_vilvl_h(_in6, _in4);                                         \
    _s1 = __lsx_vilvl_h(_in7, _in5);                                         \
    _t0 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t1 = __lsx_vilvh_h(_s1, _s0);                                           \
    _s0 = __lsx_vilvh_h(_in6, _in4);                                         \
    _s1 = __lsx_vilvh_h(_in7, _in5);                                         \
    _t2 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t3 = __lsx_vilvh_h(_s1, _s0);                                           \
    _s0 = __lsx_vilvl_h(_in2, _in0);                                         \
    _s1 = __lsx_vilvl_h(_in3, _in1);                                         \
    _t4 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t5 = __lsx_vilvh_h(_s1, _s0);                                           \
    _s0 = __lsx_vilvh_h(_in2, _in0);                                         \
    _s1 = __lsx_vilvh_h(_in3, _in1);                                         \
    _t6 = __lsx_vilvl_h(_s1, _s0);                                           \
    _t7 = __lsx_vilvh_h(_s1, _s0);                                           \
                                                                             \
    _out0 = __lsx_vpickev_d(_t0, _t4);                                       \
    _out2 = __lsx_vpickev_d(_t1, _t5);                                       \
    _out4 = __lsx_vpickev_d(_t2, _t6);                                       \
    _out6 = __lsx_vpickev_d(_t3, _t7);                                       \
    _out1 = __lsx_vpickod_d(_t0, _t4);                                       \
    _out3 = __lsx_vpickod_d(_t1, _t5);                                       \
    _out5 = __lsx_vpickod_d(_t2, _t6);                                       \
    _out7 = __lsx_vpickod_d(_t3, _t7);                                       \
} while (0)
/* Transpose an 8x4 byte block: 8 input rows (bytes in the low word of each
 * _inN) become 4 output rows. */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3)                       \
do {                                                                         \
    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
                                                                             \
    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                   \
    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                   \
    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                               \
    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                   \
    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                   \
                                                                             \
    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                               \
    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                               \
    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                               \
                                                                             \
    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                                 \
    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                                 \
    _out1 = __lsx_vilvh_d(_out2, _out0);                                     \
    _out3 = __lsx_vilvh_d(_out0, _out2);                                     \
} while (0)
/* Transpose a 16x8 byte block: 16 input rows (low 8 bytes each) into 8
 * full 16-byte output rows. */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
do {                                                                         \
    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
} while (0)
/* 4-point butterflies (byte/halfword/word/doubleword element widths):
 * outputs are the pairwise sums then differences of mirrored inputs. */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
} while (0)

#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
} while (0)

#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
} while (0)

#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
do {                                                                          \
    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
} while (0)
/* 8-point butterflies (byte/halfword/word/doubleword element widths). */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
do {                                                                         \
    _out0 = __lsx_vadd_b(_in0, _in7);                                        \
    _out1 = __lsx_vadd_b(_in1, _in6);                                        \
    _out2 = __lsx_vadd_b(_in2, _in5);                                        \
    _out3 = __lsx_vadd_b(_in3, _in4);                                        \
    _out4 = __lsx_vsub_b(_in3, _in4);                                        \
    _out5 = __lsx_vsub_b(_in2, _in5);                                        \
    _out6 = __lsx_vsub_b(_in1, _in6);                                        \
    _out7 = __lsx_vsub_b(_in0, _in7);                                        \
} while (0)

#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
do {                                                                         \
    _out0 = __lsx_vadd_h(_in0, _in7);                                        \
    _out1 = __lsx_vadd_h(_in1, _in6);                                        \
    _out2 = __lsx_vadd_h(_in2, _in5);                                        \
    _out3 = __lsx_vadd_h(_in3, _in4);                                        \
    _out4 = __lsx_vsub_h(_in3, _in4);                                        \
    _out5 = __lsx_vsub_h(_in2, _in5);                                        \
    _out6 = __lsx_vsub_h(_in1, _in6);                                        \
    _out7 = __lsx_vsub_h(_in0, _in7);                                        \
} while (0)

#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
do {                                                                         \
    _out0 = __lsx_vadd_w(_in0, _in7);                                        \
    _out1 = __lsx_vadd_w(_in1, _in6);                                        \
    _out2 = __lsx_vadd_w(_in2, _in5);                                        \
    _out3 = __lsx_vadd_w(_in3, _in4);                                        \
    _out4 = __lsx_vsub_w(_in3, _in4);                                        \
    _out5 = __lsx_vsub_w(_in2, _in5);                                        \
    _out6 = __lsx_vsub_w(_in1, _in6);                                        \
    _out7 = __lsx_vsub_w(_in0, _in7);                                        \
} while (0)

#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,    \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,   \
                          _out7)                                             \
do {                                                                         \
    _out0 = __lsx_vadd_d(_in0, _in7);                                        \
    _out1 = __lsx_vadd_d(_in1, _in6);                                        \
    _out2 = __lsx_vadd_d(_in2, _in5);                                        \
    _out3 = __lsx_vadd_d(_in3, _in4);                                        \
    _out4 = __lsx_vsub_d(_in3, _in4);                                        \
    _out5 = __lsx_vsub_d(_in2, _in5);                                        \
    _out6 = __lsx_vsub_d(_in1, _in6);                                        \
    _out7 = __lsx_vsub_d(_in0, _in7);                                        \
} while (0)
687 #ifdef __loongarch_asx
688 #include <lasxintrin.h>
703 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l)
707 out = __lasx_xvmulwev_h_bu(in_h, in_l);
708 out = __lasx_xvmaddwod_h_bu(
out, in_h, in_l);
726 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l)
730 out = __lasx_xvmulwev_h_b(in_h, in_l);
731 out = __lasx_xvmaddwod_h_b(
out, in_h, in_l);
752 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l)
756 out = __lasx_xvmulwev_w_h(in_h, in_l);
757 out = __lasx_xvmaddwod_w_h(
out, in_h, in_l);
775 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l)
779 out = __lasx_xvmulwev_d_w(in_h, in_l);
780 out = __lasx_xvmaddwod_d_w(
out, in_h, in_l);
798 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l)
802 out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
803 out = __lasx_xvmaddwod_w_hu_h(
out, in_h, in_l);
821 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c,__m256i in_h, __m256i in_l)
825 out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
826 out = __lasx_xvmaddwod_h_b(
out, in_h, in_l);
848 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
852 out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
853 out = __lasx_xvmaddwod_w_h(
out, in_h, in_l);
871 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h, __m256i in_l)
875 out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
876 out = __lasx_xvmaddwod_w_hu(
out, in_h, in_l);
894 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h, __m256i in_l)
898 out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
899 out = __lasx_xvmaddwod_w_hu_h(
out, in_h, in_l);
918 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h, __m256i in_l)
922 out = __lasx_xvmulwev_h_bu(in_h, in_l);
923 out = __lasx_xvmaddwod_h_bu(
out, in_h, in_l);
924 out = __lasx_xvsub_h(in_c,
out);
947 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
951 out = __lasx_xvmulwev_w_h(in_h, in_l);
952 out = __lasx_xvmaddwod_w_h(
out, in_h, in_l);
953 out = __lasx_xvsub_w(in_c,
out);
974 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l)
978 out = __lasx_xvmulwev_w_h(in_h, in_l);
979 out = __lasx_xvmaddwod_w_h(
out, in_h, in_l);
996 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l)
1000 out = __lasx_xvilvh_b(in_h, in_l);
1020 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l)
1024 out = __lasx_xvilvh_h(in_h, in_l);
1041 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l)
1045 out = __lasx_xvilvl_b(in_h, in_l);
1065 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l)
1069 out = __lasx_xvilvl_h(in_h, in_l);
1086 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l)
1090 out = __lasx_xvilvl_b(in_h, in_l);
1106 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l)
1110 out = __lasx_xvsllwil_hu_bu(in_l, 0);
1111 out = __lasx_xvadd_h(in_h,
out);
1129 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l)
1133 out = __lasx_xvsllwil_w_h(in_l, 0);
1134 out = __lasx_xvadd_w(in_h,
out);
1156 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1158 __m256i tmp0, tmp1,
out;
1160 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1161 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1162 tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1163 out = __lasx_xvadd_w(tmp0, in_c);
1180 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h, __m256i in_l)
1182 __m256i tmp0, tmp1,
out;
1184 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1185 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1186 tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1187 out = __lasx_xvadd_w(tmp0, in_c);
1206 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l)
1208 __m256i tmp0, tmp1,
out;
1210 tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1211 tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1212 out = __lasx_xvmul_w(tmp0, tmp1);
1231 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l)
1233 __m256i tmp0, tmp1,
out;
1235 tmp0 = __lasx_xvilvh_h(in_h, in_h);
1236 tmp1 = __lasx_xvilvh_h(in_l, in_l);
1237 out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1256 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l)
1261 tmp1 = __lasx_xvilvl_b(
zero, in_l);
1262 out = __lasx_xvsadd_hu(in_h, tmp1);
1282 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i
min, __m256i
max)
1286 out = __lasx_xvmax_h(
min, in);
1301 static inline __m256i __lasx_xvclip255_h(__m256i in)
1305 out = __lasx_xvmaxi_h(in, 0);
1306 out = __lasx_xvsat_hu(
out, 7);
1322 static inline __m256i __lasx_xvclip255_w(__m256i in)
1326 out = __lasx_xvmaxi_w(in, 0);
1327 out = __lasx_xvsat_wu(
out, 7);
1347 static inline __m256i __lasx_xvsplati_l_h(__m256i in,
int idx)
1351 out = __lasx_xvpermi_q(in, in, 0x02);
1352 out = __lasx_xvreplve_h(
out, idx);
1372 static inline __m256i __lasx_xvsplati_h_h(__m256i in,
int idx)
1376 out = __lasx_xvpermi_q(in, in, 0x13);
1377 out = __lasx_xvreplve_h(
out, idx);
/* Transpose a 4x4 block of 64-bit doublewords held in _in0.._in3. */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2,     \
                            _out3)                                           \
do {                                                                         \
    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                      \
    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                     \
    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                     \
    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                     \
    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                     \
    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                            \
    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                            \
    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                            \
    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                            \
} while (0)
/* Transpose an 8x8 block of 32-bit words held in _in0.._in7. */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
do {                                                                         \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    _s0_m   = __lasx_xvilvl_w(_in2, _in0);                                   \
    _s1_m   = __lasx_xvilvl_w(_in3, _in1);                                   \
    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m   = __lasx_xvilvh_w(_in2, _in0);                                   \
    _s1_m   = __lasx_xvilvh_w(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m   = __lasx_xvilvl_w(_in6, _in4);                                   \
    _s1_m   = __lasx_xvilvl_w(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _s0_m   = __lasx_xvilvh_w(_in6, _in4);                                   \
    _s1_m   = __lasx_xvilvh_w(_in7, _in5);                                   \
    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
    _out0   = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                      \
    _out1   = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                      \
    _out2   = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                      \
    _out3   = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                      \
    _out4   = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                      \
    _out5   = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                      \
    _out6   = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                      \
    _out7   = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                      \
} while (0)
/* Transpose a 16x8 byte block: 16 input rows (low 16 bytes each) into 8
 * output rows. */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,  \
                             _in15, _out0, _out1, _out2, _out3, _out4,       \
                             _out5, _out6, _out7)                            \
do {                                                                         \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                 \
    _out0   = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
    _out1   = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
    _out2   = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
    _out3   = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
    _out4   = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                             \
    _out5   = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                             \
    _out6   = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                             \
    _out7   = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                             \
    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                 \
    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                 \
    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                 \
    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                 \
    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                 \
    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                 \
    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                 \
    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                 \
    _out0   = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                             \
    _out1   = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                             \
    _out2   = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                             \
    _out3   = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                             \
    _out4   = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                             \
    _out5   = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                             \
    _out6   = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                             \
    _out7   = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                             \
} while (0)
/* Transpose a 16x8 block of 16-bit halfwords: 16 input rows into 8 output
 * rows, processing the low and high interleaves in two passes. */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,  \
                             _in15, _out0, _out1, _out2, _out3, _out4,       \
                             _out5, _out6, _out7)                            \
do {                                                                         \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
                                                                             \
    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                 \
    _t0     = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                             \
    _t1     = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                             \
    _t2     = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                             \
    _t3     = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                             \
    _t4     = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                             \
    _t5     = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                             \
    _t6     = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                             \
    _t7     = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                             \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                     \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                     \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                     \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                     \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                     \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                     \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                     \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                     \
    _out0   = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                      \
    _out1   = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                      \
    _out2   = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                      \
    _out3   = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                      \
                                                                             \
    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                  \
    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                  \
    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                 \
    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                 \
    _t0     = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                             \
    _t1     = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                             \
    _t2     = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                             \
    _t3     = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                             \
    _t4     = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                             \
    _t5     = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                             \
    _t6     = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                             \
    _t7     = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                             \
    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                     \
    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                     \
    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                     \
    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                     \
    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                     \
    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                     \
    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                     \
    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                     \
    _out4   = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                      \
    _out5   = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                      \
    _out6   = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                      \
    _out7   = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                      \
} while (0)
/* Transpose a 4x4 block of 16-bit halfwords (rows in the low 64 bits of
 * _in0.._in3); odd output rows come from the high half of the even rows. */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2,     \
                            _out3)                                           \
do {                                                                         \
    __m256i _s0_m, _s1_m;                                                    \
                                                                             \
    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                     \
    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                     \
    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                                   \
    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                                   \
    _out1 = __lasx_xvilvh_d(_out0, _out0);                                   \
    _out3 = __lasx_xvilvh_d(_out2, _out2);                                   \
} while (0)
/* Transpose an 8x8 byte block (rows in the low 8 bytes of _in0.._in7);
 * odd output rows are extracted by shifting the even rows right 8 bytes. */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
do {                                                                         \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
    _out0   = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                             \
    _out2   = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                             \
    _out4   = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                             \
    _out6   = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                             \
    _out1   = __lasx_xvbsrl_v(_out0, 8);                                     \
    _out3   = __lasx_xvbsrl_v(_out2, 8);                                     \
    _out5   = __lasx_xvbsrl_v(_out4, 8);                                     \
    _out7   = __lasx_xvbsrl_v(_out6, 8);                                     \
} while (0)
/* Transpose an 8x8 block of 16-bit halfwords held in _in0.._in7. */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
do {                                                                         \
    __m256i _s0_m, _s1_m;                                                    \
    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
                                                                             \
    _s0_m   = __lasx_xvilvl_h(_in6, _in4);                                   \
    _s1_m   = __lasx_xvilvl_h(_in7, _in5);                                   \
    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
    _s0_m   = __lasx_xvilvh_h(_in6, _in4);                                   \
    _s1_m   = __lasx_xvilvh_h(_in7, _in5);                                   \
    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    _s0_m   = __lasx_xvilvl_h(_in2, _in0);                                   \
    _s1_m   = __lasx_xvilvl_h(_in3, _in1);                                   \
    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
    _s0_m   = __lasx_xvilvh_h(_in2, _in0);                                   \
    _s1_m   = __lasx_xvilvh_h(_in3, _in1);                                   \
    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
                                                                             \
    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
} while (0)
/* 4-point butterflies (byte/halfword/word/doubleword element widths). */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2,      \
                           _out3)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_b(_in0, _in3);                                      \
    _out1 = __lasx_xvadd_b(_in1, _in2);                                      \
    _out2 = __lasx_xvsub_b(_in1, _in2);                                      \
    _out3 = __lasx_xvsub_b(_in0, _in3);                                      \
} while (0)

#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2,      \
                           _out3)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_h(_in0, _in3);                                      \
    _out1 = __lasx_xvadd_h(_in1, _in2);                                      \
    _out2 = __lasx_xvsub_h(_in1, _in2);                                      \
    _out3 = __lasx_xvsub_h(_in0, _in3);                                      \
} while (0)

#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2,      \
                           _out3)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_w(_in0, _in3);                                      \
    _out1 = __lasx_xvadd_w(_in1, _in2);                                      \
    _out2 = __lasx_xvsub_w(_in1, _in2);                                      \
    _out3 = __lasx_xvsub_w(_in0, _in3);                                      \
} while (0)

#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2,      \
                           _out3)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_d(_in0, _in3);                                      \
    _out1 = __lasx_xvadd_d(_in1, _in2);                                      \
    _out2 = __lasx_xvsub_d(_in1, _in2);                                      \
    _out3 = __lasx_xvsub_d(_in0, _in3);                                      \
} while (0)
/* 8-point butterflies (byte/halfword/word/doubleword element widths). */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_b(_in0, _in7);                                      \
    _out1 = __lasx_xvadd_b(_in1, _in6);                                      \
    _out2 = __lasx_xvadd_b(_in2, _in5);                                      \
    _out3 = __lasx_xvadd_b(_in3, _in4);                                      \
    _out4 = __lasx_xvsub_b(_in3, _in4);                                      \
    _out5 = __lasx_xvsub_b(_in2, _in5);                                      \
    _out6 = __lasx_xvsub_b(_in1, _in6);                                      \
    _out7 = __lasx_xvsub_b(_in0, _in7);                                      \
} while (0)

#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_h(_in0, _in7);                                      \
    _out1 = __lasx_xvadd_h(_in1, _in6);                                      \
    _out2 = __lasx_xvadd_h(_in2, _in5);                                      \
    _out3 = __lasx_xvadd_h(_in3, _in4);                                      \
    _out4 = __lasx_xvsub_h(_in3, _in4);                                      \
    _out5 = __lasx_xvsub_h(_in2, _in5);                                      \
    _out6 = __lasx_xvsub_h(_in1, _in6);                                      \
    _out7 = __lasx_xvsub_h(_in0, _in7);                                      \
} while (0)

#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_w(_in0, _in7);                                      \
    _out1 = __lasx_xvadd_w(_in1, _in6);                                      \
    _out2 = __lasx_xvadd_w(_in2, _in5);                                      \
    _out3 = __lasx_xvadd_w(_in3, _in4);                                      \
    _out4 = __lasx_xvsub_w(_in3, _in4);                                      \
    _out5 = __lasx_xvsub_w(_in2, _in5);                                      \
    _out6 = __lasx_xvsub_w(_in1, _in6);                                      \
    _out7 = __lasx_xvsub_w(_in0, _in7);                                      \
} while (0)

#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                           _out7)                                            \
do {                                                                         \
    _out0 = __lasx_xvadd_d(_in0, _in7);                                      \
    _out1 = __lasx_xvadd_d(_in1, _in6);                                      \
    _out2 = __lasx_xvadd_d(_in2, _in5);                                      \
    _out3 = __lasx_xvadd_d(_in3, _in4);                                      \
    _out4 = __lasx_xvsub_d(_in3, _in4);                                      \
    _out5 = __lasx_xvsub_d(_in2, _in5);                                      \
    _out6 = __lasx_xvsub_d(_in1, _in6);                                      \
    _out7 = __lasx_xvsub_d(_in0, _in7);                                      \
} while (0)
/* Debug helper: print the element_num elements of vector in0 reinterpreted
 * as RTYPE; when enter is non-zero, emit a "\nVP:" prefix first. */
#define VECT_PRINT(RTYPE, element_num, in0, enter)  \
do {                                                \
    RTYPE _tmp0 = (RTYPE)in0;                       \
    int _i = 0;                                     \
    if (enter)                                      \
        printf("\nVP:");                            \
    for (_i = 0; _i < element_num; _i++)            \
        printf("%d,", _tmp0[_i]);                   \
} while (0)