/* Generic load/store/shuffle macros for the MIPS MSA (SIMD) instruction set. */
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
/* Load one MSA vector of the given element type from memory.
 * psrc is assumed suitably aligned for a 16-byte vector access —
 * NOTE(review): alignment requirement inferred from the plain
 * dereference; confirm against callers. */
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)
/* Store one MSA vector of the given element type to memory
 * (plain dereference — same alignment caveat as LD_B). */
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
51 #if (__mips_isa_rev >= 6)
54 uint8_t *psrc_m = (uint8_t *) (psrc); \
58 "lw %[val_m], %[psrc_m] \n\t" \
60 : [val_m] "=r" (val_m) \
61 : [psrc_m] "m" (*psrc_m) \
70 uint8_t *psrc_m = (uint8_t *) (psrc); \
74 "ld %[val_m], %[psrc_m] \n\t" \
76 : [val_m] "=r" (val_m) \
77 : [psrc_m] "m" (*psrc_m) \
82 #else // !(__mips == 64)
85 uint8_t *psrc_m = (uint8_t *) (psrc); \
86 uint32_t val0_m, val1_m; \
89 val0_m = LW(psrc_m); \
90 val1_m = LW(psrc_m + 4); \
92 val_m = (uint64_t) (val1_m); \
93 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
94 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
98 #endif // (__mips == 64)
100 #define SH(val, pdst) \
102 uint8_t *pdst_m = (uint8_t *) (pdst); \
103 uint16_t val_m = (val); \
106 "sh %[val_m], %[pdst_m] \n\t" \
108 : [pdst_m] "=m" (*pdst_m) \
109 : [val_m] "r" (val_m) \
113 #define SW(val, pdst) \
115 uint8_t *pdst_m = (uint8_t *) (pdst); \
116 uint32_t val_m = (val); \
119 "sw %[val_m], %[pdst_m] \n\t" \
121 : [pdst_m] "=m" (*pdst_m) \
122 : [val_m] "r" (val_m) \
126 #define SD(val, pdst) \
128 uint8_t *pdst_m = (uint8_t *) (pdst); \
129 uint64_t val_m = (val); \
132 "sd %[val_m], %[pdst_m] \n\t" \
134 : [pdst_m] "=m" (*pdst_m) \
135 : [val_m] "r" (val_m) \
138 #else // !(__mips_isa_rev >= 6)
141 uint8_t *psrc_m = (uint8_t *) (psrc); \
145 "ulw %[val_m], %[psrc_m] \n\t" \
147 : [val_m] "=r" (val_m) \
148 : [psrc_m] "m" (*psrc_m) \
157 uint8_t *psrc_m = (uint8_t *) (psrc); \
158 uint64_t val_m = 0; \
161 "uld %[val_m], %[psrc_m] \n\t" \
163 : [val_m] "=r" (val_m) \
164 : [psrc_m] "m" (*psrc_m) \
169 #else // !(__mips == 64)
172 uint8_t *psrc_m1 = (uint8_t *) (psrc); \
173 uint32_t val0_m, val1_m; \
174 uint64_t val_m = 0; \
176 val0_m = LW(psrc_m1); \
177 val1_m = LW(psrc_m1 + 4); \
179 val_m = (uint64_t) (val1_m); \
180 val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000); \
181 val_m = (uint64_t) (val_m | (uint64_t) val0_m); \
185 #endif // (__mips == 64)
187 #define SH(val, pdst) \
189 uint8_t *pdst_m = (uint8_t *) (pdst); \
190 uint16_t val_m = (val); \
193 "ush %[val_m], %[pdst_m] \n\t" \
195 : [pdst_m] "=m" (*pdst_m) \
196 : [val_m] "r" (val_m) \
200 #define SW(val, pdst) \
202 uint8_t *pdst_m = (uint8_t *) (pdst); \
203 uint32_t val_m = (val); \
206 "usw %[val_m], %[pdst_m] \n\t" \
208 : [pdst_m] "=m" (*pdst_m) \
209 : [val_m] "r" (val_m) \
213 #define SD(val, pdst) \
215 uint8_t *pdst_m1 = (uint8_t *) (pdst); \
216 uint32_t val0_m, val1_m; \
218 val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
219 val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
221 SW(val0_m, pdst_m1); \
222 SW(val1_m, pdst_m1 + 4); \
224 #endif // (__mips_isa_rev >= 6)
/* Load 4 words from psrc with 'stride' bytes between consecutive loads. */
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Load 2 / 4 doublewords with a stride between consecutive loads. */
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}

/* Store 4 words to pdst with a stride between consecutive stores. */
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Store 4 doublewords to pdst with a stride between consecutive stores. */
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
/* Load 2..8 byte vectors from psrc with 'stride' bytes between vectors. */
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_B2(RTYPE, (psrc), stride, out0, out1);               \
    LD_B2(RTYPE, (psrc) + 2 * stride , stride, out2, out3); \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,                                 \
              out0, out1, out2, out3, out4, out5, out6)            \
{                                                                  \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);    \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);         \
}
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                   \
              out0, out1, out2, out3, out4, out5, out6, out7)        \
{                                                                    \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);            \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
/* Load 2..16 halfword vectors from psrc with a stride between vectors. */
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,                                    \
              out0, out1, out2, out3, out4, out5, out6, out7)         \
{                                                                     \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);             \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_H8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)
/* Load 2 signed-word vectors with a stride between them. */
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}
/* Store 2..8 byte vectors to pdst with 'stride' bytes between vectors. */
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
              pdst, stride)                                     \
{                                                               \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);             \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)
/* Store 2..8 halfword vectors to pdst with a stride between vectors. */
#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_H(RTYPE, in0, (pdst));                 \
    ST_H(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_H2(RTYPE, in0, in1, (pdst), stride);               \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)
/* Store 2 signed-word vectors with a stride between them. */
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}
/* Store a 2x4 byte block: copy four consecutive halfwords (starting at
 * element index stidx) out of vector 'in' and store one per row. */
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}
/* Store a 4x2 byte block: words 0 and 1 of 'in', one per row. */
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}
/* Store a 4x4 byte block: word elements idx0/idx1 of in0 and idx2/idx3
 * of in1, one word per row. */
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}

/* Store a 4x8 byte block as two stacked 4x4 blocks. */
#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
/* Store a 6x4 byte block: each row is one 4-byte word followed by one
 * 2-byte halfword pulled from the matching elements of in0/in1. */
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}
/* Store one 8-byte row (doubleword 0 of 'in'). */
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

/* Store an 8x2 byte block: doublewords 0 and 1 of 'in', one per row. */
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}
/* Store an 8x4 byte block: both doublewords of in0 then of in1. */
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}

/* Store an 8x8 byte block as two stacked 8x4 blocks. */
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}

/* Store a 12x4 byte block: left 8x4 from in0/in1, right 4x4 from in2. */
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
/* Store a 12x8 byte block: each of the 8 rows gets doubleword 0 of its
 * input vector (first 8 bytes) followed by word element 2 (last 4 bytes). */
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
/* Slide each input right by slide_val bytes, filling from a zero vector. */
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)              \
{                                                                      \
    v16i8 zero_m = { 0 };                                              \
                                                                       \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val); \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val); \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,           \
                  out0, out1, out2, out3, slide_val)   \
{                                                      \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val); \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val); \
}
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)

/* Slide: shift bytes of inX_1 into inX_0 by slide_val positions. */
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
/* Byte vector-shuffle: select bytes from the (in1:in0) pair per mask0,
 * and from (in3:in2) per mask1. */
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four shuffles of the same (in1:in0) pair with four masks. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)

/* Word vector-shuffle, same operand convention as VSHF_B2. */
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
/* Signed byte dot product: multiply byte pairs of mult/cnst and add
 * adjacent products into halfword results. */
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                  \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{                                                                    \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);         \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);         \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Signed halfword dot product producing word results. */
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,          \
                 cnst0, cnst1, cnst2, cnst3,                 \
                 out0, out1, out2, out3)                     \
{                                                            \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
/* Signed byte dot product accumulated into the existing out values. */
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                  \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                  \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                  \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{                                                                     \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);         \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);         \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Signed halfword dot product accumulated into word accumulators. */
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                  \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                  \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                  \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{                                                                     \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);         \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);         \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
/* Clip signed halfword elements of 'in' to [min, max]; the GNU statement
 * expression yields the clipped vector as the macro's value. */
#define CLIP_SH(in, min, max)                           \
( {                                                     \
    v8i16 out_m;                                        \
                                                        \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;                                              \
} )

/* Clip signed halfword elements to the pixel range [0, 255]. */
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

/* Clip signed word elements to [0, 255]. */
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
/* Horizontal subtract of unsigned bytes within each input vector
 * (odd element minus even element), halfword results. */
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
/* Insert four 32-bit scalars into word lanes 0..3 of 'out'. */
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)        \
{                                                        \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);   \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);   \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);   \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);   \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Insert two 64-bit scalars into doubleword lanes 0 and 1 of 'out'. */
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
/* Interleave the even-indexed halfword / word / doubleword elements
 * of each input pair. */
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)

#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
}
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
/* Interleave the left (upper) halves of each input pair. */
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
}
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)

#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
}
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
/* Interleave the right (lower) halves of each input pair. */
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)

#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)

/* Interleave the right doubleword of each input pair (i.e. concatenate
 * the low 64 bits of two vectors). */
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)        \
{                                                             \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
/* Interleave both halves of one input pair: out0 = right (lower) half,
 * out1 = left (upper) half. */
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)             \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)             \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)             \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
/* In-place elementwise max of signed halfwords against the 5-bit
 * immediate max_val. */
#define MAXI_SH2(RTYPE, in0, in1, max_val)                \
{                                                         \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
/* In-place saturation of each element to sat_val + 1 significant bits.
 * (Also normalizes the upstream quirk of missing statement semicolons
 * inside SAT_UH4/SAT_SH3.) */
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
/* Broadcast (splat) the halfword at each given index of 'in' into a
 * whole output vector. */
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

/* Broadcast word elements stidx and stidx+1 of 'in'. */
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

/* Broadcast all four word elements of 'in' into out0..out3. */
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
/* Pack even byte elements of each input pair into one output vector:
 * out0 = even bytes of (in0, in1), out1 = even bytes of (in2, in3)
 * (MSA pckev.b). */
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

/* Three-output variant of PCKEV_B2. */
#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{                                                                       \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);             \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

/* Four-output variant of PCKEV_B2. */
#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
/* Pack even halfword elements of each input pair into one output vector
 * (MSA pckev.h). */
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)     \
{                                                           \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

/* Four-output variant of PCKEV_H2. */
#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
/* Pack even doubleword elements of each input pair into one output vector
 * (MSA pckev.d), i.e. out = (low dword of in0 : low dword of in1). */
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)     \
{                                                           \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

/* Four-output variant of PCKEV_D2. */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
/* XORI_B*_128: xor each byte of the given vectors with 128, in place
 * (MSA xori.b). Flipping the top bit of every byte converts between
 * unsigned [0,255] and signed [-128,127] byte representations. */
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
/* Signed saturated addition of halfword pairs: out = adds_s.h(inA, inB). */
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

/* Four-output variant of ADDS_SH2.
 * NOTE(review): the _UH alias still performs the *signed* saturating add
 * (adds_s.h) and merely casts the result to v8u16 — confirm callers
 * expect signed saturation semantics. */
#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
/* Element-wise left shift of four vectors by 'shift', in place
 * (uses the GCC vector-extension << operator, so it is type-generic). */
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}
/* Element-wise right shift of four vectors by 'shift', in place
 * (arithmetic for signed element types, logical for unsigned — follows
 * the >> operator semantics of the vectors' element type). */
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
/* Logical (zero-filling) right shift of four halfword vectors, in place;
 * per-element shift amounts are taken from the 'shift' vector (MSA srl.h). */
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
/* Arithmetic right shift with rounding of halfword vectors, in place;
 * per-element shift amounts come from the 'shift' vector (MSA srar.h). */
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

/* Three-input variant (no semicolon needed after SRAR_H2: it expands to
 * a brace block). */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

/* Four-input variant. */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
/* Arithmetic right shift with rounding of word vectors, in place;
 * per-element shift amounts come from the 'shift' vector (MSA srar.w). */
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

/* Four-input variant. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
/* Arithmetic right shift with rounding of halfword vectors, in place;
 * shift amount is an immediate (MSA srari.h). */
#define SRARI_H2(RTYPE, in0, in1, shift)             \
{                                                    \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

/* Four-input variant. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
/* Arithmetic right shift with rounding of word vectors, in place;
 * shift amount is an immediate (MSA srari.w). */
#define SRARI_W2(RTYPE, in0, in1, shift)             \
{                                                    \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

/* Four-input variant. */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
/* Pairwise element-wise products: out0 = in0 * in1, out1 = in2 * in3.
 * Type-generic (plain * operator, so it works on scalars and on GCC
 * vector-extension types alike).
 * NOTE(review): MUL2's body lines were lost in the extracted listing;
 * restored from MUL4's usage — verify against the upstream header. */
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
/* Four-output variant of MUL2. */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{                                                                            \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
}
/* Pairwise element-wise sums: out0 = in0 + in1, out1 = in2 + in3.
 * Type-generic (plain + operator).
 * NOTE(review): ADD2's body lines were lost in the extracted listing;
 * restored from ADD4's usage — verify against the upstream header. */
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
/* Four-output variant of ADD2. */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{                                                                            \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
}
/* Zero-extend the 16 unsigned bytes of 'in' to halfwords:
 * out0 gets the low 8 bytes, out1 the high 8 bytes (interleave with a
 * zero vector via ILVRL_B2_SH). */
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}
/* Sign-extend the 8 signed halfwords of 'in' to words: clti_s.h(in, 0)
 * produces all-ones lanes exactly where 'in' is negative, which when
 * interleaved as the upper halfword acts as the sign extension.
 * out0 gets the low 4 elements, out1 the high 4.
 * NOTE(review): the 'v8i16 tmp_m;' declaration line was missing from the
 * extracted listing (tmp_m is used but never declared); restored. */
#define UNPCK_SH_SW(in, out0, out1)           \
{                                             \
    v8i16 tmp_m;                              \
                                              \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);    \
    ILVRL_H2_SW(tmp_m, in, out0, out1);       \
}
1911 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
/* Transpose a 4x4 block of bytes: the low 4 bytes of each of in0..in3
 * become the low 4 bytes of out0..out3. After the interleaves, out0 holds
 * all four transposed rows back-to-back; each subsequent row is obtained
 * by shifting the previous result left by 4 bytes (sldi.b with a zero
 * vector). */
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
/* Transpose an 8x4 block of bytes: the low 4 bytes of each of the eight
 * inputs become the low 8 bytes of the four outputs. Built from even-word
 * interleaves, byte interleaves, then halfword/word/doubleword regrouping. */
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
/* Transpose a 16x8 block of unsigned bytes: sixteen input rows (the even
 * doubleword of each is used, per ilvev.d) become eight full 16-byte
 * output rows out0..out7. The out* vectors double as scratch during the
 * interleave stages.
 * Fix: the original computed the final tmp2_m and tmp3_m ilvod.h
 * interleaves twice each on consecutive lines (pure dead stores); the
 * redundant duplicate assignments are removed. */
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3,                        \
                            out4, out5, out6, out7)                        \
{                                                                          \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
                                                                           \
    /* Pair row i with row i+8 on even doublewords. */                     \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                           \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                         \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                         \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                         \
                                                                           \
    /* Split each pair into even-byte and odd-byte interleaves. */         \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);            \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);            \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);            \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);            \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);              \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);            \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);              \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);            \
                                                                           \
    /* Regroup on halfwords, then words, to produce the final rows. */     \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);               \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);        \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);            \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);           \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);        \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);        \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
}
/* Transpose an 8x8 block of halfwords (in0..in7 rows -> out0..out7 rows)
 * using halfword interleaves followed by even/odd doubleword packs.
 * NOTE(review): the 'v8i16 s0_m, s1_m;' declaration line was missing from
 * the extracted listing (s0_m/s1_m are used but never declared); restored. */
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    /* Even doublewords make the even rows, odd doublewords the odd. */ \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
/* Transpose a 4x4 block of words: word interleaves build column pairs,
 * then doubleword interleaves assemble the four transposed rows. */
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
/* Pack even bytes of (in1, in0) and xor each byte with 128 (signed ->
 * unsigned byte conversion); evaluates to the resulting v16u8 via a GCC
 * statement expression.
 * NOTE(review): the statement-expression wrapper, the 'v16u8 out_m;'
 * declaration and the trailing result expression were lost in the
 * extracted listing; restored — verify against the upstream header. */
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
/* Pack even bytes of two input pairs, then store four 4-byte words to
 * memory at pdst, pdst+stride, pdst+2*stride, pdst+3*stride (via SW4).
 * Word indices 0 and 2 of each packed vector select the even-byte lanes
 * of in0..in3 respectively. */
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}