#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>

#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
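
/* Usage sketch (an illustrative addition, not part of the original header):
 * copy one 16-byte row through the typed vector load/store wrappers above.
 * Assumes an MSA-enabled build (-mmsa); the function name is hypothetical. */
#if 0
static void example_copy_row(const uint8_t *src, uint8_t *dst)
{
    v16u8 row;

    row = LD_UB(src);  /* load 16 unsigned bytes */
    ST_UB(row, dst);   /* store them back */
}
#endif
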
/* Scalar load/store helpers: MIPS release 6 handles unaligned lw/ld/sh/sw/sd
 * in hardware, older ISA levels use ulw/uld/ush/usw (or a pair of words). */
#if (__mips_isa_rev >= 6)
    #define LW(psrc)                           \
    ( {                                        \
        uint8_t *psrc_m = (uint8_t *) (psrc);  \
        uint32_t val_m;                        \
        __asm__ volatile (                     \
            "lw  %[val_m],  %[psrc_m]  \n\t"   \
            : [val_m] "=r" (val_m)             \
            : [psrc_m] "m" (*psrc_m)           \
        );                                     \
        val_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                           \
        ( {                                        \
            uint8_t *psrc_m = (uint8_t *) (psrc);  \
            uint64_t val_m = 0;                    \
            __asm__ volatile (                     \
                "ld  %[val_m],  %[psrc_m]  \n\t"   \
                : [val_m] "=r" (val_m)             \
                : [psrc_m] "m" (*psrc_m)           \
            );                                     \
            val_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
            val0_m = LW(psrc_ld_m);                                   \
            val1_m = LW(psrc_ld_m + 4);                               \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint16_t val_m = (val);                \
        __asm__ volatile (                     \
            "sh  %[val_m],  %[pdst_m]  \n\t"   \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SW(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint32_t val_m = (val);                \
        __asm__ volatile (                     \
            "sw  %[val_m],  %[pdst_m]  \n\t"   \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SD(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint64_t val_m = (val);                \
        __asm__ volatile (                     \
            "sd  %[val_m],  %[pdst_m]  \n\t"   \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }
#else  // !(__mips_isa_rev >= 6)
    #define LW(psrc)                           \
    ( {                                        \
        uint8_t *psrc_m = (uint8_t *) (psrc);  \
        uint32_t val_m;                        \
        __asm__ volatile (                     \
            "ulw  %[val_m],  %[psrc_m]  \n\t"  \
            : [val_m] "=r" (val_m)             \
            : [psrc_m] "m" (*psrc_m)           \
        );                                     \
        val_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                           \
        ( {                                        \
            uint8_t *psrc_m = (uint8_t *) (psrc);  \
            uint64_t val_m = 0;                    \
            __asm__ volatile (                     \
                "uld  %[val_m],  %[psrc_m]  \n\t"  \
                : [val_m] "=r" (val_m)             \
                : [psrc_m] "m" (*psrc_m)           \
            );                                     \
            val_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                  \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
            val0_m = LW(psrc_ld_m);                                   \
            val1_m = LW(psrc_ld_m + 4);                               \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
            val_m;                                                    \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint16_t val_m = (val);                \
        __asm__ volatile (                     \
            "ush  %[val_m],  %[pdst_m]  \n\t"  \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SW(val, pdst)                      \
    {                                          \
        uint8_t *pdst_m = (uint8_t *) (pdst);  \
        uint32_t val_m = (val);                \
        __asm__ volatile (                     \
            "usw  %[val_m],  %[pdst_m]  \n\t"  \
            : [pdst_m] "=m" (*pdst_m)          \
            : [val_m] "r" (val_m)              \
        );                                     \
    }

    #define SD(val, pdst)                                          \
    {                                                              \
        uint8_t *pdst_m1 = (uint8_t *) (pdst);                     \
        uint32_t val0_m, val1_m;                                   \
        val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
        SW(val0_m, pdst_m1);                                       \
        SW(val1_m, pdst_m1 + 4);                                   \
    }
#endif // (__mips_isa_rev >= 6)
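
/* Usage sketch (an illustrative addition): copy a 32-bit and a 64-bit value
 * through byte pointers that may be unaligned; LW/LD/SW/SD expand to the
 * instruction forms selected by the ISA checks above. Hypothetical name. */
#if 0
static void example_copy_scalars(const uint8_t *src, uint8_t *dst)
{
    uint32_t w = LW(src);      /* 32-bit load */
    uint64_t d = LD(src + 4);  /* 64-bit load */

    SW(w, dst);
    SD(d, dst + 4);
}
#endif
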
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW(psrc);                               \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD(psrc);                   \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}

#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
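
/* Usage sketch (an illustrative addition): copy a 4x4 pixel block with the
 * strided scalar helpers above. Hypothetical function name. */
#if 0
static void example_copy_4x4(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride)
{
    uint32_t w0, w1, w2, w3;

    LW4(src, src_stride, w0, w1, w2, w3);
    SW4(w0, w1, w2, w3, dst, dst_stride);
}
#endif
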
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_B2(RTYPE, (psrc), stride, out0, out1);               \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_B6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_B7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
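
/* Usage sketch (an illustrative addition): fetch eight rows of 16 pixels in
 * one call, e.g. at the top of an 8-row filter loop. Hypothetical name. */
#if 0
static void example_load_8_rows(const uint8_t *src, int32_t stride)
{
    v16u8 r0, r1, r2, r3, r4, r5, r6, r7;

    LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
    /* ... process r0..r7 ... */
}
#endif
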
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_H8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)

#define LD4x4_SH(psrc, out0, out1, out2, out3)                \
{                                                             \
    out0 = LD_SH(psrc);                                       \
    out2 = LD_SH(psrc + 8);                                   \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);  \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2);  \
}

#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride)                                         \
{                                                                   \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_H(RTYPE, in0, (pdst));                 \
    ST_H(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_H2(RTYPE, in0, in1, (pdst), stride);               \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)

#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}

#define ST_SW8(in0, in1, in2, in3, in4, in5, in6, in7,  \
               pdst, stride)                            \
{                                                       \
    ST_SW2(in0, in1, (pdst), stride);                   \
    ST_SW2(in2, in3, (pdst) + 2 * stride, stride);      \
    ST_SW2(in4, in5, (pdst) + 4 * stride, stride);      \
    ST_SW2(in6, in7, (pdst) + 6 * stride, stride);      \
}
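
/* Usage sketch (an illustrative addition): load, then store back, an 8x8
 * block of int16_t coefficients; the stride is in int16_t units.
 * Hypothetical function name. */
#if 0
static void example_rw_8x8_h(int16_t *coeffs, int32_t stride)
{
    v8i16 r0, r1, r2, r3, r4, r5, r6, r7;

    LD_SH8(coeffs, stride, r0, r1, r2, r3, r4, r5, r6, r7);
    /* ... transform the rows here ... */
    ST_SH8(r0, r1, r2, r3, r4, r5, r6, r7, coeffs, stride);
}
#endif
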
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}

#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}

#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}

#define ST4x8_UB(in0, in1, pdst, stride)                            \
{                                                                   \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                         \
                                                                    \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);               \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride);  \
}
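
/* Usage sketch (an illustrative addition): write a 4x4 pixel block, taking
 * rows 0/1 from word lanes 0/1 of vec0 and rows 2/3 from word lanes 0/1 of
 * vec1. Hypothetical function name. */
#if 0
static void example_store_4x4(v16u8 vec0, v16u8 vec1,
                              uint8_t *dst, int32_t stride)
{
    ST4x4_UB(vec0, vec1, 0, 1, 0, 1, dst, stride);
}
#endif
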
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}

#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
                                             \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}

#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}

#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}

#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m =  __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m =  __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)             \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)             \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)
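
/* Usage sketch (an illustrative addition): rounding average of two 16x4
 * blocks, the usual bi-prediction / half-pel pattern. Each AVER_UB4 output
 * is the average of one (a, b) row pair. Hypothetical function name. */
#if 0
static void example_avg_16x4(const uint8_t *src0, const uint8_t *src1,
                             uint8_t *dst, int32_t stride)
{
    v16u8 a0, a1, a2, a3, b0, b1, b2, b3;

    LD_UB4(src0, stride, a0, a1, a2, a3);
    LD_UB4(src1, stride, b0, b1, b2, b3);
    AVER_UB4_UB(a0, b0, a1, b1, a2, b2, a3, b3, a0, a1, a2, a3);
    ST_UB4(a0, a1, a2, a3, dst, stride);
}
#endif
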
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)
#define SLDI_B2_0_SB(...) SLDI_B2_0(v16i8, __VA_ARGS__)
#define SLDI_B2_0_SW(...) SLDI_B2_0(v4i32, __VA_ARGS__)

#define SLDI_B3_0(RTYPE, in0, in1, in2, out0, out1, out2, slide_val)      \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
                                                                          \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);                    \
    out2 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in2, slide_val);  \
}
#define SLDI_B3_0_UB(...) SLDI_B3_0(v16u8, __VA_ARGS__)
#define SLDI_B3_0_SB(...) SLDI_B3_0(v16i8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_UB(...) SLDI_B4_0(v16u8, __VA_ARGS__)
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)
#define SLDI_B4_0_SH(...) SLDI_B4_0(v8i16, __VA_ARGS__)

#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)

#define SLDI_B3(RTYPE, in0_0, in0_1, in0_2, in1_0, in1_1, in1_2,           \
                out0, out1, out2, slide_val)                               \
{                                                                          \
    SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)      \
    out2 = (RTYPE) __msa_sldi_b((v16i8) in0_2, (v16i8) in1_2, slide_val);  \
}
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)      \
{                                                                         \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
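
/* Usage sketch (an illustrative addition): reverse the bytes of a vector
 * with VSHF_B2_SB. Each mask value selects one byte of the concatenated
 * source pair, so 15..0 picks `in` backwards. Hypothetical name. */
#if 0
static v16i8 example_reverse_bytes(v16i8 in)
{
    v16i8 mask = { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
    v16i8 r0, r1;

    VSHF_B2_SB(in, in, in, in, mask, mask, r0, r1);
    (void) r1;  /* the second result is unused here */
    return r0;
}
#endif
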
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
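
/* Usage sketch (an illustrative addition): per-pair multiply-accumulate of
 * signed halfwords; each output word is in[2k]*coeff[2k] + in[2k+1]*coeff[2k+1].
 * Hypothetical function name. */
#if 0
static void example_filter_rows(v8i16 row0, v8i16 row1, v8i16 coeff,
                                v4i32 *acc0, v4i32 *acc1)
{
    v4i32 d0, d1;

    DOTP_SH2_SW(row0, row1, coeff, coeff, d0, d1);
    *acc0 = d0;
    *acc1 = d1;
}
#endif
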
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
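
/* Usage sketch (an illustrative addition): unlike DOTP, DPADD accumulates
 * on top of the current values of the out operands, which is why out0/out1
 * are read-modify-write above. Hypothetical function name. */
#if 0
static void example_two_tap_acc(v8i16 src0, v8i16 src1, v8i16 filt,
                                v4i32 *acc0, v4i32 *acc1)
{
    DPADD_SH2_SW(src0, src1, filt, filt, *acc0, *acc1);
}
#endif
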
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)
#define CLIP_SH(in, min, max)                           \
( {                                                     \
    v8i16 out_m;                                        \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);     \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m);  \
    out_m;                                              \
} )
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
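
/* Usage sketch (an illustrative addition): the CLIP macros are GNU C
 * statement expressions, so they can be used as values. Hypothetical name. */
#if 0
static v8i16 example_clip_pixels(v8i16 in)
{
    return CLIP_SH_0_255(in);  /* per-lane clamp to [0, 255] */
}
#endif
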
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m = res0_m + res1_m;                         \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )

#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m = res0_m + res1_m;                            \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
    sad_m;                                                      \
} )
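
/* Usage sketch (an illustrative addition): sum of absolute differences for
 * a 16x2 block, reduced to a scalar with HADD_UH_U32 defined above.
 * Hypothetical function name. */
#if 0
static uint32_t example_sad_16x2(const uint8_t *src, const uint8_t *ref,
                                 int32_t stride)
{
    v16u8 s0, s1, r0, r1;
    v8u16 sad;

    LD_UB2(src, stride, s0, s1);
    LD_UB2(ref, stride, r0, r1);
    sad = SAD_UB2_UH(s0, s1, r0, r1);
    return HADD_UH_U32(sad);
}
#endif
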
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
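
/* Usage sketch (an illustrative addition): gather two 8-byte rows into one
 * vector with 64-bit scalar loads plus lane inserts, a common pattern for
 * width-8 code paths. Hypothetical function name. */
#if 0
static v16u8 example_gather_8x2(const uint8_t *src, int32_t stride)
{
    uint64_t row0 = LD(src);
    uint64_t row1 = LD(src + stride);
    v16u8 out = { 0 };

    INSERT_D2_UB(row0, row1, out);
    return out;
}
#endif
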
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)          \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3));  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));          \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
}
#define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
#define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)               \
{                                                           \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
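
/* Usage sketch (an illustrative addition): interleaving a vector with zero
 * zero-extends unsigned bytes to halfwords; ILVRL_B2_UH yields the low and
 * high halves in one call. Hypothetical function name. */
#if 0
static void example_unpack_u8_to_u16(v16u8 in, v8u16 *lo, v8u16 *hi)
{
    v16i8 zero = { 0 };

    ILVRL_B2_UH(zero, in, *lo, *hi);
}
#endif
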
#define MAXI_SH2(RTYPE, in0, in1, max_val)                 \
{                                                          \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val));  \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val));  \
}
#define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
#define SAT_UH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val);  \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
#define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)

#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)

#define SAT_SH2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val);  \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)

#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)          \
{                                                       \
    SAT_SH2(RTYPE, in0, in1, sat_val);                  \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val);  \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)

#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)

#define SAT_SW2(RTYPE, in0, in1, sat_val)               \
{                                                       \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val);  \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val);  \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H3(RTYPE, in, idx0, idx1, idx2,        \
                  out0, out1, out2)                   \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2);  \
}
#define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
#define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)

#define SPLATI_W2(RTYPE, in, stidx, out0, out1)            \
{                                                          \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);      \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1));  \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3);  \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                        \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);              \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)

#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3);  \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)

#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)

#define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1)      \
{                                                            \
    out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3);  \
}
#define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
#define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
#define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
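
/* Usage sketch (an illustrative addition): PCKEV_B keeps the even-indexed
 * bytes, i.e. the low byte of every halfword lane, so two v8i16 rows pack
 * down into two 16-byte pixel rows. Hypothetical function name. */
#if 0
static void example_pack_four_rows(v8i16 r0, v8i16 r1, v8i16 r2, v8i16 r3,
                                   uint8_t *dst, int32_t stride)
{
    v16u8 out0, out1;

    PCKEV_B2_UB(r1, r0, r3, r2, out0, out1);
    ST_UB2(out0, out1, dst, stride);
}
#endif
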
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
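
/* Description : Addition of signed halfword elements with saturation.
   Details     : Each element of 'in0' is added to the corresponding
                 element of 'in1', with results outside the signed 16-bit
                 range clamped; 'out0' and 'out1' receive the pairwise sums.
*/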
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3);  \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
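
/* Description : In-place shift left (SLLI_4V) and arithmetic shift right
                 (SRA_4V) of four vectors, using the C shift operators on
                 vector types.
*/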
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 = in0 << shift;                     \
    in1 = in1 << shift;                     \
    in2 = in2 << shift;                     \
    in3 = in3 << shift;                     \
}

#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 = in0 >> shift;                    \
    in1 = in1 >> shift;                    \
    in2 = in2 >> shift;                    \
    in3 = in3 >> shift;                    \
}
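
/* Description : Logical shift right of halfword elements with per-element
                 shift amounts.
   Details     : Each halfword of the four inputs is shifted right by the
                 value in the corresponding element of the 'shift' vector,
                 with zeros shifted in from the left.
*/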
 2110 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \ 
 2112     in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \ 
 2113     in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \ 
 2114     in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \ 
 2115     in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \ 
 2117 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__) 
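
/* Description : Arithmetic shift right of halfword elements with rounding,
                 shift amounts taken per element from the 'shift' vector.
   Details     : Roughly out = (in + (1 << (s - 1))) >> s per element for
                 s > 0, preserving the sign.
*/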
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)

#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    SRAR_H2(RTYPE, in0, in1, shift)                          \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)

#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_H2(RTYPE, in0, in1, shift)                \
    SRAR_H2(RTYPE, in2, in3, shift)                \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)

#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)

#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                  \
    SRAR_W2(RTYPE, in0, in1, shift)                \
    SRAR_W2(RTYPE, in2, in3, shift)                \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
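
/* Description : Arithmetic shift right with rounding, immediate shift
                 amount; SRARI_H* operates on halfwords, SRARI_W* on words.
   Example     : illustrative only, with hypothetical v8i16 accumulators:
                 SRARI_H2_SH(sum0, sum1, 6) computes (x + 32) >> 6 per
                 element, a common final scaling step in filters.
*/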
#define SRARI_H2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)

#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_H2(RTYPE, in0, in1, shift);               \
    SRARI_H2(RTYPE, in2, in3, shift);               \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)

#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)

#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)  \
{                                                   \
    SRARI_W2(RTYPE, in0, in1, shift);               \
    SRARI_W2(RTYPE, in2, in3, shift);               \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
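
/* Description : Element-wise multiply / add / subtract of vector pairs,
                 using the C operators on vector types.
   Details     : MUL2, ADD2 and SUB2 each combine two input pairs into two
                 outputs; the 4-output variants cover four pairs.
*/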
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 * in1;                         \
    out1 = in2 * in3;                         \
}
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    MUL2(in0, in1, in2, in3, out0, out1);                                     \
    MUL2(in4, in5, in6, in7, out2, out3);                                     \
}

#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 + in1;                         \
    out1 = in2 + in3;                         \
}
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    ADD2(in0, in1, in2, in3, out0, out1);                                     \
    ADD2(in4, in5, in6, in7, out2, out3);                                     \
}

#define SUB2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = in0 - in1;                         \
    out1 = in2 - in3;                         \
}
#define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)  \
{                                                                             \
    out0 = in0 - in1;                                                         \
    out1 = in2 - in3;                                                         \
    out2 = in4 - in5;                                                         \
    out3 = in6 - in7;                                                         \
}
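
/* Description : Unpack vector elements to the next wider element type.
   Details     : UNPCK_R_SH_SW sign-extends the right-half halfwords of
                 'in' to words. UNPCK_SB_SH sign-extends bytes to halfwords
                 by interleaving with a sign mask from __msa_clti_s_b;
                 UNPCK_UB_SH zero-extends by interleaving with zero;
                 UNPCK_SH_SW sign-extends halfwords to words the same way.
*/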
#define UNPCK_R_SH_SW(in, out)                       \
{                                                    \
    v8i16 sign_m;                                    \
                                                     \
    sign_m = __msa_clti_s_h((v8i16) in, 0);          \
    out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in);  \
}

#define UNPCK_SB_SH(in, out0, out1)                  \
{                                                    \
    v16i8 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_b((v16i8) in, 0);           \
    ILVRL_B2_SH(tmp_m, in, out0, out1);              \
}

#define UNPCK_UB_SH(in, out0, out1)                   \
{                                                     \
    v16i8 zero_m = { 0 };                             \
                                                      \
    ILVRL_B2_SH(zero_m, in, out0, out1);              \
}

#define UNPCK_SH_SW(in, out0, out1)                  \
{                                                    \
    v8i16 tmp_m;                                     \
                                                     \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);           \
    ILVRL_H2_SW(tmp_m, in, out0, out1);              \
}
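
/* Description : SWAP exchanges two vectors in place via the XOR trick;
                 the BUTTERFLY_N macros form mirrored sums and differences.
   Details     : For BUTTERFLY_N, the first N/2 outputs are in0 + in(N-1),
                 in1 + in(N-2), ... and the last N/2 outputs are the
                 matching differences, the radix-2 step used in DCT and
                 Hadamard style transforms.
*/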
#define SWAP(in0, in1)  \
{                       \
    in0 = in0 ^ in1;    \
    in1 = in0 ^ in1;    \
    in0 = in0 ^ in1;    \
}

#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = in0 + in3;                                            \
    out1 = in1 + in2;                                            \
    out2 = in1 - in2;                                            \
    out3 = in0 - in3;                                            \
}

#define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7,          \
                    out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                    \
    out0 = in0 + in7;                                                \
    out1 = in1 + in6;                                                \
    out2 = in2 + in5;                                                \
    out3 = in3 + in4;                                                \
    out4 = in3 - in4;                                                \
    out5 = in2 - in5;                                                \
    out6 = in1 - in6;                                                \
    out7 = in0 - in7;                                                \
}

#define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7,                \
                     in8, in9,  in10, in11, in12, in13, in14, in15,         \
                     out0, out1, out2, out3, out4, out5, out6, out7,        \
                     out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                           \
    out0 = in0 + in15;                                                      \
    out1 = in1 + in14;                                                      \
    out2 = in2 + in13;                                                      \
    out3 = in3 + in12;                                                      \
    out4 = in4 + in11;                                                      \
    out5 = in5 + in10;                                                      \
    out6 = in6 + in9;                                                       \
    out7 = in7 + in8;                                                       \
                                                                            \
    out8 = in7 - in8;                                                       \
    out9 = in6 - in9;                                                       \
    out10 = in5 - in10;                                                     \
    out11 = in4 - in11;                                                     \
    out12 = in3 - in12;                                                     \
    out13 = in2 - in13;                                                     \
    out14 = in1 - in14;                                                     \
    out15 = in0 - in15;                                                     \
}
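
/* Description : Transpose helpers for blocks of byte, halfword and word
                 elements, built from the interleave (ilv*), pack (pck*)
                 and shift (sldi) primitives above.
   Details     : The numbers in the name give the block geometry, e.g.
                 TRANSPOSE8x4_UB reads a block of eight 4-byte rows and
                 writes the transposed columns into four output vectors.
*/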
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}

#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
                                                                        \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}
#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)

#define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                        out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                        \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
                                                                         \
    ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5,                   \
               tmp0_m, tmp1_m, tmp2_m, tmp3_m);                          \
    ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m);                         \
    ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m);                         \
    ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2);                         \
    ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6);                         \
    SLDI_B2_0(RTYPE, out0, out2, out1, out3, 8);                         \
    SLDI_B2_0(RTYPE, out4, out6, out5, out7, 8);                         \
}
#define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
#define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)

#define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3)                        \
{                                                                          \
    v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
                                                                           \
    ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m);                      \
    out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m);                      \
    out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m);                          \
                                                                           \
    ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m);                     \
    tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m);                     \
    tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m);                                \
                                                                           \
    ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m);               \
    out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
                                                                           \
    tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1);            \
    tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m);        \
    out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
}

#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,          \
                            in8, in9, in10, in11, in12, in13, in14, in15,    \
                            out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
                                                                             \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                             \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                           \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                           \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                           \
                                                                             \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);              \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);              \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);              \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);              \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);                \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);              \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);                \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);              \
                                                                             \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);                 \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);              \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);             \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
                                                                             \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);          \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);          \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);            \
}

#define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
                                                                        \
    ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_W2_SH(s1_m, s0_m, out0, out2);                                \
    out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0);            \
    out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                       out0, out1, out2, out3, out4, out5, out6, out7)  \
{                                                                       \
    v8i16 s0_m, s1_m;                                                   \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                               \
                                                                        \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                            \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                            \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                            \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                         \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                            \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,     \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);       \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);       \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);       \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);       \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)

#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
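
/* Description : Average adjacent input vectors and store the results.
   Details     : AVE_* use truncating averages ((a + b) >> 1); AVER_* use
                 rounded averages ((a + b + 1) >> 1). The 8x4 variants copy
                 the low doubleword of each averaged vector to four
                 stride-separated rows via SD4; the 16x4 variants store the
                 full vectors via ST_UB4. The AVER_DST_* forms additionally
                 load four rows from 'pdst' and average them in.
*/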
#define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
                                                                            \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                       \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                       \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                       \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                       \
                                                                            \
    out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0);                             \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                      \
}

#define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
                                                                             \
    tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1);                        \
    tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3);                        \
    tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5);                        \
    tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7);                        \
                                                                             \
    ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride);                    \
}

#define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                            \
    uint64_t out0_m, out1_m, out2_m, out3_m;                                 \
    v16u8 tp0_m, tp1_m, tp2_m, tp3_m;                                        \
                                                                             \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                      \
                tp0_m, tp1_m, tp2_m, tp3_m);                                 \
                                                                             \
    out0_m = __msa_copy_u_d((v2i64) tp0_m, 0);                               \
    out1_m = __msa_copy_u_d((v2i64) tp1_m, 0);                               \
    out2_m = __msa_copy_u_d((v2i64) tp2_m, 0);                               \
    out3_m = __msa_copy_u_d((v2i64) tp3_m, 0);                               \
    SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride);                       \
}

#define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                             \
    v16u8 t0_m, t1_m, t2_m, t3_m;                                             \
                                                                              \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,                       \
                t0_m, t1_m, t2_m, t3_m);                                      \
    ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride);                             \
}

#define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                          pdst, stride)                            \
{                                                                  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                          \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                          \
                                                                   \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);          \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,            \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                   \
    AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}

#define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7,  \
                           pdst, stride)                            \
{                                                                   \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                           \
    v16u8 dst0_m, dst1_m, dst2_m, dst3_m;                           \
                                                                    \
    LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m);           \
    AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7,             \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                    \
    AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m,                  \
                   dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride);   \
}
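
/* Description : Add a block of halfword residuals to 4x4 unsigned byte
                 pixels and store the result.
   Details     : Four 32-bit pixel rows are loaded from 'pdst',
                 zero-extended to halfwords, added to the residuals in
                 'in0'..'in3', clipped to 0..255, packed back to bytes and
                 stored with SW4.
*/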
#define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride)         \
{                                                                 \
    uint32_t src0_m, src1_m, src2_m, src3_m;                      \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                         \
    v16i8 dst0_m = { 0 };                                         \
    v16i8 dst1_m = { 0 };                                         \
    v16i8 zero_m = { 0 };                                         \
                                                                  \
    ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m)                \
    LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m);            \
    INSERT_W2_SB(src0_m, src1_m, dst0_m);                         \
    INSERT_W2_SB(src2_m, src3_m, dst1_m);                         \
    ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m);   \
    ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m);         \
    CLIP_SH2_0_255(res0_m, res1_m);                               \
    PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m);  \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) dst0_m, 0);                   \
    out1_m = __msa_copy_u_w((v4i32) dst0_m, 1);                   \
    out2_m = __msa_copy_u_w((v4i32) dst1_m, 0);                   \
    out3_m = __msa_copy_u_w((v4i32) dst1_m, 1);                   \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);            \
}
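
/* Description : Dot product of three byte-vector pairs, accumulated into
                 one signed halfword vector.
   Details     : Halfword i of the result accumulates in[2i]*coeff[2i] +
                 in[2i+1]*coeff[2i+1] across the three input/coefficient
                 pairs; the last pair is added with saturation, so each
                 element is in effect a 6-tap filtered value.
   Example     : illustrative only, with hypothetical v16i8 vectors:
                 v8i16 sum = DPADD_SH3_SH(vec0, vec1, vec2,
                                          filt0, filt1, filt2);
*/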
#define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)         \
( {                                                                 \
    v8i16 tmp1_m;                                                   \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    tmp1_m = __msa_dotp_s_h((v16i8) in2, (v16i8) coeff2);           \
    out0_m = __msa_adds_s_h(out0_m, tmp1_m);                        \
                                                                    \
    out0_m;                                                         \
} )

#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
                                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )

#define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3,                    \
                                dst0, dst1, dst2, dst3, pdst, stride)  \
{                                                                      \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                              \
    uint8_t *pdst_m = (uint8_t *) (pdst);                              \
                                                                       \
    tmp0_m = PCKEV_XORI128_UB(in0, in1);                               \
    tmp1_m = PCKEV_XORI128_UB(in2, in3);                               \
    ILVR_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);       \
    ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride);                          \
}

#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}

#define PCKEV_ST_SB(in0, in1, pdst)                   \
{                                                     \
    v16i8 tmp_m;                                      \
                                                      \
    tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    ST_SB(tmp_m, (pdst));                             \
}
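
/* Description : Horizontal 2-tap filter on unsigned byte data, returning
                 one v8u16 vector of filtered halfwords.
   Details     : 'mask' gathers pixel pairs from 'in0'/'in1' via
                 __msa_vshf_b; each pair is combined with the two filter
                 taps in 'coeff' by an unsigned dot product, then rounded
                 down by 'shift' bits (__msa_srari_h) and clamped with
                 __msa_sat_u_h.
*/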
#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift)            \
( {                                                                 \
    v16i8 tmp0_m;                                                   \
    v8u16 tmp1_m;                                                   \
                                                                    \
    tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0);  \
    tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff);         \
    tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift);          \
    tmp1_m = __msa_sat_u_h(tmp1_m, shift);                          \
                                                                    \
    tmp1_m;                                                         \
} )