FFmpeg
generic_macros_msa.h
/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)

#if HAVE_MIPS32R6 || HAVE_MIPS64R6
    #define LH(psrc) \
    ( { \
        uint16_t val_lh_m = *(uint16_t *)(psrc); \
        val_lh_m; \
    } )

    #define LW(psrc) \
    ( { \
        uint32_t val_lw_m = *(uint32_t *)(psrc); \
        val_lw_m; \
    } )

    #if (__mips == 64)
        #define LD(psrc) \
        ( { \
            uint64_t val_ld_m = *(uint64_t *)(psrc); \
            val_ld_m; \
        } )
    #else // !(__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint32_t val0_ld_m, val1_ld_m; \
            uint64_t val_ld_m = 0; \
            \
            val0_ld_m = LW(psrc_ld_m); \
            val1_ld_m = LW(psrc_ld_m + 4); \
            \
            val_ld_m = (uint64_t) (val1_ld_m); \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
            \
            val_ld_m; \
        } )
    #endif // (__mips == 64)

    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else // !HAVE_MIPS32R6 && !HAVE_MIPS64R6
    #define LH(psrc) \
    ( { \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc); \
        uint16_t val_lh_m; \
        \
        __asm__ volatile ( \
            "ulh %[val_lh_m], %[psrc_lh_m] \n\t" \
            \
            : [val_lh_m] "=r" (val_lh_m) \
            : [psrc_lh_m] "m" (*psrc_lh_m) \
        ); \
        \
        val_lh_m; \
    } )

    #define LW(psrc) \
    ( { \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc); \
        uint32_t val_lw_m; \
        \
        __asm__ volatile ( \
            "lwr %[val_lw_m], 0(%[psrc_lw_m]) \n\t" \
            "lwl %[val_lw_m], 3(%[psrc_lw_m]) \n\t" \
            \
            : [val_lw_m] "=&r" (val_lw_m) \
            : [psrc_lw_m] "r" (psrc_lw_m) \
        ); \
        \
        val_lw_m; \
    } )

    #if (__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint64_t val_ld_m = 0; \
            \
            __asm__ volatile ( \
                "ldr %[val_ld_m], 0(%[psrc_ld_m]) \n\t" \
                "ldl %[val_ld_m], 7(%[psrc_ld_m]) \n\t" \
                \
                : [val_ld_m] "=&r" (val_ld_m) \
                : [psrc_ld_m] "r" (psrc_ld_m) \
            ); \
            \
            val_ld_m; \
        } )
    #else // !(__mips == 64)
        #define LD(psrc) \
        ( { \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc); \
            uint32_t val0_ld_m, val1_ld_m; \
            uint64_t val_ld_m = 0; \
            \
            val0_ld_m = LW(psrc_ld_m); \
            val1_ld_m = LW(psrc_ld_m + 4); \
            \
            val_ld_m = (uint64_t) (val1_ld_m); \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000); \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m); \
            \
            val_ld_m; \
        } )
    #endif // (__mips == 64)

    #define SH(val, pdst) \
    { \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst); \
        uint16_t val_sh_m = (val); \
        \
        __asm__ volatile ( \
            "ush %[val_sh_m], %[pdst_sh_m] \n\t" \
            \
            : [pdst_sh_m] "=m" (*pdst_sh_m) \
            : [val_sh_m] "r" (val_sh_m) \
        ); \
    }

    #define SW(val, pdst) \
    { \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst); \
        uint32_t val_sw_m = (val); \
        \
        __asm__ volatile ( \
            "usw %[val_sw_m], %[pdst_sw_m] \n\t" \
            \
            : [pdst_sw_m] "=m" (*pdst_sw_m) \
            : [val_sw_m] "r" (val_sw_m) \
        ); \
    }

    #define SD(val, pdst) \
    { \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst); \
        uint32_t val0_sd_m, val1_sd_m; \
        \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF); \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
        \
        SW(val0_sd_m, pdst_sd_m); \
        SW(val1_sd_m, pdst_sd_m + 4); \
    }
#endif // HAVE_MIPS32R6 || HAVE_MIPS64R6

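/* Illustrative usage sketch: copying one 64-bit value between possibly
   unaligned addresses with LD()/SD(). The helper name copy_u64_unaligned
   is hypothetical, not part of the FFmpeg API. */
static inline void copy_u64_unaligned(uint8_t *dst, uint8_t *src)
{
    uint64_t val;

    val = LD(src); /* pre-R6: ldr/ldl or two LW; R6 handles misalignment */
    SD(val, dst);  /* matching unaligned 64-bit store */
}
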
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + stride); \
    out2 = LW((psrc) + 2 * stride); \
    out3 = LW((psrc) + 3 * stride); \
}

#define LW2(psrc, stride, out0, out1) \
{ \
    out0 = LW((psrc)); \
    out1 = LW((psrc) + stride); \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1) \
{ \
    out0 = LD((psrc)); \
    out1 = LD((psrc) + stride); \
}
#define LD4(psrc, stride, out0, out1, out2, out3) \
{ \
    LD2((psrc), stride, out0, out1); \
    LD2((psrc) + 2 * stride, stride, out2, out3); \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride) \
{ \
    SW(in0, (pdst)); \
    SW(in1, (pdst) + stride); \
    SW(in2, (pdst) + 2 * stride); \
    SW(in3, (pdst) + 3 * stride); \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride) \
{ \
    SD(in0, (pdst)); \
    SD(in1, (pdst) + stride); \
    SD(in2, (pdst) + 2 * stride); \
    SD(in3, (pdst) + 3 * stride); \
}

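/* Illustrative usage sketch: copying an 8-byte-wide, 4-row block between
   strided buffers with LD4()/SD4(). The helper name copy_8x4 is
   hypothetical. */
static inline void copy_8x4(uint8_t *dst, int32_t dst_stride,
                            uint8_t *src, int32_t src_stride)
{
    uint64_t out0, out1, out2, out3;

    LD4(src, src_stride, out0, out1, out2, out3);
    SD4(out0, out1, out2, out3, dst, dst_stride);
}
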
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from), stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1) \
{ \
    out0 = LD_V(RTYPE, (psrc)); \
    out1 = LD_V(RTYPE, (psrc) + stride); \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2) \
{ \
    LD_V2(RTYPE, (psrc), stride, out0, out1); \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride); \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3) \
{ \
    LD_V2(RTYPE, (psrc), stride, out0, out1); \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3); \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride); \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5); \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6) \
{ \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4); \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6); \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride, \
              out0, out1, out2, out3, out4, out5, out6, out7) \
{ \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3); \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7); \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride, \
               out0, out1, out2, out3, out4, out5, out6, out7, \
               out8, out9, out10, out11, out12, out13, out14, out15) \
{ \
    LD_V8(RTYPE, (psrc), stride, \
          out0, out1, out2, out3, out4, out5, out6, out7); \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride, \
          out8, out9, out10, out11, out12, out13, out14, out15); \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)

/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride) \
{ \
    ST_V(RTYPE, in0, (pdst)); \
    ST_V(RTYPE, in1, (pdst) + stride); \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride) \
{ \
    ST_V2(RTYPE, in0, in1, (pdst), stride); \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride); \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride); \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride); \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)

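/* Illustrative usage sketch: moving a 16x8 byte block through vector
   registers with LD_UB8()/ST_UB8(). The helper name copy_16x8 is
   hypothetical. */
static inline void copy_16x8(uint8_t *dst, int32_t dst_stride,
                             uint8_t *src, int32_t src_stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}
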
/* Description : Store halfword elements of vector with stride
   Arguments   : Inputs - in (source vector), pdst (destination pointer
                 to store to), stride
   Details     : Stores halfword element 'idx0' of 'in' to (pdst)
                 Stores halfword element 'idx1' of 'in' to (pdst + stride)
                 Similar for other elements
*/
#define ST_H1(in, idx, pdst) \
{ \
    uint16_t out0_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx); \
    SH(out0_m, (pdst)); \
}
#define ST_H2(in, idx0, idx1, pdst, stride) \
{ \
    uint16_t out0_m, out1_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx0); \
    out1_m = __msa_copy_u_h((v8i16) in, idx1); \
    SH(out0_m, (pdst)); \
    SH(out1_m, (pdst) + stride); \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint16_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_h((v8i16) in, idx0); \
    out1_m = __msa_copy_u_h((v8i16) in, idx1); \
    out2_m = __msa_copy_u_h((v8i16) in, idx2); \
    out3_m = __msa_copy_u_h((v8i16) in, idx3); \
    SH(out0_m, (pdst)); \
    SH(out1_m, (pdst) + stride); \
    SH(out2_m, (pdst) + 2 * stride); \
    SH(out3_m, (pdst) + 3 * stride); \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \
              idx6, idx7, pdst, stride) \
{ \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}

/* Description : Store word elements of vector with stride
   Arguments   : Inputs - in (source vector), pdst (destination pointer
                 to store to), stride
   Details     : Stores word element 'idx0' of 'in' to (pdst)
                 Stores word element 'idx1' of 'in' to (pdst + stride)
                 Similar for other elements
*/
#define ST_W1(in, idx, pdst) \
{ \
    uint32_t out0_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx); \
    SW(out0_m, (pdst)); \
}
#define ST_W2(in, idx0, idx1, pdst, stride) \
{ \
    uint32_t out0_m, out1_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx0); \
    out1_m = __msa_copy_u_w((v4i32) in, idx1); \
    SW(out0_m, (pdst)); \
    SW(out1_m, (pdst) + stride); \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint32_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_w((v4i32) in, idx0); \
    out1_m = __msa_copy_u_w((v4i32) in, idx1); \
    out2_m = __msa_copy_u_w((v4i32) in, idx2); \
    out3_m = __msa_copy_u_w((v4i32) in, idx3); \
    SW(out0_m, (pdst)); \
    SW(out1_m, (pdst) + stride); \
    SW(out2_m, (pdst) + 2 * stride); \
    SW(out3_m, (pdst) + 3 * stride); \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3, \
              idx4, idx5, idx6, idx7, pdst, stride) \
{ \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}

/* Description : Store double word elements of vector with stride
   Arguments   : Inputs - in (source vector), pdst (destination pointer
                 to store to), stride
   Details     : Stores double word element 'idx0' of 'in' to (pdst)
                 Stores double word element 'idx1' of 'in' to (pdst + stride)
                 Similar for other elements
*/
#define ST_D1(in, idx, pdst) \
{ \
    uint64_t out0_m; \
    out0_m = __msa_copy_u_d((v2i64) in, idx); \
    SD(out0_m, (pdst)); \
}
#define ST_D2(in, idx0, idx1, pdst, stride) \
{ \
    uint64_t out0_m, out1_m; \
    out0_m = __msa_copy_u_d((v2i64) in, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in, idx1); \
    SD(out0_m, (pdst)); \
    SD(out1_m, (pdst) + stride); \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0); \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1); \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2); \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3); \
    SD(out0_m, (pdst)); \
    SD(out1_m, (pdst) + stride); \
    SD(out2_m, (pdst) + 2 * stride); \
    SD(out3_m, (pdst) + 3 * stride); \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3, \
              idx4, idx5, idx6, idx7, pdst, stride) \
{ \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
}

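/* Illustrative usage sketch: writing one vector out as four 4-byte rows
   with ST_W4(), as 4-pixel-wide blocks are typically stored. The helper
   name store_4x4 is hypothetical. */
static inline void store_4x4(v16u8 vec, uint8_t *dst, int32_t stride)
{
    /* word elements 0..3 of 'vec' become rows 0..3 of the 4x4 block */
    ST_W4(vec, 0, 1, 2, 3, dst, stride);
}
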
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m), followed
                 by index 2 word element from the same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar for the remaining rows
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
{ \
    uint64_t out0_m, out1_m, out2_m, out3_m; \
    uint64_t out4_m, out5_m, out6_m, out7_m; \
    uint32_t out8_m, out9_m, out10_m, out11_m; \
    uint32_t out12_m, out13_m, out14_m, out15_m; \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst); \
    \
    out0_m = __msa_copy_u_d((v2i64) in0, 0); \
    out1_m = __msa_copy_u_d((v2i64) in1, 0); \
    out2_m = __msa_copy_u_d((v2i64) in2, 0); \
    out3_m = __msa_copy_u_d((v2i64) in3, 0); \
    out4_m = __msa_copy_u_d((v2i64) in4, 0); \
    out5_m = __msa_copy_u_d((v2i64) in5, 0); \
    out6_m = __msa_copy_u_d((v2i64) in6, 0); \
    out7_m = __msa_copy_u_d((v2i64) in7, 0); \
    \
    out8_m = __msa_copy_u_w((v4i32) in0, 2); \
    out9_m = __msa_copy_u_w((v4i32) in1, 2); \
    out10_m = __msa_copy_u_w((v4i32) in2, 2); \
    out11_m = __msa_copy_u_w((v4i32) in3, 2); \
    out12_m = __msa_copy_u_w((v4i32) in4, 2); \
    out13_m = __msa_copy_u_w((v4i32) in5, 2); \
    out14_m = __msa_copy_u_w((v4i32) in6, 2); \
    out15_m = __msa_copy_u_w((v4i32) in7, 2); \
    \
    SD(out0_m, pblk_12x8_m); \
    SW(out8_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out1_m, pblk_12x8_m); \
    SW(out9_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out2_m, pblk_12x8_m); \
    SW(out10_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out3_m, pblk_12x8_m); \
    SW(out11_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out4_m, pblk_12x8_m); \
    SW(out12_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out5_m, pblk_12x8_m); \
    SW(out13_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out6_m, pblk_12x8_m); \
    SW(out14_m, pblk_12x8_m + 8); \
    pblk_12x8_m += stride; \
    SD(out7_m, pblk_12x8_m); \
    SW(out15_m, pblk_12x8_m + 8); \
}

/* Description : Average with rounding: (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from the 'in0' vector is added to the
                 corresponding byte element from the 'in1' vector. The sum,
                 plus 1 for rounding, is computed unsigned with full
                 precision, i.e. the intermediate result has one extra bit.
                 An unsigned division by 2 (logical shift right by one bit)
                 is performed before writing the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
{ \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1); \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3); \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3) \
{ \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1) \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3) \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)

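/* Illustrative usage sketch: rounding average of two predictions with
   AVER_UB4_UB(), e.g. for bi-directional motion compensation. All names
   are hypothetical. */
static inline void avg_16x4(v16u8 p0, v16u8 p1, v16u8 p2, v16u8 p3,
                            v16u8 q0, v16u8 q1, v16u8 q2, v16u8 q3,
                            uint8_t *dst, int32_t stride)
{
    v16u8 avg0, avg1, avg2, avg3;

    AVER_UB4_UB(p0, q0, p1, q1, p2, q2, p3, q3, avg0, avg1, avg2, avg3);
    ST_UB4(avg0, avg1, avg2, avg3, dst, stride);
}
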
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'd' vector are slid into 's' by
                 the number of elements specified by 'slide_val'
*/
#define SLDI_B(RTYPE, d, s, slide_val, out) \
{ \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val); \
}

#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
{ \
    SLDI_B(RTYPE, d0, s0, slide_val, out0) \
    SLDI_B(RTYPE, d1, s1, slide_val, out1) \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val, \
                out0, out1, out2) \
{ \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
    SLDI_B(RTYPE, d2, s2, slide_val, out2) \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3, \
                slide_val, out0, out1, out2, out3) \
{ \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1) \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3) \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)

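/* Illustrative usage sketch: forming a byte-shifted view of two
   concatenated vectors with SLDI_B(); with slide_val 2 the result is
   bytes 2..15 of 's' followed by bytes 0..1 of 'd'. The helper name
   slide_by_2 is hypothetical. */
static inline v16u8 slide_by_2(v16u8 d, v16u8 s)
{
    v16u8 out;

    SLDI_B(v16u8, d, s, 2, out);
    return out;
}
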
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0); \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2); \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4); \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3, \
                out0, out1, out2, out3) \
{ \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

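/* Illustrative usage sketch: gathering overlapping byte pairs from one
   source vector with VSHF_B2_SB(). In the VSHF_B2 operand order, mask
   indices 0..15 select bytes of 'in0' and 16..31 bytes of 'in1'; the mask
   values below are example data, not taken from this header. */
static inline void pairs_for_2tap(v16i8 src, v16i8 *out0, v16i8 *out1)
{
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1 = { 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12 };

    VSHF_B2_SB(src, src, src, src, mask0, mask1, *out0, *out1);
}
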
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2); \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2, \
                out0, out1, out2) \
{ \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1); \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4); \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2); \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied by
                 unsigned byte elements from cnst0, producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 unsigned halfword results per output vector)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1); \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied by
                 signed byte elements from cnst0, producing a result
                 twice the size of the input, i.e. signed halfword.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (8 signed halfword results per output vector)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1); \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2, \
                 out0, out1, out2) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2); \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied by
                 signed halfword elements from cnst0, producing a result
                 twice the size of the input, i.e. signed word.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (4 signed word results per output vector)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1); \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                 cnst0, cnst1, cnst2, cnst3, \
                 out0, out1, out2, out3) \
{ \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

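/* Illustrative usage sketch: one step of a byte-domain filter with
   DOTP_SB2_SH(): each output halfword is
   mult[2i] * cnst[2i] + mult[2i+1] * cnst[2i+1]. All names are
   hypothetical. */
static inline void filter_dotp(v16i8 vec0, v16i8 vec1, v16i8 coeff,
                               v8i16 *sum0, v8i16 *sum1)
{
    DOTP_SB2_SH(vec0, vec1, coeff, coeff, *sum0, *sum1);
}
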
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied by
                 signed byte elements from cnst0, producing a result
                 twice the size of the input, i.e. signed halfword.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (8 signed halfword results per output vector)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0, \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1, \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied by
                 unsigned byte elements from cnst0, producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (8 unsigned halfword results per output vector)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0, \
                                   (v16u8) mult0, (v16u8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1, \
                                   (v16u8) mult1, (v16u8) cnst1); \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied by
                 signed halfword elements from cnst0, producing a result
                 twice the size of the input, i.e. signed word.
                 The products of adjacent odd-even element pairs are then
                 added to the out vector
                 (4 signed word results per output vector)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0, \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1, \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3, \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{ \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1); \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3); \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

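/* Illustrative usage sketch: accumulating further filter taps onto
   running sums with DPADD_SB2_SH(); 'sum0'/'sum1' are read-modify-write
   accumulators, e.g. seeded by a DOTP_SB2_SH() for the first taps. All
   names are hypothetical. */
static inline void filter_dpadd(v16i8 vec0, v16i8 vec1, v16i8 coeff,
                                v8i16 *sum0, v8i16 *sum1)
{
    DPADD_SB2_SH(vec0, vec1, coeff, coeff, *sum0, *sum1);
}
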
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of unsigned halfword element values from 'in0' and
                 'min_vec' are written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec) \
{ \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec); \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec); \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec) \
{ \
    MIN_UH2(RTYPE, in0, in1, min_vec); \
    MIN_UH2(RTYPE, in2, in3, min_vec); \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in (input vector)
                         - min (min threshold)
                         - max (max threshold)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max) \
{ \
    in = __msa_max_s_h((v8i16) min, (v8i16) in); \
    in = __msa_min_s_h((v8i16) max, (v8i16) in); \
}

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in) \
{ \
    in = __msa_maxi_s_h((v8i16) in, 0); \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7); \
}

#define CLIP_SH2_0_255(in0, in1) \
{ \
    CLIP_SH_0_255(in0); \
    CLIP_SH_0_255(in1); \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SH2_0_255(in0, in1); \
    CLIP_SH2_0_255(in2, in3); \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    CLIP_SH4_0_255(in0, in1, in2, in3); \
    CLIP_SH4_0_255(in4, in5, in6, in7); \
}

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in) \
{ \
    in = __msa_maxi_s_w((v4i32) in, 0); \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7); \
}

#define CLIP_SW2_0_255(in0, in1) \
{ \
    CLIP_SW_0_255(in0); \
    CLIP_SW_0_255(in1); \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3) \
{ \
    CLIP_SW2_0_255(in0, in1); \
    CLIP_SW2_0_255(in2, in3); \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3, \
                       in4, in5, in6, in7) \
{ \
    CLIP_SW4_0_255(in0, in1, in2, in3); \
    CLIP_SW4_0_255(in4, in5, in6, in7); \
}

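/* Illustrative usage sketch: the usual reconstruction tail - add a
   signed halfword residual to a prediction and clip the result to the
   8-bit pixel range with CLIP_SH2_0_255(). All names are hypothetical. */
static inline void add_residual_clip(v8i16 pred0, v8i16 pred1,
                                     v8i16 res0, v8i16 res1,
                                     v8i16 *out0, v8i16 *out1)
{
    *out0 = pred0 + res0; /* GCC vector extensions: elementwise add */
    *out1 = pred1 + res1;
    CLIP_SH2_0_255(*out0, *out1);
}
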
/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in) \
( { \
    v2i64 res0_m, res1_m; \
    int32_t sum_m; \
    \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in); \
    res1_m = __msa_splati_d(res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0); \
    sum_m; \
} )

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in) \
( { \
    v4u32 res_m; \
    v2u64 res0_m, res1_m; \
    uint32_t sum_m; \
    \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in); \
    res0_m = __msa_hadd_u_d(res_m, res_m); \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1); \
    res0_m += res1_m; \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0); \
    sum_m; \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd signed byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the halfword results are written to 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0); \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1); \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_SB2(RTYPE, in0, in1, out0, out1); \
    HADD_SB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword results are written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0); \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1); \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2); \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HADD_UB2(RTYPE, in0, in1, out0, out1); \
    HADD_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element from 'in0' is subtracted from
                 the adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword results are written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1) \
{ \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0); \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1); \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
{ \
    HSUB_UB2(RTYPE, in0, in1, out0, out1); \
    HSUB_UB2(RTYPE, in2, in3, out2, out3); \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Differences)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute differences of all byte elements from 'in0'
                 with 'ref0' are computed and kept in 'diff0'. From the 16
                 unsigned absolute-difference values, even-odd pairs are
                 added together to generate 8 halfword results.
*/
#define SAD_UB2_UH(in0, in1, ref0, ref1) \
( { \
    v16u8 diff0_m, diff1_m; \
    v8u16 sad_m = { 0 }; \
    \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0); \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1); \
    \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m); \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m); \
    \
    sad_m; \
} )

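/* Illustrative usage sketch: a complete 16x2 SAD - SAD_UB2_UH() forms
   the per-pair absolute-difference sums, HADD_UH_U32() reduces the
   halfword vector to a scalar. The helper name sad_16x2 is hypothetical. */
static inline uint32_t sad_16x2(v16u8 src0, v16u8 src1,
                                v16u8 ref0, v16u8 ref1)
{
    v8u16 sad = SAD_UB2_UH(src0, src1, ref0, ref1);

    return HADD_UH_U32(sad);
}
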
/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out) \
{ \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Insert specified double word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (2 input vectors)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out) \
{ \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)

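/* Illustrative usage sketch: packing four stride-separated 32-bit rows
   into one vector by pairing LW4() with INSERT_W4_UB(), the common way
   4-byte-wide rows enter vector code. The helper name gather_4x4 is
   hypothetical. */
static inline v16u8 gather_4x4(uint8_t *src, int32_t stride)
{
    uint32_t row0, row1, row2, row3;
    v16u8 out = { 0 };

    LW4(src, stride, row0, row1, row2, row3);
    INSERT_W4_UB(row0, row1, row2, row3, out);
    return out;
}
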
1174 /* Description : Interleave even byte elements from vectors
1175  Arguments : Inputs - in0, in1, in2, in3
1176  Outputs - out0, out1
1177  Return Type - as per RTYPE
1178  Details : Even byte elements of 'in0' and even byte
1179  elements of 'in1' are interleaved and copied to 'out0'
1180  Even byte elements of 'in2' and even byte
1181  elements of 'in3' are interleaved and copied to 'out1'
1182 */
1183 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1184 { \
1185  out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
1186  out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
1187 }
1188 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
1189 #define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
1190 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1191 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1192 
1193 /* Description : Interleave even halfword elements from vectors
1194  Arguments : Inputs - in0, in1, in2, in3
1195  Outputs - out0, out1
1196  Return Type - as per RTYPE
1197  Details : Even halfword elements of 'in0' and even halfword
1198  elements of 'in1' are interleaved and copied to 'out0'
1199  Even halfword elements of 'in2' and even halfword
1200  elements of 'in3' are interleaved and copied to 'out1'
1201 */
1202 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1203 { \
1204  out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
1205  out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
1206 }
1207 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
1208 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
1209 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1210 
1211 /* Description : Interleave even word elements from vectors
1212  Arguments : Inputs - in0, in1, in2, in3
1213  Outputs - out0, out1
1214  Return Type - as per RTYPE
1215  Details : Even word elements of 'in0' and even word
1216  elements of 'in1' are interleaved and copied to 'out0'
1217  Even word elements of 'in2' and even word
1218  elements of 'in3' are interleaved and copied to 'out1'
1219 */
1220 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1221 { \
1222  out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
1223  out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
1224 }
1225 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
1226 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1227 #define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
1228 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1229 
1230 /* Description : Interleave even double word elements from vectors
1231  Arguments : Inputs - in0, in1, in2, in3
1232  Outputs - out0, out1
1233  Return Type - as per RTYPE
1234  Details : Even double word elements of 'in0' and even double word
1235  elements of 'in1' are interleaved and copied to 'out0'
1236  Even double word elements of 'in2' and even double word
1237  elements of 'in3' are interleaved and copied to 'out1'
1238 */
1239 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1240 { \
1241  out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
1242  out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
1243 }
1244 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1245 #define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
1246 #define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1247 
1248 /* Description : Interleave left half of byte elements from vectors
1249  Arguments : Inputs - in0, in1, in2, in3
1250  Outputs - out0, out1
1251  Return Type - as per RTYPE
1252  Details : Left half of byte elements of in0 and left half of byte
1253  elements of in1 are interleaved and copied to out0.
1254  Left half of byte elements of in2 and left half of byte
1255  elements of in3 are interleaved and copied to out1.
1256 */
1257 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1258 { \
1259  out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1260  out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
1261 }
1262 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
1263 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
1264 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
1265 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1266 
1267 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1268  out0, out1, out2, out3) \
1269 { \
1270  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1271  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1272 }
1273 #define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
1274 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
1275 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1276 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1277 
1278 /* Description : Interleave left half of halfword elements from vectors
1279  Arguments : Inputs - in0, in1, in2, in3
1280  Outputs - out0, out1
1281  Return Type - as per RTYPE
1282  Details : Left half of halfword elements of in0 and left half of halfword
1283  elements of in1 are interleaved and copied to out0.
1284  Left half of halfword elements of in2 and left half of halfword
1285  elements of in3 are interleaved and copied to out1.
1286 */
1287 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1288 { \
1289  out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1290  out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
1291 }
1292 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1293 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1294 
1295 #define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1296  out0, out1, out2, out3) \
1297 { \
1298  ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1299  ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1300 }
1301 #define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1302 #define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1303 
1304 /* Description : Interleave left half of word elements from vectors
1305  Arguments : Inputs - in0, in1, in2, in3
1306  Outputs - out0, out1
1307  Return Type - as per RTYPE
1308  Details : Left half of word elements of in0 and left half of word
1309  elements of in1 are interleaved and copied to out0.
1310  Left half of word elements of in2 and left half of word
1311  elements of in3 are interleaved and copied to out1.
1312 */
1313 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1314 { \
1315  out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1316  out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
1317 }
1318 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
1319 #define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1320 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1321 
1322 /* Description : Interleave right half of byte elements from vectors
1323  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1324  Outputs - out0, out1, out2, out3
1325  Return Type - as per RTYPE
1326  Details : Right half of byte elements of in0 and right half of byte
1327  elements of in1 are interleaved and copied to out0.
1328  Right half of byte elements of in2 and right half of byte
1329  elements of in3 are interleaved and copied to out1.
1330  Similar for other pairs
1331 */
1332 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1333 { \
1334  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1335  out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
1336 }
1337 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
1338 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
1339 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
1340 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1341 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1342 
1343 #define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1344 { \
1345  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1346  out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
1347 }
1348 #define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
1349 #define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
1350 #define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
1351 #define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1352 
1353 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1354  out0, out1, out2, out3) \
1355 { \
1356  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1357  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1358 }
1359 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
1360 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
1361 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
1362 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1363 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1364 
1365 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1366  in8, in9, in10, in11, in12, in13, in14, in15, \
1367  out0, out1, out2, out3, out4, out5, out6, out7) \
1368 { \
1369  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1370  out0, out1, out2, out3); \
1371  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
1372  out4, out5, out6, out7); \
1373 }
1374 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1375 #define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
1376 
1377 /* Description : Interleave right half of halfword elements from vectors
1378  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1379  Outputs - out0, out1, out2, out3
1380  Return Type - as per RTYPE
1381  Details : Right half of halfword elements of in0 and right half of
1382  halfword elements of in1 are interleaved and copied to out0.
1383  Right half of halfword elements of in2 and right half of
1384  halfword elements of in3 are interleaved and copied to out1.
1385  Similar for other pairs
1386 */
1387 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1388 { \
1389  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1390  out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
1391 }
1392 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
1393 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1394 
1395 #define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1396 { \
1397  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1398  out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
1399 }
1400 #define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1401 
1402 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1403  out0, out1, out2, out3) \
1404 { \
1405  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1406  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1407 }
1408 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1409 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1410 
1411 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1412 { \
1413  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1414  out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
1415 }
1416 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
1417 #define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
1418 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1419 
1420 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1421  out0, out1, out2, out3) \
1422 { \
1423  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
1424  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
1425 }
1426 #define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1427 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1428 
1429 /* Description : Interleave right half of double word elements from vectors
1430  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1431  Outputs - out0, out1, out2, out3
1432  Return Type - as per RTYPE
1433  Details : Right half of double word elements of in0 and right half of
1434  double word elements of in1 are interleaved and copied to out0.
1435  Right half of double word elements of in2 and right half of
1436  double word elements of in3 are interleaved and copied to out1.
1437 */
1438 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1439 { \
1440  out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
1441  out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \
1442 }
1443 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
1444 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
1445 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1446 
1447 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1448 { \
1449  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1450  out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \
1451 }
1452 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1453 
1454 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1455  out0, out1, out2, out3) \
1456 { \
1457  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1458  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1459 }
1460 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1461 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1462 
1463 /* Description : Interleave left half of double word elements from vectors
1464  Arguments : Inputs - in0, in1, in2, in3
1465  Outputs - out0, out1
1466  Return Type - as per RTYPE
1467  Details : Left half of double word elements of in0 and left half of
1468  double word elements of in1 are interleaved and copied to out0.
1469  Left half of double word elements of in2 and left half of
1470  double word elements of in3 are interleaved and copied to out1.
1471 */
1472 #define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1473 { \
1474  out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
1475  out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
1476 }
1477 #define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
1478 #define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
1479 #define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1480 
1481 /* Description : Interleave both left and right half of input vectors
1482  Arguments : Inputs - in0, in1
1483  Outputs - out0, out1
1484  Return Type - as per RTYPE
1485  Details : Right half of byte elements from 'in0' and 'in1' are
1486  interleaved and stored to 'out0'
1487  Left half of byte elements from 'in0' and 'in1' are
1488  interleaved and stored to 'out1'
1489 */
1490 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1491 { \
1492  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1493  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1494 }
1495 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1496 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1497 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1498 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1499 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1500 
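/* Example : ILVRL_B2 usage (editorial sketch, not part of the original
             header; names are hypothetical)
   Details : Fully interleaves two byte vectors in one step instead of
             separate ILVR_B2 and ILVL_B2 calls
*/
static inline void interleave_sketch(v16u8 a, v16u8 b,
                                     v16u8 *lo, v16u8 *hi)
{
    v16u8 r, l;

    /* r = { b[0], a[0], b[1], a[1], ... } from the right halves,
       l = the same pattern from the left halves */
    ILVRL_B2_UB(a, b, r, l);
    *lo = r;
    *hi = l;
}
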
1501 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1502 { \
1503  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1504  out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1505 }
1506 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
1507 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1508 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1509 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1510 
1511 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1512 { \
1513  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1514  out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1515 }
1516 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1517 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1518 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1519 
1520 /* Description : Maximum values between signed elements of vector and
1521  5-bit signed immediate value are copied to the output vector
1522  Arguments : Inputs - in0, in1, in2, in3, max_val
1523  Outputs - in0, in1, in2, in3 (in place)
1524  Return Type - as per RTYPE
1525  Details : Maximum of signed halfword element values from 'in0' and
1526  'max_val' are written to output vector 'in0'
1527 */
1528 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1529 { \
1530  in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
1531  in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
1532 }
1533 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1534 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1535 
1536 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1537 { \
1538  MAXI_SH2(RTYPE, in0, in1, max_val); \
1539  MAXI_SH2(RTYPE, in2, in3, max_val); \
1540 }
1541 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1542 #define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1543 
1544 #define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
1545 { \
1546  MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
1547  MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
1548 }
1549 #define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
1550 #define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
1551 
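/* Example : MAXI_SH2 usage (editorial sketch, not part of the original
             header; names are hypothetical)
   Details : Clamps negative filter results at zero, the lower end of the
             pixel range, before narrowing; 0 fits the 5-bit signed
             immediate range of __msa_maxi_s_h
*/
static inline void clamp_low_sketch(v8i16 *res0, v8i16 *res1)
{
    v8i16 r0 = *res0, r1 = *res1;

    MAXI_SH2_SH(r0, r1, 0);    /* element-wise max(x, 0) */
    *res0 = r0;
    *res1 = r1;
}
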
1552 /* Description : Saturate the halfword element values to the max
1553  unsigned value of (sat_val+1 bits)
1554  The element data width remains unchanged
1555  Arguments : Inputs - in0, in1, in2, in3, sat_val
1556  Outputs - in0, in1, in2, in3 (in place)
1557  Return Type - as per RTYPE
1558  Details : Each unsigned halfword element from 'in0' is saturated to the
1559  value generated with (sat_val+1) bit range
1560  Results are written in place to the original vectors
1561 */
1562 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1563 { \
1564  in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1565  in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1566 }
1567 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1568 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1569 
1570 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1571 { \
1572  SAT_UH2(RTYPE, in0, in1, sat_val); \
1573  SAT_UH2(RTYPE, in2, in3, sat_val); \
1574 }
1575 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1576 #define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1577 
1578 #define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
1579 { \
1580  SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
1581  SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
1582 }
1583 #define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
1584 #define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1585 
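/* Example : SAT_UH2 usage (editorial sketch, not part of the original
             header; names are hypothetical)
   Details : sat_val = 7 saturates each unsigned halfword to (7+1) = 8
             significant bits, i.e. the 0..255 pixel range
*/
static inline void sat_pixels_sketch(v8u16 *in0, v8u16 *in1)
{
    v8u16 v0 = *in0, v1 = *in1;

    SAT_UH2_UH(v0, v1, 7);     /* clamp every element to 255 */
    *in0 = v0;
    *in1 = v1;
}
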
1586 /* Description : Saturate the halfword element values to the max
1587  signed value of (sat_val+1 bits)
1588  The element data width remains unchanged
1589  Arguments : Inputs - in0, in1, in2, in3, sat_val
1590  Outputs - in0, in1, in2, in3 (in place)
1591  Return Type - as per RTYPE
1592  Details : Each signed halfword element from 'in0' is saturated to the
1593  value generated with (sat_val+1) bit range
1594  Results are written in place to the original vectors
1595 */
1596 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1597 { \
1598  in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1599  in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1600 }
1601 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1602 
1603 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1604 { \
1605  SAT_SH2(RTYPE, in0, in1, sat_val); \
1606  in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1607 }
1608 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1609 
1610 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1611 { \
1612  SAT_SH2(RTYPE, in0, in1, sat_val); \
1613  SAT_SH2(RTYPE, in2, in3, sat_val); \
1614 }
1615 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1616 
1617 /* Description : Saturate the word element values to the max
1618  signed value of (sat_val+1 bits)
1619  The element data width remains unchanged
1620  Arguments : Inputs - in0, in1, in2, in3, sat_val
1621  Outputs - in0, in1, in2, in3 (in place)
1622  Return Type - as per RTYPE
1623  Details : Each signed word element from 'in0' is saturated to the
1624  value generated with (sat_val+1) bit range
1625  Results are written in place to the original vectors
1626 */
1627 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1628 { \
1629  in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1630  in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1631 }
1632 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1633 
1634 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1635 { \
1636  SAT_SW2(RTYPE, in0, in1, sat_val); \
1637  SAT_SW2(RTYPE, in2, in3, sat_val); \
1638 }
1639 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1640 
1641 /* Description : Indexed halfword element values are replicated to all
1642  elements in output vector
1643  Arguments : Inputs - in, idx0, idx1
1644  Outputs - out0, out1
1645  Return Type - as per RTYPE
1646  Details : 'idx0' element value from 'in' vector is replicated to all
1647  elements in 'out0' vector
1648  Valid index range for halfword operation is 0-7
1649 */
1650 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1651 { \
1652  out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1653  out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1654 }
1655 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1656 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1657 
1658 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1659  out0, out1, out2) \
1660 { \
1661  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1662  out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1663 }
1664 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1665 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1666 
1667 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1668  out0, out1, out2, out3) \
1669 { \
1670  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1671  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1672 }
1673 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1674 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1675 
1676 /* Description : Indexed word element values are replicated to all
1677  elements in output vector
1678  Arguments : Inputs - in, stidx
1679  Outputs - out0, out1
1680  Return Type - as per RTYPE
1681  Details : 'stidx' element value from 'in' vector is replicated to all
1682  elements in 'out0' vector
1683  'stidx + 1' element value from 'in' vector is replicated to all
1684  elements in 'out1' vector
1685  Valid index range for word operation is 0-3
1686 */
1687 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1688 { \
1689  out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1690  out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1691 }
1692 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1693 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1694 
1695 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1696 { \
1697  SPLATI_W2(RTYPE, in, 0, out0, out1); \
1698  SPLATI_W2(RTYPE, in, 2, out2, out3); \
1699 }
1700 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1701 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1702 
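/* Example : SPLATI_H4 usage (editorial sketch, not part of the original
             header; names are hypothetical)
   Details : Broadcasts four filter taps from one loaded coefficient
             vector; the indices must be immediates in the 0-7 range
*/
static inline void splat_taps_sketch(int16_t *filter, v8i16 *c0, v8i16 *c1,
                                     v8i16 *c2, v8i16 *c3)
{
    v8i16 filt = LD_SH(filter);
    v8i16 t0, t1, t2, t3;

    SPLATI_H4_SH(filt, 0, 1, 2, 3, t0, t1, t2, t3);
    *c0 = t0;
    *c1 = t1;
    *c2 = t2;
    *c3 = t3;
}
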
1703 /* Description : Pack even byte elements of vector pairs
1704  Arguments : Inputs - in0, in1, in2, in3
1705  Outputs - out0, out1
1706  Return Type - as per RTYPE
1707  Details : Even byte elements of in0 are copied to the left half of
1708  out0 & even byte elements of in1 are copied to the right
1709  half of out0.
1710  Even byte elements of in2 are copied to the left half of
1711  out1 & even byte elements of in3 are copied to the right
1712  half of out1.
1713 */
1714 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1715 { \
1716  out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1717  out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1718 }
1719 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1720 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1721 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1722 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1723 
1724 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1725 { \
1726  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1727  out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1728 }
1729 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1730 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1731 
1732 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1733  out0, out1, out2, out3) \
1734 { \
1735  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1736  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1737 }
1738 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1739 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1740 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1741 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1742 
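/* Example : PCKEV_B2 usage (editorial sketch, not part of the original
             header; names are hypothetical)
   Details : Keeps the low byte of every halfword, narrowing four vectors
             of 16-bit results back to two byte vectors; assumes the
             values were already clipped to 0..255
*/
static inline void narrow_sketch(v8i16 res0, v8i16 res1,
                                 v8i16 res2, v8i16 res3,
                                 v16u8 *out0, v16u8 *out1)
{
    v16u8 t0, t1;

    /* res0 supplies the low half of t0, res1 the high half */
    PCKEV_B2_UB(res1, res0, res3, res2, t0, t1);
    *out0 = t0;
    *out1 = t1;
}
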
1743 /* Description : Pack even halfword elements of vector pairs
1744  Arguments : Inputs - in0, in1, in2, in3
1745  Outputs - out0, out1
1746  Return Type - as per RTYPE
1747  Details : Even halfword elements of in0 are copied to the left half of
1748  out0 & even halfword elements of in1 are copied to the right
1749  half of out0.
1750  Even halfword elements of in2 are copied to the left half of
1751  out1 & even halfword elements of in3 are copied to the right
1752  half of out1.
1753 */
1754 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1755 { \
1756  out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1757  out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1758 }
1759 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1760 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1761 
1762 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1763  out0, out1, out2, out3) \
1764 { \
1765  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1766  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1767 }
1768 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1769 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1770 
1771 /* Description : Pack even double word elements of vector pairs
1772  Arguments : Inputs - in0, in1, in2, in3
1773  Outputs - out0, out1
1774  Return Type - as per RTYPE
1775  Details : Even double elements of in0 are copied to the left half of
1776  out0 & even double elements of in1 are copied to the right
1777  half of out0.
1778  Even double elements of in2 are copied to the left half of
1779  out1 & even double elements of in3 are copied to the right
1780  half of out1.
1781 */
1782 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1783 { \
1784  out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1785  out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1786 }
1787 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1788 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1789 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1790 
1791 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1792  out0, out1, out2, out3) \
1793 { \
1794  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1795  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1796 }
1797 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1798 
1799 /* Description : Pack odd double word elements of vector pairs
1800  Arguments : Inputs - in0, in1, in2, in3
1801  Outputs - out0, out1
1802  Return Type - as per RTYPE
1803  Details : Odd double word element of 'in0' is copied to the left half
1804  of 'out0' & odd double word element of 'in1' is copied to
1805  the right half of 'out0'
1806  Similar for 'in2', 'in3' and 'out1'
1807 */
1808 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1809 { \
1810  out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1811  out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1812 }
1813 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1814 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1815 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1816 
1817 /* Description : Each byte element is logically xor'ed with immediate 128
1818  Arguments : Inputs - in0, in1
1819  Outputs - in0, in1 (in-place)
1820  Return Type - as per RTYPE
1821  Details : Each unsigned byte element from input vector 'in0' is
1822  logically xor'ed with 128 and result is in-place stored in
1823  'in0' vector
1824  Each unsigned byte element from input vector 'in1' is
1825  logically xor'ed with 128 and result is in-place stored in
1826  'in1' vector
1827  Similar for other pairs
1828 */
1829 #define XORI_B2_128(RTYPE, in0, in1) \
1830 { \
1831  in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
1832  in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
1833 }
1834 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1835 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1836 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1837 
1838 #define XORI_B3_128(RTYPE, in0, in1, in2) \
1839 { \
1840  XORI_B2_128(RTYPE, in0, in1); \
1841  in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \
1842 }
1843 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1844 
1845 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
1846 { \
1847  XORI_B2_128(RTYPE, in0, in1); \
1848  XORI_B2_128(RTYPE, in2, in3); \
1849 }
1850 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
1851 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1852 #define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
1853 
1854 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
1855 { \
1856  XORI_B3_128(RTYPE, in0, in1, in2); \
1857  XORI_B2_128(RTYPE, in3, in4); \
1858 }
1859 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1860 
1861 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
1862 { \
1863  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1864  XORI_B2_128(RTYPE, in4, in5); \
1865 }
1866 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
1867 
1868 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
1869 { \
1870  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1871  XORI_B3_128(RTYPE, in4, in5, in6); \
1872 }
1873 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1874 
1875 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
1876 { \
1877  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1878  XORI_B4_128(RTYPE, in4, in5, in6, in7); \
1879 }
1880 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1881 #define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
1882 
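/* Example : XORI_B2_128 usage (editorial sketch, not part of the
             original header; names are hypothetical)
   Details : x ^ 128 maps unsigned 0..255 onto signed -128..127, letting
             unsigned pixels feed signed multiply/dot-product
             instructions; applying the same xor again converts back
*/
static inline void to_signed_sketch(v16u8 px0, v16u8 px1,
                                    v16i8 *s0, v16i8 *s1)
{
    v16i8 t0 = (v16i8) px0;
    v16i8 t1 = (v16i8) px1;

    XORI_B2_128_SB(t0, t1);
    *s0 = t0;
    *s1 = t1;
}
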
1883 /* Description : Addition of signed halfword elements and signed saturation
1884  Arguments : Inputs - in0, in1, in2, in3
1885  Outputs - out0, out1
1886  Return Type - as per RTYPE
1887  Details : Signed halfword elements from 'in0' are added to signed
1888  halfword elements of 'in1'. The result is then signed saturated
1889  between -32768 and +32767 (as per halfword data type)
1890  Similar for other pairs
1891 */
1892 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
1893 { \
1894  out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
1895  out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
1896 }
1897 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1898 
1899 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1900  out0, out1, out2, out3) \
1901 { \
1902  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
1903  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
1904 }
1905 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
1906 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1907 
1908 /* Description : Shift left all elements of vector (generic for all data types)
1909  Arguments : Inputs - in0, in1, in2, in3, shift
1910  Outputs - in0, in1, in2, in3 (in place)
1911  Return Type - as per input vector RTYPE
1912  Details : Each element of vector 'in0' is left shifted by 'shift' and
1913  result is in place written to 'in0'
1914  Similar for other pairs
1915 */
1916 #define SLLI_2V(in0, in1, shift) \
1917 { \
1918  in0 = in0 << shift; \
1919  in1 = in1 << shift; \
1920 }
1921 #define SLLI_4V(in0, in1, in2, in3, shift) \
1922 { \
1923  in0 = in0 << shift; \
1924  in1 = in1 << shift; \
1925  in2 = in2 << shift; \
1926  in3 = in3 << shift; \
1927 }
1928 
1929 /* Description : Arithmetic shift right all elements of vector
1930  (generic for all data types)
1931  Arguments : Inputs - in0, in1, in2, in3, shift
1932  Outputs - in0, in1, in2, in3 (in place)
1933  Return Type - as per input vector RTYPE
1934  Details : Each element of vector 'in0' is right shifted by 'shift' and
1935  result is in place written to 'in0'
1936  Here, 'shift' is a GP variable passed in
1937  Similar for other pairs
1938 */
1939 #define SRA_4V(in0, in1, in2, in3, shift) \
1940 { \
1941  in0 = in0 >> shift; \
1942  in1 = in1 >> shift; \
1943  in2 = in2 >> shift; \
1944  in3 = in3 >> shift; \
1945 }
1946 
1947 /* Description : Shift right logical all halfword elements of vector
1948  Arguments : Inputs - in0, in1, in2, in3, shift
1949  Outputs - in0, in1, in2, in3 (in place)
1950  Return Type - as per RTYPE
1951  Details : Each element of vector 'in0' is shifted right logical by
1952  number of bits respective element holds in vector 'shift' and
1953  result is in place written to 'in0'
1954  Here, 'shift' is a vector passed in
1955  Similar for other pairs
1956 */
1957 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
1958 { \
1959  in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \
1960  in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \
1961  in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \
1962  in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \
1963 }
1964 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1965 
1966 #define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
1967 { \
1968  in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift); \
1969  in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift); \
1970  in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift); \
1971  in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift); \
1972 }
1973 #define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
1974 #define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
1975 
1976 #define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
1977 { \
1978  SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
1979  SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
1980 }
1981 #define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
1982 #define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1983 
1984 /* Description : Shift right arithmetic rounded halfwords
1985  Arguments : Inputs - in0, in1, shift
1986  Outputs - in0, in1, (in place)
1987  Return Type - as per RTYPE
1988  Details : Each element of vector 'in0' is shifted right arithmetic by
1989  number of bits respective element holds in vector 'shift'.
1990  The last discarded bit is added to shifted value for rounding
1991  and the result is in place written to 'in0'
1992  Here, 'shift' is a vector passed in
1993  Similar for other pairs
1994 */
1995 #define SRAR_H2(RTYPE, in0, in1, shift) \
1996 { \
1997  in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
1998  in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
1999 }
2000 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
2001 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2002 
2003 #define SRAR_H3(RTYPE, in0, in1, in2, shift) \
2004 { \
2005  SRAR_H2(RTYPE, in0, in1, shift) \
2006  in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
2007 }
2008 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2009 
2010 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
2011 { \
2012  SRAR_H2(RTYPE, in0, in1, shift) \
2013  SRAR_H2(RTYPE, in2, in3, shift) \
2014 }
2015 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
2016 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2017 
2018 /* Description : Shift right arithmetic rounded words
2019  Arguments : Inputs - in0, in1, shift
2020  Outputs - in0, in1, (in place)
2021  Return Type - as per RTYPE
2022  Details : Each element of vector 'in0' is shifted right arithmetic by
2023  number of bits respective element holds in vector 'shift'.
2024  The last discarded bit is added to shifted value for rounding
2025  and the result is in place written to 'in0'
2026  Here, 'shift' is a vector passed in
2027  Similar for other pairs
2028 */
2029 #define SRAR_W2(RTYPE, in0, in1, shift) \
2030 { \
2031  in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
2032  in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
2033 }
2034 #define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2035 
2036 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
2037 { \
2038  SRAR_W2(RTYPE, in0, in1, shift) \
2039  SRAR_W2(RTYPE, in2, in3, shift) \
2040 }
2041 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2042 
2043 /* Description : Shift right arithmetic rounded (immediate)
2044  Arguments : Inputs - in0, in1, in2, in3, shift
2045  Outputs - in0, in1, in2, in3 (in place)
2046  Return Type - as per RTYPE
2047  Details : Each element of vector 'in0' is shifted right arithmetic by
2048  value in 'shift'.
2049  The last discarded bit is added to shifted value for rounding
2050  and the result is in place written to 'in0'
2051  Similar for other pairs
2052 */
2053 #define SRARI_H2(RTYPE, in0, in1, shift) \
2054 { \
2055  in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
2056  in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
2057 }
2058 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
2059 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2060 
2061 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
2062 { \
2063  SRARI_H2(RTYPE, in0, in1, shift); \
2064  SRARI_H2(RTYPE, in2, in3, shift); \
2065 }
2066 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
2067 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2068 
2069 /* Description : Shift right arithmetic rounded (immediate)
2070  Arguments : Inputs - in0, in1, shift
2071  Outputs - in0, in1 (in place)
2072  Return Type - as per RTYPE
2073  Details : Each element of vector 'in0' is shifted right arithmetic by
2074  value in 'shift'.
2075  The last discarded bit is added to shifted value for rounding
2076  and the result is in place written to 'in0'
2077  Similar for other pairs
2078 */
2079 #define SRARI_W2(RTYPE, in0, in1, shift) \
2080 { \
2081  in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
2082  in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
2083 }
2084 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2085 
2086 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
2087 { \
2088  SRARI_W2(RTYPE, in0, in1, shift); \
2089  SRARI_W2(RTYPE, in2, in3, shift); \
2090 }
2091 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
2092 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
2093 
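/* Example : SRARI_H2 usage (editorial sketch, not part of the original
             header; names are hypothetical)
   Details : A rounded descale of filter accumulators; shift = 6 computes
             (x + 32) >> 6, e.g. for filter coefficients that sum to 64
*/
static inline void descale_sketch(v8i16 *sum0, v8i16 *sum1)
{
    v8i16 s0 = *sum0, s1 = *sum1;

    SRARI_H2_SH(s0, s1, 6);
    *sum0 = s0;
    *sum1 = s1;
}
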
2094 /* Description : Multiplication of pairs of vectors
2095  Arguments : Inputs - in0, in1, in2, in3
2096  Outputs - out0, out1
2097  Details : Each element from 'in0' is multiplied with the corresponding
2098  element from 'in1' and the result is written to 'out0'
2099  Similar for other pairs
2100 */
2101 #define MUL2(in0, in1, in2, in3, out0, out1) \
2102 { \
2103  out0 = in0 * in1; \
2104  out1 = in2 * in3; \
2105 }
2106 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2107 { \
2108  MUL2(in0, in1, in2, in3, out0, out1); \
2109  MUL2(in4, in5, in6, in7, out2, out3); \
2110 }
2111 
2112 /* Description : Addition of 2 pairs of vectors
2113  Arguments : Inputs - in0, in1, in2, in3
2114  Outputs - out0, out1
2115  Details : Each element from the 2 pairs of vectors is added and 2 results are
2116  produced
2117 */
2118 #define ADD2(in0, in1, in2, in3, out0, out1) \
2119 { \
2120  out0 = in0 + in1; \
2121  out1 = in2 + in3; \
2122 }
2123 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2124 { \
2125  ADD2(in0, in1, in2, in3, out0, out1); \
2126  ADD2(in4, in5, in6, in7, out2, out3); \
2127 }
2128 
2129 /* Description : Subtraction of 2 pairs of vectors
2130  Arguments : Inputs - in0, in1, in2, in3
2131  Outputs - out0, out1
2132  Details : Each element from the 2 pairs of vectors is subtracted and 2 results
2133  are produced
2134 */
2135 #define SUB2(in0, in1, in2, in3, out0, out1) \
2136 { \
2137  out0 = in0 - in1; \
2138  out1 = in2 - in3; \
2139 }
2140 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2141 { \
2142  out0 = in0 - in1; \
2143  out1 = in2 - in3; \
2144  out2 = in4 - in5; \
2145  out3 = in6 - in7; \
2146 }
2147 
2148 /* Description : Sign extend byte elements from right half of the vector
2149  Arguments : Input - in (byte vector)
2150  Output - out (sign extended halfword vector)
2151  Return Type - signed halfword
2152  Details : Sign bit of byte elements from input vector 'in' is
2153  extracted and interleaved with same vector 'in' to generate
2154  8 halfword elements keeping sign intact
2155 */
2156 #define UNPCK_R_SB_SH(in, out) \
2157 { \
2158  v16i8 sign_m; \
2159  \
2160  sign_m = __msa_clti_s_b((v16i8) in, 0); \
2161  out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in); \
2162 }
2163 
2164 /* Description : Sign extend halfword elements from right half of the vector
2165  Arguments : Inputs - in (input halfword vector)
2166  Outputs - out (sign extended word vectors)
2167  Return Type - signed word
2168  Details : Sign bit of halfword elements from input vector 'in' is
2169  extracted and interleaved with same vector 'in' to generate
2170  4 word elements keeping sign intact
2171 */
2172 #define UNPCK_R_SH_SW(in, out) \
2173 { \
2174  v8i16 sign_m; \
2175  \
2176  sign_m = __msa_clti_s_h((v8i16) in, 0); \
2177  out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
2178 }
2179 
2180 /* Description : Sign extend byte elements from input vector and return
2181  halfword results in pair of vectors
2182  Arguments : Inputs - in (1 input byte vector)
2183  Outputs - out0, out1 (2 sign-extended halfword vectors)
2184  Return Type - signed halfword
2185  Details : Sign bit of byte elements from input vector 'in' is
2186  extracted and interleaved right with same vector 'in' to
2187  generate 8 signed halfword elements in 'out0'
2188  Then interleaved left with same vector 'in' to
2189  generate 8 signed halfword elements in 'out1'
2190 */
2191 #define UNPCK_SB_SH(in, out0, out1) \
2192 { \
2193  v16i8 tmp_m; \
2194  \
2195  tmp_m = __msa_clti_s_b((v16i8) in, 0); \
2196  ILVRL_B2_SH(tmp_m, in, out0, out1); \
2197 }
2198 
2199 /* Description : Zero extend unsigned byte elements to halfword elements
2200  Arguments : Inputs - in (1 input unsigned byte vector)
2201  Outputs - out0, out1 (2 zero-extended halfword vectors)
2202  Return Type - signed halfword
2203  Details : Zero extended right half of vector is returned in 'out0'
2204  Zero extended left half of vector is returned in 'out1'
2205 */
2206 #define UNPCK_UB_SH(in, out0, out1) \
2207 { \
2208  v16i8 zero_m = { 0 }; \
2209  \
2210  ILVRL_B2_SH(zero_m, in, out0, out1); \
2211 }
2212 
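/* Example : UNPCK_UB_SH usage (editorial sketch, not part of the
             original header; names are hypothetical)
   Details : Widens 16 pixels to two halfword vectors so that arithmetic
             can proceed without overflow
*/
static inline void widen_sketch(v16u8 pixels, v8i16 *lo, v8i16 *hi)
{
    v8i16 t0, t1;

    /* bytes 0..7 -> t0, bytes 8..15 -> t1, zero extended */
    UNPCK_UB_SH(pixels, t0, t1);
    *lo = t0;
    *hi = t1;
}
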
2213 /* Description : Sign extend halfword elements from input vector and return
2214  result in pair of vectors
2215  Arguments : Inputs - in (1 input halfword vector)
2216  Outputs - out0, out1 (2 sign-extended word vectors)
2217  Return Type - signed word
2218  Details : Sign bit of halfword elements from input vector 'in' is
2219  extracted and interleaved right with same vector 'in' to
2220  generate 4 signed word elements in 'out0'
2221  Then interleaved left with same vector 'in' to
2222  generate 4 signed word elements in 'out1'
2223 */
2224 #define UNPCK_SH_SW(in, out0, out1) \
2225 { \
2226  v8i16 tmp_m; \
2227  \
2228  tmp_m = __msa_clti_s_h((v8i16) in, 0); \
2229  ILVRL_H2_SW(tmp_m, in, out0, out1); \
2230 }
2231 
2232 /* Description : Swap two variables
2233  Arguments : Inputs - in0, in1
2234  Outputs - in0, in1 (in-place)
2235  Details : Swapping of two input variables using xor
2236 */
2237 #define SWAP(in0, in1) \
2238 { \
2239  in0 = in0 ^ in1; \
2240  in1 = in0 ^ in1; \
2241  in0 = in0 ^ in1; \
2242 }
2243 
2244 /* Description : Butterfly of 4 input vectors
2245  Arguments : Inputs - in0, in1, in2, in3
2246  Outputs - out0, out1, out2, out3
2247  Details : Butterfly operation
2248 */
2249 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
2250 { \
2251  out0 = in0 + in3; \
2252  out1 = in1 + in2; \
2253  \
2254  out2 = in1 - in2; \
2255  out3 = in0 - in3; \
2256 }
2257 
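/* Example : BUTTERFLY_4 usage (editorial sketch, not part of the
             original header; names are hypothetical)
   Details : The sum/difference pairs of a butterfly form the opening
             stage of most DCT/Hadamard style transforms
*/
static inline void transform_stage_sketch(v8i16 in0, v8i16 in1,
                                          v8i16 in2, v8i16 in3,
                                          v8i16 out[4])
{
    /* out[0] = in0+in3, out[1] = in1+in2,
       out[2] = in1-in2, out[3] = in0-in3 */
    BUTTERFLY_4(in0, in1, in2, in3, out[0], out[1], out[2], out[3]);
}
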
2258 /* Description : Butterfly of 8 input vectors
2259  Arguments : Inputs - in0 ... in7
2260  Outputs - out0 .. out7
2261  Details : Butterfly operation
2262 */
2263 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
2264  out0, out1, out2, out3, out4, out5, out6, out7) \
2265 { \
2266  out0 = in0 + in7; \
2267  out1 = in1 + in6; \
2268  out2 = in2 + in5; \
2269  out3 = in3 + in4; \
2270  \
2271  out4 = in3 - in4; \
2272  out5 = in2 - in5; \
2273  out6 = in1 - in6; \
2274  out7 = in0 - in7; \
2275 }
2276 
2277 /* Description : Butterfly of 16 input vectors
2278  Arguments : Inputs - in0 ... in15
2279  Outputs - out0 .. out15
2280  Details : Butterfly operation
2281 */
2282 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
2283  in8, in9, in10, in11, in12, in13, in14, in15, \
2284  out0, out1, out2, out3, out4, out5, out6, out7, \
2285  out8, out9, out10, out11, out12, out13, out14, out15) \
2286 { \
2287  out0 = in0 + in15; \
2288  out1 = in1 + in14; \
2289  out2 = in2 + in13; \
2290  out3 = in3 + in12; \
2291  out4 = in4 + in11; \
2292  out5 = in5 + in10; \
2293  out6 = in6 + in9; \
2294  out7 = in7 + in8; \
2295  \
2296  out8 = in7 - in8; \
2297  out9 = in6 - in9; \
2298  out10 = in5 - in10; \
2299  out11 = in4 - in11; \
2300  out12 = in3 - in12; \
2301  out13 = in2 - in13; \
2302  out14 = in1 - in14; \
2303  out15 = in0 - in15; \
2304 }
2305 
2306 /* Description : Transposes input 4x4 byte block
2307  Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
2308  Outputs - out0, out1, out2, out3 (output 4x4 byte block)
2309  Return Type - unsigned byte
2310  Details :
2311 */
2312 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
2313 { \
2314  v16i8 zero_m = { 0 }; \
2315  v16i8 s0_m, s1_m, s2_m, s3_m; \
2316  \
2317  ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
2318  ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
2319  \
2320  out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
2321  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
2322  out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
2323  out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
2324 }
2325 
2326 /* Description : Transposes input 8x4 byte block into 4x8
2327  Arguments : Inputs - in0 ... in7 (input 8x4 byte block)
2328  Outputs - out0, out1, out2, out3 (output 4x8 byte block)
2329  Return Type - as per RTYPE
2330  Details :
2331 */
2332 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2333  out0, out1, out2, out3) \
2334 { \
2335  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2336  \
2337  ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
2338  tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2339  ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
2340  \
2341  tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2342  ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
2343  \
2344  ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
2345  out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
2346  out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2347 }
2348 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2349 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2350 
2351 /* Description : Transposes input 8x8 byte block
2352  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2353  (input 8x8 byte block)
2354  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2355  (output 8x8 byte block)
2356  Return Type - as per RTYPE
2357  Details :
2358 */
2359 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2360  out0, out1, out2, out3, out4, out5, out6, out7) \
2361 { \
2362  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2363  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2364  v16i8 zeros = { 0 }; \
2365  \
2366  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
2367  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2368  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
2369  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
2370  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
2371  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
2372  SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \
2373  8, out1, out3, out5, out7); \
2374 }
2375 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2376 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2377 
2378 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2379  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2380  in8, in9, in10, in11, in12, in13, in14, in15
2381  Outputs - out0, out1, out2, out3
2382  Return Type - unsigned byte
2383  Details :
2384 */
2385 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2386  in8, in9, in10, in11, in12, in13, in14, in15, \
2387  out0, out1, out2, out3) \
2388 { \
2389  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2390  \
2391  ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
2392  out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2393  \
2394  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
2395  out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2396  \
2397  ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
2398  \
2399  tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2400  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
2401  \
2402  tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2403  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
2404  out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2405  out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2406  \
2407  tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
2408  tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
2409  out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2410  out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2411 }
2412 
2413 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2414  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2415  in8, in9, in10, in11, in12, in13, in14, in15
2416  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2417  Return Type - unsigned byte
2418  Details :
2419 */
2420 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2421  in8, in9, in10, in11, in12, in13, in14, in15, \
2422  out0, out1, out2, out3, out4, out5, out6, out7) \
2423 { \
2424  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2425  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2426  \
2427  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2428  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2429  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2430  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2431  \
2432  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2433  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2434  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2435  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2436  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2437  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2438  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2439  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2440  \
2441  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2442  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2443  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2444  \
2445  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2446  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2447  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2448  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2449  \
2450  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2451  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2452  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2453  \
2454  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2455  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2456  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2457  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2458 }
2459 
2460 /* Description : Transposes 4x4 block with half word elements in vectors
2461  Arguments : Inputs - in0, in1, in2, in3
2462  Outputs - out0, out1, out2, out3
2463  Return Type - signed halfword
2464  Details :
2465 */
2466 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2467 { \
2468  v8i16 s0_m, s1_m; \
2469  \
2470  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2471  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2472  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
2473  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2474 }
2475 
2476 /* Description : Transposes 8x8 block with half word elements in vectors
2477  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2478  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2479  Return Type - as per RTYPE
2480  Details :
2481 */
2482 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2483  out0, out1, out2, out3, out4, out5, out6, out7) \
2484 { \
2485  v8i16 s0_m, s1_m; \
2486  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2487  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2488  \
2489  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2490  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2491  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2492  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2493  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2494  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2495  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2496  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2497  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2498  tmp3_m, tmp7_m, out0, out2, out4, out6); \
2499  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
2500  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2501  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2502  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2503 }
2504 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2505 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2506 
2507 /* Description : Transposes 4x4 block with word elements in vectors
2508  Arguments : Inputs - in0, in1, in2, in3
2509  Outputs - out0, out1, out2, out3
2510  Return Type - signed word
2511  Details :
2512 */
2513 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2514 { \
2515  v4i32 s0_m, s1_m, s2_m, s3_m; \
2516  \
2517  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2518  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2519  \
2520  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2521  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2522  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2523  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2524 }
2525 
2526 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2527  block in destination memory
2528  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2529  Details : Each byte element from input vector pair 'in0' and 'in1' is
2530  averaged (a + b)/2 and stored in 'tmp0_m'
2531  Each byte element from input vector pair 'in2' and 'in3' is
2532  averaged (a + b)/2 and stored in 'tmp1_m'
2533  Each byte element from input vector pair 'in4' and 'in5' is
2534  averaged (a + b)/2 and stored in 'tmp2_m'
2535  Each byte element from input vector pair 'in6' and 'in7' is
2536  averaged (a + b)/2 and stored in 'tmp3_m'
2537  The half vector results from all 4 vectors are stored in
2538  destination memory as 8x4 byte block
2539 */
2540 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2541 { \
2542  uint64_t out0_m, out1_m, out2_m, out3_m; \
2543  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2544  \
2545  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2546  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2547  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2548  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2549  \
2550  out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2551  out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2552  out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2553  out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2554  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2555 }
2556 
2557 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2558  block in destination memory
2559  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2560  Details : Each byte element from input vector pair 'in0' and 'in1' is
2561  averaged (a + b)/2 and stored in 'tmp0_m'
2562  Each byte element from input vector pair 'in2' and 'in3' is
2563  averaged (a + b)/2 and stored in 'tmp1_m'
2564  Each byte element from input vector pair 'in4' and 'in5' is
2565  averaged (a + b)/2 and stored in 'tmp2_m'
2566  Each byte element from input vector pair 'in6' and 'in7' is
2567  averaged (a + b)/2 and stored in 'tmp3_m'
2568  The results from all 4 vectors are stored in destination
2569  memory as 16x4 byte block
2570 */
2571 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2572 { \
2573  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2574  \
2575  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2576  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2577  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2578  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2579  \
2580  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2581 }
2582 
2583 /* Description : Average rounded byte elements from pair of vectors and store
2584  8x4 byte block in destination memory
2585  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2586  Details : Each byte element from input vector pair 'in0' and 'in1' is
2587  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2588  Each byte element from input vector pair 'in2' and 'in3' is
2589  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2590  Each byte element from input vector pair 'in4' and 'in5' is
2591  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2592  Each byte element from input vector pair 'in6' and 'in7' is
2593  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2594  The half vector results from all 4 vectors are stored in
2595  destination memory as 8x4 byte block
2596 */
2597 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2598 { \
2599  uint64_t out0_m, out1_m, out2_m, out3_m; \
2600  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2601  \
2602  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2603  tp0_m, tp1_m, tp2_m, tp3_m); \
2604  \
2605  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2606  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2607  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2608  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2609  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2610 }
2611 
2612 /* Description : Average rounded byte elements from pair of vectors and store
2613  16x4 byte block in destination memory
2614  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2615  Details : Each byte element from input vector pair 'in0' and 'in1' is
2616  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2617  Each byte element from input vector pair 'in2' and 'in3' is
2618  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2619  Each byte element from input vector pair 'in4' and 'in5' is
2620  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2621  Each byte element from input vector pair 'in6' and 'in7' is
2622  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2623  The vector results from all 4 vectors are stored in
2624  destination memory as 16x4 byte block
2625 */
2626 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2627 { \
2628  v16u8 t0_m, t1_m, t2_m, t3_m; \
2629  \
2630  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2631  t0_m, t1_m, t2_m, t3_m); \
2632  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2633 }
2634 
2635 /* Description : Average rounded byte elements from pair of vectors,
2636  average rounded with destination and store 8x4 byte block
2637  in destination memory
2638  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2639  Details : Each byte element from input vector pair 'in0' and 'in1' is
2640  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2641  Each byte element from input vector pair 'in2' and 'in3' is
2642  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2643  Each byte element from input vector pair 'in4' and 'in5' is
2644  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2645  Each byte element from input vector pair 'in6' and 'in7' is
2646  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2647  The half vector results from all 4 vectors are stored in
2648  destination memory as 8x4 byte block
2649 */
2650 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2651  pdst, stride) \
2652 { \
2653  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2654  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2655  \
2656  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2657  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2658  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2659  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2660  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2661 }
2662 
2663 /* Description : Average rounded byte elements from pair of vectors,
2664  average rounded with destination and store 16x4 byte block
2665  in destination memory
2666  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2667  Details : Each byte element from input vector pair 'in0' and 'in1' is
2668  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2669  Each byte element from input vector pair 'in2' and 'in3' is
2670  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2671  Each byte element from input vector pair 'in4' and 'in5' is
2672  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2673  Each byte element from input vector pair 'in6' and 'in7' is
2674  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2675  The vector results from all 4 vectors are stored in
2676  destination memory as 16x4 byte block
2677 */
2678 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2679  pdst, stride) \
2680 { \
2681  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2682  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2683  \
2684  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2685  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2686  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2687  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2688  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2689 }
2690 
2691 /* Description : Add block 4x4
2692  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2693  Details : Least significant 4 bytes from each input vector are added to
2694  the destination bytes, clipped to the range 0-255 and then stored.
2695 */
2696 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2697 { \
2698  uint32_t src0_m, src1_m, src2_m, src3_m; \
2699  uint32_t out0_m, out1_m, out2_m, out3_m; \
2700  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2701  v16i8 dst0_m = { 0 }; \
2702  v16i8 dst1_m = { 0 }; \
2703  v16i8 zero_m = { 0 }; \
2704  \
2705  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m) \
2706  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2707  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2708  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2709  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2710  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2711  CLIP_SH2_0_255(res0_m, res1_m); \
2712  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2713  \
2714  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2715  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2716  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2717  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2718  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2719 }
2720 
2721 /* Description : Dot product and addition of 3 signed halfword input vectors
2722  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
2723  Outputs - out0_m
2724  Return Type - signed halfword
2725  Details : Dot product of 'in0' with 'coeff0'
2726  Dot product of 'in1' with 'coeff1'
2727  Dot product of 'in2' with 'coeff2'
2728  Addition of all the 3 vector results
2729 
2730  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2731 */
2732 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2733 ( { \
2734  v8i16 out0_m; \
2735  \
2736  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2737  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2738  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
2739  \
2740  out0_m; \
2741 } )
2742 
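/* Example : DPADD_SH3_SH usage (editorial sketch, not part of the
             original header; names are hypothetical)
   Details : Evaluates a 6-tap filter whose taps are grouped in byte
             pairs: vec0..vec2 hold the source samples shuffled into
             pairs, filt0..filt2 the paired coefficients
*/
static inline v8i16 six_tap_sketch(v16i8 vec0, v16i8 vec1, v16i8 vec2,
                                   v16i8 filt0, v16i8 filt1, v16i8 filt2)
{
    return DPADD_SH3_SH(vec0, vec1, vec2, filt0, filt1, filt2);
}
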
2743 /* Description : Pack even elements of input vectors & xor with 128
2744  Arguments : Inputs - in0, in1
2745  Outputs - out_m
2746  Return Type - unsigned byte
2747  Details : Signed byte even elements from 'in0' and 'in1' are packed
2748  together in one vector and the resulting vector is xor'ed with
2749  128 to shift the range from signed to unsigned byte
2750 */
2751 #define PCKEV_XORI128_UB(in0, in1) \
2752 ( { \
2753  v16u8 out_m; \
2754  out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2755  out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
2756  out_m; \
2757 } )
2758 
2759 /* Description : Converts inputs to unsigned bytes, packs, averages & stores
2760  as 8x4 unsigned byte block
2761  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
2762 */
2763 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2764  dst0, dst1, pdst, stride) \
2765 { \
2766  v16u8 tmp0_m, tmp1_m; \
2767  uint8_t *pdst_m = (uint8_t *) (pdst); \
2768  \
2769  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2770  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2771  AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
2772  ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
2773 }
2774 
2775 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2776  of results and store 4 words in destination memory as per
2777  stride
2778  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2779 */
2780 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2781 { \
2782  uint32_t out0_m, out1_m, out2_m, out3_m; \
2783  v16i8 tmp0_m, tmp1_m; \
2784  \
2785  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2786  \
2787  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2788  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2789  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2790  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2791  \
2792  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2793 }
2794 
2795 /* Description : Pack even byte elements and store byte vector in destination
2796  memory
2797  Arguments : Inputs - in0, in1, pdst
2798 */
2799 #define PCKEV_ST_SB(in0, in1, pdst) \
2800 { \
2801  v16i8 tmp_m; \
2802  tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2803  ST_SB(tmp_m, (pdst)); \
2804 }
2805 
2806 /* Description : Horizontal 2 tap filter kernel code
2807  Arguments : Inputs - in0, in1, mask, coeff, shift
2808 */
2809 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2810 ( { \
2811  v16i8 tmp0_m; \
2812  v8u16 tmp1_m; \
2813  \
2814  tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2815  tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2816  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2817  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2818  \
2819  tmp1_m; \
2820 } )
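
/* Example : HORIZ_2TAP_FILT_UH usage (editorial sketch, not part of the
             original header; names are hypothetical)
   Details : One row of 2-tap (bilinear style) horizontal filtering;
             shift = 7 both rounds the unsigned dot product and clamps it
             to (7+1) = 8 significant bits
*/
static inline v8u16 bilinear_row_sketch(uint8_t *src, v16u8 coeff,
                                        v16i8 mask)
{
    v16i8 s0 = LD_SB(src);
    v16i8 s1 = LD_SB(src + 8);

    return HORIZ_2TAP_FILT_UH(s0, s1, mask, coeff, 7);
}
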
2821 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */