FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
generic_macros_msa.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
21 #ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
22 #define AVUTIL_MIPS_GENERIC_MACROS_MSA_H
23 
24 #include <stdint.h>
25 #include <msa.h>
26 
/* Vector load: reinterpret the address as a pointer to the requested MSA
 * vector type and dereference it.  The compiler emits the corresponding
 * MSA ld.b/ld.h/ld.w instruction.  _UB/_SB (_UH/_SH, _UW/_SW) wrappers fix
 * RTYPE to the unsigned/signed element variant. */
#define LD_B(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_B(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_B(v16i8, __VA_ARGS__)

#define LD_H(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UH(...) LD_H(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_H(v8i16, __VA_ARGS__)

#define LD_W(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UW(...) LD_W(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_W(v4i32, __VA_ARGS__)

/* Vector store: mirror of the loads above — writes one whole vector 'in'
 * to (pdst) through a pointer of the requested vector type. */
#define ST_B(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_B(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_B(v16i8, __VA_ARGS__)

#define ST_H(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UH(...) ST_H(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_H(v8i16, __VA_ARGS__)

#define ST_W(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UW(...) ST_W(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_W(v4i32, __VA_ARGS__)
50 
/* Scalar load/store helpers for possibly unaligned addresses.
 *
 * Two variants are selected on the ISA revision:
 *   - __mips_isa_rev >= 6: plain lw/ld/sh/sw/sd (R6 handles the accesses
 *     directly; the ulw/uld/ush/usw forms no longer exist there)
 *   - pre-R6: the ulw/uld/ush/usw assembler macros for unaligned access
 *
 * LW/LD are GNU C statement expressions yielding the loaded value;
 * SH/SW/SD are brace-block statements.  On 32-bit targets (__mips != 64)
 * LD is synthesized from two LW loads and SD from two SW stores, with the
 * low word at the lower address (little-endian layout — NOTE(review):
 * confirm this header is only built for little-endian MIPS).
 */
#if (__mips_isa_rev >= 6)
    /* Load one 32-bit word from (psrc). */
    #define LW(psrc)                              \
    ( {                                           \
        uint8_t *psrc_m = (uint8_t *) (psrc);     \
        uint32_t val_m;                           \
                                                  \
        __asm__ volatile (                        \
            "lw %[val_m], %[psrc_m] \n\t"         \
                                                  \
            : [val_m] "=r" (val_m)                \
            : [psrc_m] "m" (*psrc_m)              \
        );                                        \
                                                  \
        val_m;                                    \
    } )

    #if (__mips == 64)
        /* Load one 64-bit doubleword from (psrc). */
        #define LD(psrc)                              \
        ( {                                           \
            uint8_t *psrc_m = (uint8_t *) (psrc);     \
            uint64_t val_m = 0;                       \
                                                      \
            __asm__ volatile (                        \
                "ld %[val_m], %[psrc_m] \n\t"         \
                                                      \
                : [val_m] "=r" (val_m)                \
                : [psrc_m] "m" (*psrc_m)              \
            );                                        \
                                                      \
            val_m;                                    \
        } )
    #else  // !(__mips == 64)
        /* 32-bit fallback: combine two word loads into one 64-bit value.
         * The local must NOT be named 'psrc_m': LW() declares its own
         * 'psrc_m', so LW(psrc_m) would expand into a shadowing declaration
         * initialized from itself (an uninitialized read — UB).  'psrc_m1'
         * matches the pre-R6 implementation below. */
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_m1 = (uint8_t *) (psrc);                    \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_m1);                                     \
            val1_m = LW(psrc_m1 + 4);                                 \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif // (__mips == 64)

    /* Store 16-bit 'val' to (pdst). */
    #define SH(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint16_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "sh %[val_m], %[pdst_m] \n\t"         \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    /* Store 32-bit 'val' to (pdst). */
    #define SW(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint32_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "sw %[val_m], %[pdst_m] \n\t"         \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    /* Store 64-bit 'val' to (pdst).  (R6 is assumed 64-bit capable here.) */
    #define SD(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint64_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "sd %[val_m], %[pdst_m] \n\t"         \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }
#else  // !(__mips_isa_rev >= 6)
    /* Unaligned load of one 32-bit word from (psrc). */
    #define LW(psrc)                              \
    ( {                                           \
        uint8_t *psrc_m = (uint8_t *) (psrc);     \
        uint32_t val_m;                           \
                                                  \
        __asm__ volatile (                        \
            "ulw %[val_m], %[psrc_m] \n\t"        \
                                                  \
            : [val_m] "=r" (val_m)                \
            : [psrc_m] "m" (*psrc_m)              \
        );                                        \
                                                  \
        val_m;                                    \
    } )

    #if (__mips == 64)
        /* Unaligned load of one 64-bit doubleword from (psrc). */
        #define LD(psrc)                              \
        ( {                                           \
            uint8_t *psrc_m = (uint8_t *) (psrc);     \
            uint64_t val_m = 0;                       \
                                                      \
            __asm__ volatile (                        \
                "uld %[val_m], %[psrc_m] \n\t"        \
                                                      \
                : [val_m] "=r" (val_m)                \
                : [psrc_m] "m" (*psrc_m)              \
            );                                        \
                                                      \
            val_m;                                    \
        } )
    #else  // !(__mips == 64)
        /* 32-bit fallback, see the R6 counterpart above for why the
         * pointer is named 'psrc_m1' and not 'psrc_m'. */
        #define LD(psrc)                                              \
        ( {                                                           \
            uint8_t *psrc_m1 = (uint8_t *) (psrc);                    \
            uint32_t val0_m, val1_m;                                  \
            uint64_t val_m = 0;                                       \
                                                                      \
            val0_m = LW(psrc_m1);                                     \
            val1_m = LW(psrc_m1 + 4);                                 \
                                                                      \
            val_m = (uint64_t) (val1_m);                              \
            val_m = (uint64_t) ((val_m << 32) & 0xFFFFFFFF00000000);  \
            val_m = (uint64_t) (val_m | (uint64_t) val0_m);           \
                                                                      \
            val_m;                                                    \
        } )
    #endif // (__mips == 64)

    /* Unaligned store of 16-bit 'val' to (pdst). */
    #define SH(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint16_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "ush %[val_m], %[pdst_m] \n\t"        \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    /* Unaligned store of 32-bit 'val' to (pdst). */
    #define SW(val, pdst)                         \
    {                                             \
        uint8_t *pdst_m = (uint8_t *) (pdst);     \
        uint32_t val_m = (val);                   \
                                                  \
        __asm__ volatile (                        \
            "usw %[val_m], %[pdst_m] \n\t"        \
                                                  \
            : [pdst_m] "=m" (*pdst_m)             \
            : [val_m] "r" (val_m)                 \
        );                                        \
    }

    /* Unaligned store of 64-bit 'val' as two 32-bit SW stores, low word
     * first.  'pdst_m1' avoids any clash with SW()'s internal 'pdst_m'. */
    #define SD(val, pdst)                                             \
    {                                                                 \
        uint8_t *pdst_m1 = (uint8_t *) (pdst);                        \
        uint32_t val0_m, val1_m;                                      \
                                                                      \
        val0_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);             \
        val1_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);     \
                                                                      \
        SW(val0_m, pdst_m1);                                          \
        SW(val1_m, pdst_m1 + 4);                                      \
    }
#endif // (__mips_isa_rev >= 6)
225 
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride (byte offset between consecutive loads)
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1 (and out2, out3 for LD4)
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
                 LD4 continues the pattern for two more rows.
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}
260 
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
   Note        : Semicolon added after the first SW() for consistency with
                 the following lines; the original relied on SW expanding
                 to a braced block, which breaks if SW ever becomes a
                 do { } while (0) macro.
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}
275 
/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
   Note        : Semicolon added after the first SD() for consistency with
                 the following lines; the original relied on SD expanding
                 to a braced block, which breaks if SD ever becomes a
                 do { } while (0) macro.
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}
290 
/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride (byte offset between consecutive rows)
                 Outputs - out0, out1, ... (one 16-byte vector per row)
                 Return Type - as per RTYPE
   Details     : Loads 16 byte elements in 'out0' from (psrc),
                 in 'out1' from (psrc + stride), and so on.
                 Odd row counts (3, 5, 7) are built from the even-count
                 macros plus one or two extra loads.
*/
#define LD_B2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_B(RTYPE, (psrc));                 \
    out1 = LD_B(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_B2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_B2(v16i8, __VA_ARGS__)

#define LD_B3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_B2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_B(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_B3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_B3(v16i8, __VA_ARGS__)

#define LD_B4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_B2(RTYPE, (psrc), stride, out0, out1);               \
    LD_B2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_B4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_B4(v16i8, __VA_ARGS__)

#define LD_B5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_B(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_B5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_B5(v16i8, __VA_ARGS__)

#define LD_B6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_SB6(...) LD_B6(v16i8, __VA_ARGS__)

#define LD_B7(RTYPE, psrc, stride,                                 \
              out0, out1, out2, out3, out4, out5, out6)            \
{                                                                  \
    LD_B5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);    \
    LD_B2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);         \
}
#define LD_SB7(...) LD_B7(v16i8, __VA_ARGS__)

#define LD_B8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_B4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_B4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_B8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_B8(v16i8, __VA_ARGS__)
354 
/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1, ... (one 8-halfword vector per row)
   Details     : Loads 8 halfword elements in 'out0' from (psrc),
                 in 'out1' from (psrc + stride), and so on; larger
                 variants are composed from LD_H2/LD_H4.
*/
#define LD_H2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_H(RTYPE, (psrc));                 \
    out1 = LD_H(RTYPE, (psrc) + (stride));      \
}
#define LD_UH2(...) LD_H2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_H2(v8i16, __VA_ARGS__)

#define LD_H4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_H2(RTYPE, (psrc), stride, out0, out1);               \
    LD_H2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UH4(...) LD_H4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_H4(v8i16, __VA_ARGS__)

#define LD_H6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UH6(...) LD_H6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_H6(v8i16, __VA_ARGS__)

#define LD_H8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_H4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_H4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UH8(...) LD_H8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_H8(v8i16, __VA_ARGS__)

#define LD_H16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_H8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_H8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_H16(v8i16, __VA_ARGS__)

/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - signed word
*/
#define LD_SW2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SW((psrc));                 \
    out1 = LD_SW((psrc) + stride);        \
}
417 
/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs  - in0, in1, ..., stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores 16 byte elements from 'in0' to (pdst),
                 from 'in1' to (pdst + stride), and so on; larger
                 variants are composed from ST_B2/ST_B4.
*/
#define ST_B2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_B(RTYPE, in0, (pdst));                 \
    ST_B(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_B2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_B2(v16i8, __VA_ARGS__)

#define ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_B2(RTYPE, in0, in1, (pdst), stride);               \
    ST_B2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_B4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_B4(v16i8, __VA_ARGS__)

#define ST_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
              pdst, stride)                                         \
{                                                                   \
    ST_B4(RTYPE, in0, in1, in2, in3, pdst, stride);                 \
    ST_B4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);  \
}
#define ST_UB8(...) ST_B8(v16u8, __VA_ARGS__)

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs  - in0, in1, ..., stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores 8 halfword elements from 'in0' to (pdst),
                 from 'in1' to (pdst + stride), and so on.
*/
#define ST_H2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_H(RTYPE, in0, (pdst));                 \
    ST_H(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UH2(...) ST_H2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_H2(v8i16, __VA_ARGS__)

#define ST_H4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_H2(RTYPE, in0, in1, (pdst), stride);               \
    ST_H2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_SH4(...) ST_H4(v8i16, __VA_ARGS__)

#define ST_H6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_H2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_H6(v8i16, __VA_ARGS__)

#define ST_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_H4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_H4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_SH8(...) ST_H8(v8i16, __VA_ARGS__)

/* Description : Store vectors of word elements with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
                 Return Type - signed word
   Details     : Stores 4 word elements from 'in0' to (pdst)
                 Stores 4 word elements from 'in1' to (pdst + stride)
*/
#define ST_SW2(in0, in1, pdst, stride)  \
{                                       \
    ST_SW(in0, (pdst));                 \
    ST_SW(in1, (pdst) + stride);        \
}
495 
/* Description : Store as 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
                 Return Type - unsigned byte
   Details     : Halfword elements stidx, stidx+1, stidx+2, stidx+3 are
                 extracted from 'in' and stored as 2-byte writes on four
                 consecutive rows (pdst, pdst + stride, ...).
*/
#define ST2x4_UB(in, stidx, pdst, stride)              \
{                                                      \
    uint16_t out0_m, out1_m, out2_m, out3_m;           \
    uint8_t *pblk_2x4_m = (uint8_t *) (pdst);          \
                                                       \
    out0_m = __msa_copy_u_h((v8i16) in, (stidx));      \
    out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1));  \
    out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2));  \
    out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3));  \
                                                       \
    SH(out0_m, pblk_2x4_m);                            \
    SH(out1_m, pblk_2x4_m + stride);                   \
    SH(out2_m, pblk_2x4_m + 2 * stride);               \
    SH(out3_m, pblk_2x4_m + 3 * stride);               \
}

/* Description : Store as 4x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
                 Return Type - unsigned byte
   Details     : Word elements 0 and 1 of 'in' are extracted and stored as
                 4-byte writes on two consecutive rows.
*/
#define ST4x2_UB(in, pdst, stride)             \
{                                              \
    uint32_t out0_m, out1_m;                   \
    uint8_t *pblk_4x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in, 0);    \
    out1_m = __msa_copy_u_w((v4i32) in, 1);    \
                                               \
    SW(out0_m, pblk_4x2_m);                    \
    SW(out1_m, pblk_4x2_m + stride);           \
}
543 
/* Description : Store as 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0..idx3, pdst, stride
                 Return Type - unsigned byte
   Details     : Word element idx0 of 'in0' goes to row 0, idx1 of 'in0' to
                 row 1, idx2 of 'in1' to row 2, idx3 of 'in1' to row 3.
*/
#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                                 \
    uint32_t out0_m, out1_m, out2_m, out3_m;                      \
    uint8_t *pblk_4x4_m = (uint8_t *) (pdst);                     \
                                                                  \
    out0_m = __msa_copy_u_w((v4i32) in0, idx0);                   \
    out1_m = __msa_copy_u_w((v4i32) in0, idx1);                   \
    out2_m = __msa_copy_u_w((v4i32) in1, idx2);                   \
    out3_m = __msa_copy_u_w((v4i32) in1, idx3);                   \
                                                                  \
    SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride);      \
}
/* 4x8 block: words 0..3 of 'in0' fill the first four rows, words 0..3 of
 * 'in1' the next four. */
#define ST4x8_UB(in0, in1, pdst, stride)                           \
{                                                                  \
    uint8_t *pblk_4x8 = (uint8_t *) (pdst);                        \
                                                                   \
    ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride);              \
    ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
}
575 
/* Description : Store as 6x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
                 Return Type - unsigned byte
   Details     : Each of the four rows is written as a 4-byte word followed
                 by a 2-byte halfword (6 bytes total):
                 row 0: word 0 of 'in0' + halfword 2 of 'in0'
                 row 1: word 2 of 'in0' + halfword 6 of 'in0'
                 row 2: word 0 of 'in1' + halfword 2 of 'in1'
                 row 3: word 2 of 'in1' + halfword 6 of 'in1'
*/
#define ST6x4_UB(in0, in1, pdst, stride)       \
{                                              \
    uint32_t out0_m, out1_m, out2_m, out3_m;   \
    uint16_t out4_m, out5_m, out6_m, out7_m;   \
    uint8_t *pblk_6x4_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_w((v4i32) in0, 0);   \
    out1_m = __msa_copy_u_w((v4i32) in0, 2);   \
    out2_m = __msa_copy_u_w((v4i32) in1, 0);   \
    out3_m = __msa_copy_u_w((v4i32) in1, 2);   \
                                               \
    out4_m = __msa_copy_u_h((v8i16) in0, 2);   \
    out5_m = __msa_copy_u_h((v8i16) in0, 6);   \
    out6_m = __msa_copy_u_h((v8i16) in1, 2);   \
    out7_m = __msa_copy_u_h((v8i16) in1, 6);   \
                                               \
    SW(out0_m, pblk_6x4_m);                    \
    SH(out4_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out1_m, pblk_6x4_m);                    \
    SH(out5_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out2_m, pblk_6x4_m);                    \
    SH(out6_m, (pblk_6x4_m + 4));              \
    pblk_6x4_m += stride;                      \
    SW(out3_m, pblk_6x4_m);                    \
    SH(out7_m, (pblk_6x4_m + 4));              \
}
617 
/* Description : Store as 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Double word element 0 of 'in' is copied and stored at (pdst).
*/
#define ST8x1_UB(in, pdst)                   \
{                                            \
    uint64_t out0_m;                         \
    out0_m = __msa_copy_u_d((v2i64) in, 0);  \
    SD(out0_m, pdst);                        \
}

/* Description : Store as 8x2 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst, stride
   Details     : Double word element 0 of 'in' is stored at (pdst) and
                 element 1 at (pdst + stride).
*/
#define ST8x2_UB(in, pdst, stride)             \
{                                              \
    uint64_t out0_m, out1_m;                   \
    uint8_t *pblk_8x2_m = (uint8_t *) (pdst);  \
                                               \
    out0_m = __msa_copy_u_d((v2i64) in, 0);    \
    out1_m = __msa_copy_u_d((v2i64) in, 1);    \
                                               \
    SD(out0_m, pblk_8x2_m);                    \
    SD(out1_m, pblk_8x2_m + stride);           \
}

/* Description : Store as 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Double word elements 0 and 1 of 'in0' fill rows 0 and 1;
                 elements 0 and 1 of 'in1' fill rows 2 and 3.
*/
#define ST8x4_UB(in0, in1, pdst, stride)                      \
{                                                             \
    uint64_t out0_m, out1_m, out2_m, out3_m;                  \
    uint8_t *pblk_8x4_m = (uint8_t *) (pdst);                 \
                                                              \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                  \
    out1_m = __msa_copy_u_d((v2i64) in0, 1);                  \
    out2_m = __msa_copy_u_d((v2i64) in1, 0);                  \
    out3_m = __msa_copy_u_d((v2i64) in1, 1);                  \
                                                              \
    SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride);  \
}
/* 8x8 block: two stacked 8x4 blocks. */
#define ST8x8_UB(in0, in1, in2, in3, pdst, stride)        \
{                                                         \
    uint8_t *pblk_8x8_m = (uint8_t *) (pdst);             \
                                                          \
    ST8x4_UB(in0, in1, pblk_8x8_m, stride);               \
    ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride);  \
}
/* 12x4 block: an 8x4 block from in0/in1 plus a 4x4 block from in2 at
 * byte offset 8 of each row. */
#define ST12x4_UB(in0, in1, in2, pdst, stride)                \
{                                                             \
    uint8_t *pblk_12x4_m = (uint8_t *) (pdst);                \
                                                              \
    /* left 8x4 */                                            \
    ST8x4_UB(in0, in1, pblk_12x4_m, stride);                  \
    /* right 4x4 */                                           \
    ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride);  \
}
689 
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : One row per input vector: double word element 0 of 'inN'
                 is stored at the row start, followed by word element 2 of
                 the same vector at byte offset 8 (12 bytes per row).
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m = __msa_copy_u_w((v4i32) in0, 2);                             \
    out9_m = __msa_copy_u_w((v4i32) in1, 2);                             \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}
749 
/* Description : Immediate number of columns to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from a zero vector are slid into 'in0' by
                 the number of elements specified by 'slide_val'
                 (i.e. the input is shifted with zero fill).
*/
#define SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val)                 \
{                                                                         \
    v16i8 zero_m = { 0 };                                                 \
    out0 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) zero_m, (v16i8) in1, slide_val);  \
}
#define SLDI_B2_0_UB(...) SLDI_B2_0(v16u8, __VA_ARGS__)

#define SLDI_B4_0(RTYPE, in0, in1, in2, in3,            \
                  out0, out1, out2, out3, slide_val)    \
{                                                       \
    SLDI_B2_0(RTYPE, in0, in1, out0, out1, slide_val);  \
    SLDI_B2_0(RTYPE, in2, in3, out2, out3, slide_val);  \
}
#define SLDI_B4_0_SB(...) SLDI_B4_0(v16i8, __VA_ARGS__)

/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' are slid into 'in1_0' by the
                 number of elements specified by 'slide_val' (general
                 two-source variant of SLDI_B2_0).
*/
#define SLDI_B2(RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val)  \
{                                                                          \
    out0 = (RTYPE) __msa_sldi_b((v16i8) in0_0, (v16i8) in1_0, slide_val);  \
    out1 = (RTYPE) __msa_sldi_b((v16i8) in0_1, (v16i8) in1_1, slide_val);  \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
788 
789 
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0
                 as per control vector mask0; from in2 & in3 to out1 as per
                 mask1.  Note the intrinsic takes (mask, second, first), so
                 the pair order is (in1, in0).
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

/* Four shuffles over the same (in0, in1) pair with four masks. */
#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,   \
                out0, out1, out2, out3)                        \
{                                                              \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1); \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3); \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Same as VSHF_B2 but operates on 32-bit word elements
                 (__msa_vshf_w).
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)
840 
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed halfword
   Details     : Signed byte elements from mult0 are multiplied with signed
                 byte elements from cnst0, producing results twice the input
                 width (signed halfword); products of adjacent odd-even
                 element pairs are summed into the output vector.
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - signed word
   Details     : Same pairwise multiply-and-add as DOTP_SB2, but on signed
                 halfword inputs producing signed word results.
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)
903 
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 In/Outs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE (signed halfword results)
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0', producing results twice
                 the input width (signed halfword). The products of each
                 adjacent odd-even element pair are summed and ADDED to the
                 existing contents of the output vector (accumulate)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                  \
                                   (v16i8) mult0, (v16i8) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                  \
                                   (v16i8) mult1, (v16i8) cnst1); \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

/* 4-pair variant of DPADD_SB2 */
#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                  \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{                                                                     \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);         \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);         \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)
932 
/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 In/Outs - out0, out1 (accumulated in place)
                 Return Type - as per RTYPE (signed word results)
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0', producing results
                 twice the input width (signed word). The products of each
                 adjacent odd-even element pair are summed and ADDED to the
                 existing contents of the output vector (accumulate)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)  \
{                                                                 \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                  \
                                   (v8i16) mult0, (v8i16) cnst0); \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                  \
                                   (v8i16) mult1, (v8i16) cnst1); \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

/* 4-pair variant of DPADD_SH2 */
#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                  \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3) \
{                                                                     \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);         \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);         \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)
961 
/* Description : Clips all halfword elements of the input vector between
                 min & max:
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs - in (input vector)
                        - min (vector of per-element min thresholds)
                        - max (vector of per-element max thresholds)
                 Output - out_m (statement-expression value; vector with
                          clipped elements)
                 Return Type - signed halfword
   Details     : Implemented as max(min, in) followed by min(max, ...)
*/
#define CLIP_SH(in, min, max)                          \
( {                                                    \
    v8i16 out_m;                                       \
                                                       \
    out_m = __msa_max_s_h((v8i16) min, (v8i16) in);    \
    out_m = __msa_min_s_h((v8i16) max, (v8i16) out_m); \
    out_m;                                             \
} )
978 
/* Description : Clips all signed halfword elements of the input vector
                 between 0 & 255
   Arguments   : Input  - in (input vector)
                 Output - out_m (statement-expression value; vector with
                          clipped elements)
                 Return Type - signed halfword
   Details     : maxi(in, 0) clamps the lower bound, then min against a
                 vector of 255 clamps the upper bound
*/
#define CLIP_SH_0_255(in)                                 \
( {                                                       \
    v8i16 max_m = __msa_ldi_h(255);                       \
    v8i16 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_h((v8i16) in, 0);                \
    out_m = __msa_min_s_h((v8i16) max_m, (v8i16) out_m);  \
    out_m;                                                \
} )
/* 2- and 4-vector in-place variants */
#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    in0 = CLIP_SH_0_255(in0);     \
    in1 = CLIP_SH_0_255(in1);     \
}
#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}
1004 
/* Description : Clips all signed word elements of the input vector
                 between 0 & 255
   Arguments   : Input  - in (input vector)
                 Output - out_m (statement-expression value; vector with
                          clipped elements)
                 Return Type - signed word
   Details     : Word-width counterpart of CLIP_SH_0_255
*/
#define CLIP_SW_0_255(in)                                 \
( {                                                       \
    v4i32 max_m = __msa_ldi_w(255);                       \
    v4i32 out_m;                                          \
                                                          \
    out_m = __msa_maxi_s_w((v4i32) in, 0);                \
    out_m = __msa_min_s_w((v4i32) max_m, (v4i32) out_m);  \
    out_m;                                                \
} )
1020 
/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Adjacent byte pairs within each input vector are subtracted
                 pairwise and the halfword differences are stored to the
                 corresponding output vector.
                 NOTE(review): per the MSA HSUB_U.H definition the
                 odd-positioned element minus the even-positioned element is
                 computed; a previous comment stated the opposite order -
                 confirm against the MSA ISA manual
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)
1036 
/* Description : Insert 4 word elements (GP register values) into the
                 destination vector at indices 0..3
   Arguments   : Inputs - in0, in1, in2, in3 (scalar word values)
                 In/Out - out (destination vector, updated in place)
                 Return Type - as per RTYPE
*/
#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)     \
{                                                     \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2); \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3); \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)
1047 
/* Description : Insert 2 double word elements (GP register values) into the
                 destination vector at indices 0..1
   Arguments   : Inputs - in0, in1 (scalar double word values)
                 In/Out - out (destination vector, updated in place)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out)                \
{                                                      \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0); \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1); \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)
1062 
/* Description : Interleave even halfword elements from vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and copied to 'out0'; likewise 'in2'/'in3' into 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)

/* Description : Interleave even word elements from vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and 'in1' are interleaved
                 and copied to 'out0'; likewise 'in2'/'in3' into 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
}
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)

/* Description : Interleave even double word elements from vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and copied to 'out0'; likewise 'in2'/'in3' into 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1110 
/* Description : Interleave left half of byte elements from vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Left halves of byte elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; similar for the
                 remaining pairs
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                        \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
}
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
{                                                              \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1137 
/* Description : Interleave left half of halfword elements from vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Left halves of halfword elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; similar for the
                 remaining pairs
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                        \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
{                                                              \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1162 
/* Description : Interleave left half of word elements from vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left halves of word elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; likewise 'in2'/'in3'
                 into 'out1'
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                        \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
}
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1178 
/* Description : Interleave right half of byte elements from vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Right halves of byte elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; similar for the
                 remaining pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                        \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{                                                                      \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);             \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
{                                                              \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1218 
/* Description : Interleave right half of halfword elements from vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Right halves of halfword elements of 'in0' and 'in1' are
                 interleaved and copied to 'out0'; similar for the
                 remaining pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                        \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{                                                                      \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);             \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
{                                                              \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1251 
/* Description : Interleave right half of word elements from vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
*/
#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                        \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
{                                                              \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1267 
/* Description : Interleave right half of double word elements from
                 vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Right halves of double word elements of 'in0' and 'in1'
                 are interleaved and copied to 'out0'; similar for the
                 remaining pairs
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                            \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1)); \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) (in2), (v2i64) (in3)); \
}
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{                                                                      \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) (in4), (v2i64) (in5));         \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                out0, out1, out2, out3)                        \
{                                                              \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1299 
/* Description : Interleave both left and right halves of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of elements from 'in0' and 'in1' are interleaved
                 and stored to 'out0'; left half of elements from 'in0' and
                 'in1' are interleaved and stored to 'out1'
                 (byte / halfword / word element variants below)
*/
#define ILVRL_B2(RTYPE, in0, in1, out0, out1)            \
{                                                        \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
}
#define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
#define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)

#define ILVRL_H2(RTYPE, in0, in1, out0, out1)            \
{                                                        \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
}
#define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
#define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
#define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)

#define ILVRL_W2(RTYPE, in0, in1, out0, out1)            \
{                                                        \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
}
#define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
#define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
1334 
/* Description : Maximum of signed halfword elements and a 5-bit signed
                 immediate value, written back in place
   Arguments   : Inputs  - in0..in3, max_val (5-bit signed immediate)
                 Outputs - in0..in3 (in place)
                 Return Type - as per RTYPE
   Details     : Per-element max of 'in0' and 'max_val' is written back
                 to 'in0'; similar for the other vectors
*/
#define MAXI_SH2(RTYPE, in0, in1, max_val)               \
{                                                        \
    in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, (max_val)); \
    in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, (max_val)); \
}
#define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)

#define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val)  \
{                                                     \
    MAXI_SH2(RTYPE, in0, in1, max_val);               \
    MAXI_SH2(RTYPE, in2, in3, max_val);               \
}
#define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1356 
/* Description : Saturate unsigned halfword elements to the maximum
                 unsigned value representable in (sat_val + 1) bits;
                 element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element of 'in0' is clamped to the
                 (sat_val + 1)-bit unsigned range, in place
*/
#define SAT_UH2(RTYPE, in0, in1, sat_val)            \
{                                                    \
    in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
    in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
}
#define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1373 
/* Saturate 4 vectors of unsigned halfword elements to (sat_val + 1) bits,
   in place. Fix: the second SAT_UH2 invocation was missing its terminating
   semicolon - harmless while SAT_UH2 expands to a brace block, but
   inconsistent with SAT_SH4/SAT_SW4 and fragile if SAT_UH2 is ever
   rewritten as a do { } while (0) macro. */
#define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_UH2(RTYPE, in0, in1, sat_val);               \
    SAT_UH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1380 
/* Description : Saturate signed halfword elements to the maximum signed
                 value representable in (sat_val + 1) bits; element data
                 width remains unchanged
                 (previous comment incorrectly described the unsigned
                 variant; __msa_sat_s_h performs signed saturation)
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed halfword element of 'in0' is clamped to the
                 (sat_val + 1)-bit signed range, in place
*/
#define SAT_SH2(RTYPE, in0, in1, sat_val)            \
{                                                    \
    in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
    in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
}
#define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1397 
/* Saturate 3 vectors of signed halfword elements to (sat_val + 1) bits,
   in place. Fix: the SAT_SH2 invocation was missing its terminating
   semicolon - harmless while SAT_SH2 expands to a brace block, but
   inconsistent with the sibling SAT_* macros and fragile if SAT_SH2 is
   ever rewritten as a do { } while (0) macro. */
#define SAT_SH3(RTYPE, in0, in1, in2, sat_val)         \
{                                                      \
    SAT_SH2(RTYPE, in0, in1, sat_val);                 \
    in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
}
#define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1404 
/* Saturate 4 vectors of signed halfword elements to (sat_val + 1) bits,
   in place (see SAT_SH2). */
#define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SH2(RTYPE, in0, in1, sat_val);               \
    SAT_SH2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1411 
/* Description : Saturate signed word elements to the maximum signed value
                 representable in (sat_val + 1) bits; element data width
                 remains unchanged
                 (previous comment said "unsigned word"; __msa_sat_s_w
                 performs signed saturation)
   Arguments   : Inputs  - in0..in3, sat_val
                 Outputs - in0..in3 (in place)
                 Return Type - as per RTYPE
   Details     : Each signed word element of 'in0' is clamped to the
                 (sat_val + 1)-bit signed range, in place
*/
#define SAT_SW2(RTYPE, in0, in1, sat_val)            \
{                                                    \
    in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
    in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
}
#define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)

#define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val)  \
{                                                    \
    SAT_SW2(RTYPE, in0, in1, sat_val);               \
    SAT_SW2(RTYPE, in2, in3, sat_val);               \
}
#define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1435 
/* Description : Indexed halfword element values are replicated to all
                 elements of the output vectors
   Arguments   : Inputs  - in, idx0, idx1 (and idx2, idx3 for the 4-output
                           variant)
                 Outputs - out0, out1 (out2, out3)
                 Return Type - as per RTYPE
   Details     : The 'idx0'-th halfword element of 'in' is replicated across
                 all elements of 'out0'; similar for the other indices.
                 Valid index range for halfword operation is 0-7
*/
#define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1)  \
{                                                     \
    out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0);  \
    out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1);  \
}
#define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
#define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)

#define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3,  \
                  out0, out1, out2, out3)             \
{                                                     \
    SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1);     \
    SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3);     \
}
#define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
#define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1461 
/* Description : Indexed word element values are replicated to all elements
                 of the output vectors
   Arguments   : Inputs  - in, stidx (starting index)
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : The 'stidx'-th word element of 'in' is replicated across
                 all elements of 'out0', and the '(stidx + 1)'-th element
                 across 'out1'. Valid index range for word operation is 0-3,
                 so 'stidx' must be <= 2 here. SPLATI_W4 replicates elements
                 0..3 into out0..out3 respectively
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)          \
{                                                        \
    out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx);    \
    out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
}
#define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
#define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)

#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, in, 0, out0, out1);              \
    SPLATI_W2(RTYPE, in, 2, out2, out3);              \
}
#define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
#define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1488 
/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0..in3 (in4..in7 for wider variants)
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' and even byte elements of 'in1' to the right half;
                 similar for the remaining pairs
*/
#define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
    out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
}
#define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
#define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
#define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
#define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)

#define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
{                                                                       \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);                    \
    out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5);             \
}
#define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
#define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)

#define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
#define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
#define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
#define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1528 
/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' and even halfword elements of 'in1' to the right
                 half; similar for the remaining pairs
*/
#define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
}
#define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
#define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)

#define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
#define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1556 
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left
                 half of 'out0' and even double word elements of 'in1' to
                 the right half; similar for the remaining pairs
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)   \
{                                                         \
    out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
    out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
}
#define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
#define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
#define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)

#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1584 
/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1, ... (2 to 8 vectors)
                 Outputs - same vectors, updated in place
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element of every input vector is xor'ed
                 with 128 and the result is stored back in place; commonly
                 used to toggle between signed and unsigned byte ranges.
                 Wider variants (3..8 vectors) are composed from XORI_B2_128
                 and XORI_B3_128/XORI_B4_128
*/
#define XORI_B2_128(RTYPE, in0, in1)               \
{                                                  \
    in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128);  \
    in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128);  \
}
#define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
#define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
#define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)

#define XORI_B3_128(RTYPE, in0, in1, in2)          \
{                                                  \
    XORI_B2_128(RTYPE, in0, in1);                  \
    in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128);  \
}
#define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)

#define XORI_B4_128(RTYPE, in0, in1, in2, in3)  \
{                                               \
    XORI_B2_128(RTYPE, in0, in1);               \
    XORI_B2_128(RTYPE, in2, in3);               \
}
#define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
#define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
#define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)

#define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4)  \
{                                                    \
    XORI_B3_128(RTYPE, in0, in1, in2);               \
    XORI_B2_128(RTYPE, in3, in4);                    \
}
#define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)

#define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5)  \
{                                                         \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);               \
    XORI_B2_128(RTYPE, in4, in5);                         \
}
#define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)

#define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6)  \
{                                                              \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                    \
    XORI_B3_128(RTYPE, in4, in5, in6);                         \
}
#define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)

#define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7)  \
{                                                                   \
    XORI_B4_128(RTYPE, in0, in1, in2, in3);                         \
    XORI_B4_128(RTYPE, in4, in5, in6, in7);                         \
}
#define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1649 
/* Description : Addition of signed halfword elements with signed saturation
   Arguments   : Inputs  - in0..in7
                 Outputs - out0..out3
                 Return Type - as per RTYPE
   Details     : Signed halfword elements of 'in0' are added to signed
                 halfword elements of 'in1' with the result saturated to
                 the signed halfword range [-32768, 32767]; similar for
                 the other pairs
*/
#define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
    out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
}
#define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)

#define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3)                        \
{                                                               \
    ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1);            \
    ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3);            \
}
#define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
#define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1674 
/* Description : Left-shift every element of four vectors, in place
                 (type-generic: works for any vector or scalar type)
   Arguments   : Inputs  - in0, in1, in2, in3, shift (bit count)
                 Outputs - in0, in1, in2, in3 (in place)
   Details     : Each of the four operands is shifted left by 'shift' bits
                 using compound assignment and written back
*/
#define SLLI_4V(in0, in1, in2, in3, shift)  \
{                                           \
    in0 <<= shift;                          \
    in1 <<= shift;                          \
    in2 <<= shift;                          \
    in3 <<= shift;                          \
}
1690 
/* Description : Arithmetic right-shift every element of four vectors,
                 in place (type-generic: works for any vector or scalar
                 type)
   Arguments   : Inputs  - in0, in1, in2, in3, shift (GP variable bit count)
                 Outputs - in0, in1, in2, in3 (in place)
   Details     : Each of the four operands is shifted right by 'shift' bits
                 using compound assignment and written back
*/
#define SRA_4V(in0, in1, in2, in3, shift)  \
{                                          \
    in0 >>= shift;                         \
    in1 >>= shift;                         \
    in2 >>= shift;                         \
    in3 >>= shift;                         \
}
1708 
/* Description : Shift right logical all halfword elements of four vectors
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in0, in1, in2, in3 (in place)
                 Return Type - as per RTYPE (typically unsigned halfword)
   Details     : Each halfword element of 'inN' is logically right shifted
                 by the amount held in the corresponding element of the
                 vector 'shift' (srl.h takes a per-element shift count);
                 results are written back in place.
*/
#define SRL_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                           \
    in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift);  \
}
#define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1727 
/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each halfword element of 'inN' is arithmetically right
                 shifted by the amount held in the corresponding element of
                 the vector 'shift'; the last bit shifted out is added back
                 in (rounding) and the result is written in place.
                 'shift' is a vector here — see SRARI_H2 for the
                 immediate-operand variant.
*/
#define SRAR_H2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
}
#define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
#define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
1746 
/* Rounded arithmetic right shift (vector shift counts) of three halfword
   vectors in place; identical expansion, in the same order, to
   SRAR_H2 followed by one extra srar.h on 'in2'. */
#define SRAR_H3(RTYPE, in0, in1, in2, shift)                 \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
}
#define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
1753 
/* Rounded arithmetic right shift (vector shift counts) of four halfword
   vectors in place; same four srar.h operations, in the same order, as two
   SRAR_H2 invocations. */
#define SRAR_H4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift);  \
    in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift);  \
    in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift);  \
    in3 = (RTYPE) __msa_srar_h((v8i16) in3, (v8i16) shift);  \
}
#define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
#define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
1761 
/* Description : Shift right arithmetic rounded words
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of 'inN' is arithmetically right shifted
                 by the amount held in the corresponding element of the
                 vector 'shift'; the last bit shifted out is added back in
                 (rounding) and the result is written in place.
                 'shift' is a vector here — see SRARI_W2 for the
                 immediate-operand variant.
*/
#define SRAR_W2(RTYPE, in0, in1, shift)                      \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
}
#define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
1779 
/* Rounded arithmetic right shift (vector shift counts) of four word
   vectors in place; same four srar.w operations, in the same order, as two
   SRAR_W2 invocations. */
#define SRAR_W4(RTYPE, in0, in1, in2, in3, shift)            \
{                                                            \
    in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift);  \
    in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift);  \
    in2 = (RTYPE) __msa_srar_w((v4i32) in2, (v4i32) shift);  \
    in3 = (RTYPE) __msa_srar_w((v4i32) in3, (v4i32) shift);  \
}
#define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
1786 
/* Description : Shift right arithmetic rounded halfwords (immediate count)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each halfword element of 'inN' is arithmetically right
                 shifted by the immediate 'shift'; the last bit shifted out
                 is added back in (rounding) and the result is written in
                 place. 'shift' must be a compile-time constant for srari.h.
*/
#define SRARI_H2(RTYPE, in0, in1, shift)           \
{                                                  \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
}
#define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
#define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
1804 
/* Rounded arithmetic right shift by an immediate of four halfword vectors
   in place; same four srari.h operations, in the same order, as two
   SRARI_H2 invocations. */
#define SRARI_H4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift);  \
    in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift);  \
    in2 = (RTYPE) __msa_srari_h((v8i16) in2, shift);  \
    in3 = (RTYPE) __msa_srari_h((v8i16) in3, shift);  \
}
#define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
#define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
1812 
/* Description : Shift right arithmetic rounded words (immediate count)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Each word element of 'inN' is arithmetically right shifted
                 by the immediate 'shift'; the last bit shifted out is added
                 back in (rounding) and the result is written in place.
                 'shift' must be a compile-time constant for srari.w.
*/
#define SRARI_W2(RTYPE, in0, in1, shift)              \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
}
#define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
1829 
/* Rounded arithmetic right shift by an immediate of four word vectors in
   place; same four srari.w operations, in the same order, as two SRARI_W2
   invocations. (The _SH alias reinterprets the results as v8i16.) */
#define SRARI_W4(RTYPE, in0, in1, in2, in3, shift)    \
{                                                     \
    in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift);  \
    in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift);  \
    in2 = (RTYPE) __msa_srari_w((v4i32) in2, shift);  \
    in3 = (RTYPE) __msa_srari_w((v4i32) in3, shift);  \
}
#define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
#define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
1837 
/* Description : Element-wise multiplication of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : out0 = in0 * in1 and out1 = in2 * in3, element-wise.
   Fix         : macro arguments are parenthesized so that expression
                 arguments (e.g. MUL2(a + b, c, ...)) keep the intended
                 grouping instead of being re-associated by operator
                 precedence.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
/* Four-pair variant; delegates to MUL2 (and inherits its argument
   parenthesization). */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{                                                                            \
    MUL2(in0, in1, in2, in3, out0, out1);                                    \
    MUL2(in4, in5, in6, in7, out2, out3);                                    \
}
1855 
/* Description : Element-wise addition of pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : out0 = in0 + in1 and out1 = in2 + in3, element-wise.
   Fix         : macro arguments are parenthesized so that expression
                 arguments keep the intended grouping instead of being
                 re-associated by operator precedence.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
/* Four-pair variant; delegates to ADD2 (and inherits its argument
   parenthesization). */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
{                                                                            \
    ADD2(in0, in1, in2, in3, out0, out1);                                    \
    ADD2(in4, in5, in6, in7, out2, out3);                                    \
}
1872 
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Inputs  - in (one vector of unsigned bytes)
                 Outputs - out0, out1 (two halfword vectors)
                 Return Type - signed halfword (v8i16 via ILVRL_B2_SH),
                               but each element holds a zero-extended byte
                               value in [0, 255]
   Details     : Interleaving a zero vector above 'in' places 0x00 in the
                 high byte of every halfword, i.e. zero extension.
                 out0 holds the extended right (low) half of 'in',
                 out1 the extended left (high) half.
*/
#define UNPCK_UB_SH(in, out0, out1)       \
{                                         \
    v16i8 zero_m = { 0 };                 \
                                          \
    ILVRL_B2_SH(zero_m, in, out0, out1);  \
}
1886 
/* Description : Sign extend halfword elements from input vector and return
                 result in a pair of word vectors
   Arguments   : Inputs  - in (one halfword vector)
                 Outputs - out0, out1 (two sign-extended word vectors)
                 Return Type - signed word
   Details     : clti_s.h compares each halfword of 'in' against 0, yielding
                 all-ones (0xFFFF) for negative elements and 0 otherwise —
                 exactly the sign-extension high halfword. Interleaving that
                 mask above 'in' produces 4 sign-extended words from the
                 right half in 'out0' and 4 from the left half in 'out1'.
*/
#define UNPCK_SH_SW(in, out0, out1)            \
{                                              \
    v8i16 tmp_m;                               \
                                               \
    tmp_m = __msa_clti_s_h((v8i16) in, 0);     \
    ILVRL_H2_SW(tmp_m, in, out0, out1);        \
}
1905 
/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : out0 = in0 + in3, out1 = in1 + in2 (sums),
                 out2 = in1 - in2, out3 = in0 - in3 (differences).
                 Outputs must not alias inputs, since the differences reuse
                 in0..in3 after the sums are written.
   Fix         : macro arguments are parenthesized so that expression
                 arguments keep the intended grouping instead of being
                 re-associated by operator precedence.
*/
#define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                \
    out0 = (in0) + (in3);                                        \
    out1 = (in1) + (in2);                                        \
                                                                 \
    out2 = (in1) - (in2);                                        \
    out3 = (in0) - (in3);                                        \
}
1919 
/* Description : Transposes input 4x4 byte block
   Arguments   : Inputs  - in0, in1, in2, in3 (rows of the 4x4 byte block,
                           in the low 4 bytes of each vector)
                 Outputs - out0, out1, out2, out3 (transposed rows)
                 Return Type - unsigned byte
   Details     : Doubleword/byte interleaves gather all 16 bytes into one
                 fully transposed vector (out0); each following row is then
                 peeled off by sliding the previous result down 4 bytes
                 (sldi.b against a zero vector).
*/
#define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v16i8 zero_m = { 0 };                                               \
    v16i8 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m);                         \
    ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m);                                \
                                                                        \
    out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m);                            \
    out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4);               \
    out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4);               \
    out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4);               \
}
1939 
/* Description : Transposes input 8x4 byte block into 4x8
   Arguments   : Inputs  - in0 .. in7 (8 rows of 4 bytes each, in the low
                           bytes of each vector)
                 Outputs - out0, out1, out2, out3 (4 rows of 8 bytes)
                 Return Type - as per RTYPE (unsigned byte for the _UB_UB
                               alias)
   Details     : Even-word and byte interleaves pair up the rows, then
                 halfword/word interleaves complete the transpose; out1 and
                 out3 are finally extracted from the upper doublewords of
                 out2/out0, so out0 and out2 are written before out1/out3.
*/
#define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                        out0, out1, out2, out3)                         \
{                                                                       \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
                                                                        \
    ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m);                    \
    tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m);                    \
                                                                        \
    tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m);                              \
    ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m);                        \
                                                                        \
    ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2);                        \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0);            \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2);            \
}

#define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
1964 
/* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
   Details     : Even-doubleword interleaves pair rows N and N+8, then
                 even/odd byte, halfword and word interleaves build the
                 transpose. out0..out7 double as scratch registers during
                 the intermediate steps, so the statement order is
                 significant.
   Fix         : removed two statements that recomputed tmp2_m and tmp3_m
                 with identical operands immediately after they were first
                 computed — pure dead work with no effect on the results.
*/
#define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7,        \
                            in8, in9, in10, in11, in12, in13, in14, in15,  \
                            out0, out1, out2, out3, out4, out5, out6, out7)\
{                                                                          \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
                                                                           \
    ILVEV_D2_UB(in0, in8, in1, in9, out7, out6);                           \
    ILVEV_D2_UB(in2, in10, in3, in11, out5, out4);                         \
    ILVEV_D2_UB(in4, in12, in5, in13, out3, out2);                         \
    ILVEV_D2_UB(in6, in14, in7, in15, out1, out0);                         \
                                                                           \
    tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7);            \
    tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7);            \
    tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5);            \
    tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5);            \
    out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3);              \
    tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3);            \
    out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1);              \
    tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1);            \
                                                                           \
    ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m);               \
    out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m);        \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5);            \
    out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m);           \
    out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
                                                                           \
    tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m);        \
    tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m);        \
    out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
    out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m);          \
}
2013 
/* Description : Transposes 8x8 block with halfword elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE (signed or unsigned halfword
                               via the _SH_SH / _UH_UH aliases)
   Details     : Right/left halfword interleaves of row pairs build eight
                 intermediate vectors; even doublewords are then packed
                 into the even outputs (pckev.d) and odd doublewords into
                 the odd outputs (pckod.d) to complete the transpose.
*/
#define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                       out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                        \
    v8i16 s0_m, s1_m;                                                    \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                \
                                                                         \
    ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                          \
    ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m);                             \
    ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m);                          \
    ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m);                             \
    ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                          \
    ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m);                             \
    ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m);                          \
    ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m);                             \
    PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,      \
             tmp3_m, tmp7_m, out0, out2, out4, out6);                    \
    out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m);        \
    out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m);        \
    out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m);        \
    out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m);        \
}
#define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
#define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2044 
/* Description : Transposes 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
   Details     : Word interleaves pair up rows 0/1 and 2/3; doubleword
                 right/left interleaves of those intermediates then yield
                 the four transposed rows.
*/
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                       \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                       \
                                                                        \
    ILVRL_W2_SW(in1, in0, s0_m, s1_m);                                  \
    ILVRL_W2_SW(in3, in2, s2_m, s3_m);                                  \
                                                                        \
    out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);            \
    out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);            \
    out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);            \
    out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);            \
}
2063 
/* Description : Pack even byte elements of two vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - value of the whole expression
                 Return Type - unsigned byte
   Details     : Even-indexed signed bytes of 'in0' (low half of result)
                 and 'in1' (high half) are packed into one vector; xoring
                 with 128 flips the sign bit, remapping the signed byte
                 range to unsigned.
                 Implemented as a GNU statement expression so it can be
                 used where an expression is expected.
*/
#define PCKEV_XORI128_UB(in0, in1)                            \
( {                                                           \
    v16u8 out_m;                                              \
    out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0);  \
    out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128);         \
    out_m;                                                    \
} )
2079 
/* Description : Pack even byte elements, extract words 0 & 2 from the pair
                 of results and store 4 words in destination memory with
                 the given stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : The even bytes of (in1, in0) are packed into tmp0_m and of
                 (in3, in2) into tmp1_m; word elements 0 and 2 of each
                 packed vector (the low word of each half) are copied to GP
                 registers and stored as four 32-bit words 'stride' bytes
                 apart starting at 'pdst' (via SW4).
*/
#define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride)  \
{                                                         \
    uint32_t out0_m, out1_m, out2_m, out3_m;              \
    v16i8 tmp0_m, tmp1_m;                                 \
                                                          \
    PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m);      \
                                                          \
    out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0);           \
    out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2);           \
    out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0);           \
    out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2);           \
                                                          \
    SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride);    \
}
2099 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */