/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

#if HAVE_MSA2
#include <msa2.h>
#endif

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))
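
/* Usage sketch (illustrative, not part of the header): ALLOC_ALIGNED(align)
 * over-aligns to twice 'align', so a scratch buffer for 16-byte MSA vectors
 * is typically declared as:
 *
 *     uint8_t tmp_buf[16 * 8] ALLOC_ALIGNED(ALIGNMENT);  // 32-byte aligned
 */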

#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)

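/* Usage sketch (illustrative): LD_V/ST_V cast the pointer to the vector type
 * and let the compiler emit the vector load/store, so a 16-byte row copy is
 * simply:
 *
 *     v16u8 row = LD_UB(src);
 *     ST_UB(row, dst);
 */
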
#if (__mips_isa_rev >= 6)
    #define LH(psrc)                              \
    ( {                                           \
        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
        val_lh_m;                                 \
    } )

    #define LW(psrc)                              \
    ( {                                           \
        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
        val_lw_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                              \
        ( {                                           \
            uint64_t val_ld_m = *(uint64_t *)(psrc);  \
            val_ld_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    #define LW(psrc)                                    \
    ( {                                                 \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);        \
        uint32_t val_lw_m;                              \
                                                        \
        __asm__ volatile (                              \
            "lwr  %[val_lw_m],  0(%[psrc_lw_m])  \n\t"  \
            "lwl  %[val_lw_m],  3(%[psrc_lw_m])  \n\t"  \
                                                        \
            : [val_lw_m] "=&r" (val_lw_m)               \
            : [psrc_lw_m] "r" (psrc_lw_m)               \
        );                                              \
                                                        \
        val_lw_m;                                       \
    } )

    #if (__mips == 64)
        #define LD(psrc)                                    \
        ( {                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);        \
            uint64_t val_ld_m = 0;                          \
                                                            \
            __asm__ volatile (                              \
                "ldr  %[val_ld_m],  0(%[psrc_ld_m])  \n\t"  \
                "ldl  %[val_ld_m],  7(%[psrc_ld_m])  \n\t"  \
                                                            \
                : [val_ld_m] "=&r" (val_ld_m)               \
                : [psrc_ld_m] "r" (psrc_ld_m)               \
            );                                              \
                                                            \
            val_ld_m;                                       \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                               \
    {                                                   \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);        \
        uint16_t val_sh_m = (val);                      \
                                                        \
        __asm__ volatile (                              \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"     \
                                                        \
            : [pdst_sh_m] "=m" (*pdst_sh_m)             \
            : [val_sh_m] "r" (val_sh_m)                 \
        );                                              \
    }

    #define SW(val, pdst)                               \
    {                                                   \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);        \
        uint32_t val_sw_m = (val);                      \
                                                        \
        __asm__ volatile (                              \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"     \
                                                        \
            : [pdst_sw_m] "=m" (*pdst_sw_m)             \
            : [val_sw_m] "r" (val_sw_m)                 \
        );                                              \
    }

    #define SD(val, pdst)                                             \
    {                                                                 \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                      \
        uint32_t val0_sd_m, val1_sd_m;                                \
                                                                      \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);          \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF);  \
                                                                      \
        SW(val0_sd_m, pdst_sd_m);                                     \
        SW(val1_sd_m, pdst_sd_m + 4);                                 \
    }
#endif  // (__mips_isa_rev >= 6)

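/* Usage sketch (illustrative): the scalar LH/LW/LD and SH/SW/SD macros hide
 * the unaligned-access strategy (plain loads on MIPSr6, lwr/lwl pairs or
 * unaligned-store asm before r6), so callers can read/write at any byte
 * offset:
 *
 *     uint64_t px = LD(src + 3);   // unaligned 8-byte load
 *     SD(px, dst + 1);             // unaligned 8-byte store
 */
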
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}

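/* Usage sketch (illustrative): gathering four 32-bit pixels that sit one
 * row apart, e.g. the source of a 4x4 block:
 *
 *     uint32_t w0, w1, w2, w3;
 *     LW4(src, stride, w0, w1, w2, w3);
 */
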
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}

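/* Usage sketch (illustrative): SW4/SD4 pair naturally with LW4/LD4 for
 * small block copies, e.g. an 8-byte-wide, 4-row copy via double words:
 *
 *     uint64_t d0, d1, d2, d3;
 *     LD4(src, src_stride, d0, d1, d2, d3);
 *     SD4(d0, d1, d2, d3, dst, dst_stride);
 */
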
/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_V2(RTYPE, (psrc), stride, out0, out1);               \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)
#define LD_SW4(...) LD_V4(v4i32, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)
#define LD_SW8(...) LD_V8(v4i32, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)

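/* Usage sketch (illustrative): loading an 8-row block of unsigned bytes,
 * e.g. the source rows of an 8x8 motion-compensation kernel:
 *
 *     v16u8 s0, s1, s2, s3, s4, s5, s6, s7;
 *     LD_UB8(src, stride, s0, s1, s2, s3, s4, s5, s6, s7);
 */
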
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst    (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)

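/* Usage sketch (illustrative): a complete 16x8 byte block copy built from
 * the load/store families above:
 *
 *     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
 *     LD_UB8(src, src_stride, r0, r1, r2, r3, r4, r5, r6, r7);
 *     ST_UB8(r0, r1, r2, r3, r4, r5, r6, r7, dst, dst_stride);
 */
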
/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs - in      (source vector)
 *                      - pdst    (destination pointer to store to)
 *                      - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_H1(in, idx, pdst)                             \
{                                                        \
    uint16_t out0_m;                                     \
    out0_m = __msa_copy_u_h((v8i16) in, idx);            \
    SH(out0_m, (pdst));                                  \
}
#define ST_H2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint16_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    out2_m = __msa_copy_u_h((v8i16) in, idx2);           \
    out3_m = __msa_copy_u_h((v8i16) in, idx3);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
    SH(out2_m, (pdst) + 2 * stride);                     \
    SH(out3_m, (pdst) + 3 * stride);                     \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,               \
              idx6, idx7, pdst, stride)                             \
{                                                                   \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)                 \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride)  \
}

/* Description : Store word elements of vector with stride
 * Arguments   : Inputs - in      (source vector)
 *                      - pdst    (destination pointer to store to)
 *                      - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_W1(in, idx, pdst)                             \
{                                                        \
    uint32_t out0_m;                                     \
    out0_m = __msa_copy_u_w((v4i32) in, idx);            \
    SW(out0_m, (pdst));                                  \
}
#define ST_W2(in, idx0, idx1, pdst, stride)              \
{                                                        \
    uint32_t out0_m, out1_m;                             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    out2_m = __msa_copy_u_w((v4i32) in, idx2);           \
    out3_m = __msa_copy_u_w((v4i32) in, idx3);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
    SW(out2_m, (pdst) + 2 * stride);                     \
    SW(out3_m, (pdst) + 3 * stride);                     \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                       \
              idx4, idx5, idx6, idx7, pdst, stride)                   \
{                                                                     \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride)                  \
    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride)   \
}

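/* Usage sketch (illustrative, 'res' assumed to hold the packed result):
 * ST_W4 writes one 32-bit lane per row, the usual way to store a
 * 4-pixel-wide byte block:
 *
 *     ST_W4(res, 0, 1, 2, 3, dst, stride);  // rows 0..3, 4 bytes each
 */
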
/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs - in      (source vector)
 *                      - pdst    (destination pointer to store to)
 *                      - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_D1(in, idx, pdst)                                   \
{                                                              \
    uint64_t out0_m;                                           \
    out0_m = __msa_copy_u_d((v2i64) in, idx);                  \
    SD(out0_m, (pdst));                                        \
}
#define ST_D2(in, idx0, idx1, pdst, stride)                    \
{                                                              \
    uint64_t out0_m, out1_m;                                   \
    out0_m = __msa_copy_u_d((v2i64) in, idx0);                 \
    out1_m = __msa_copy_u_d((v2i64) in, idx1);                 \
    SD(out0_m, (pdst));                                        \
    SD(out1_m, (pdst) + stride);                               \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                              \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0);                \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1);                \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2);                \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3);                \
    SD(out0_m, (pdst));                                        \
    SD(out1_m, (pdst) + stride);                               \
    SD(out2_m, (pdst) + 2 * stride);                           \
    SD(out3_m, (pdst) + 3 * stride);                           \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,                 \
              idx4, idx5, idx6, idx7, pdst, stride)                       \
{                                                                         \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)                 \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride)  \
}

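/* Usage sketch (illustrative, 'res' assumed to hold two packed rows):
 * ST_D2 writes both 64-bit lanes of one vector to two consecutive rows,
 * handy for 8-pixel-wide blocks:
 *
 *     ST_D2(res, 0, 1, dst, stride);  // lane 0 -> row 0, lane 1 -> row 1
 */
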
/* Description : Store as 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m) followed by
                 index 2 word element from the same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similarly for the remaining rows
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m  = __msa_copy_u_w((v4i32) in0, 2);                            \
    out9_m  = __msa_copy_u_w((v4i32) in1, 2);                            \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}

/* Description : Average with rounding, (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from 'in0' is added to the corresponding
                 byte element from 'in1', plus 1 for rounding. The addition
                 is done unsigned at full precision, i.e. the result has one
                 extra bit. An unsigned division by 2 (logical shift right by
                 one bit) is performed before the result is written to 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)

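/* Usage sketch (illustrative): rounded byte averaging is the core of
 * bi-directional and half-pel motion compensation:
 *
 *     v16u8 avg0, avg1;
 *     AVER_UB2_UB(pred0, ref0, pred1, ref1, avg0, avg1);
 */
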
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'd' vector are slid into 's' by
                 the number of elements specified by 'slide_val'
*/
#define SLDI_B(RTYPE, d, s, slide_val, out)                       \
{                                                                 \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val);  \
}

#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
{                                                              \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                     \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                     \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,  \
                out0, out1, out2)                          \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B(RTYPE, d2, s2, slide_val, out2)                 \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,     \
                slide_val, out0, out1, out2, out3)         \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3)  \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)

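/* Usage sketch (illustrative): sliding against an all-zero 'd' vector
 * shifts 'src' down by 'slide_val' bytes and zero-fills the tail, a common
 * realignment step:
 *
 *     v16i8 zeros = { 0 }, out;
 *     SLDI_B(v16i8, zeros, src, 2, out);  // out[i] = src[i + 2], tail = 0
 */
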
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

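/* Usage sketch (illustrative, 'mask' assumed built per the MSA VSHF element
 * selection rules): VSHF_B2 applies one control mask per output, selecting
 * bytes from an (inX, inY) pair; horizontal filters typically build a mask
 * once and reuse it to gather overlapping byte pairs:
 *
 *     v16i8 pairs0, pairs1;
 *     VSHF_B2_SB(src0, src0, src1, src1, mask, mask, pairs0, pairs1);
 */
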
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The multiplication results of adjacent odd-even element
                 pairs are then added together and stored to the out vector
                 (two vectors of unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing a result
                 twice the size of the input, i.e. signed halfword.
                 The multiplication results of adjacent odd-even element
                 pairs are then added together and stored to the out vector
                 (two vectors of signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing a result
                 twice the size of the input, i.e. signed word.
                 The multiplication results of adjacent odd-even element
                 pairs are then added together and stored to the out vector
                 (two vectors of signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

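/* Usage sketch (illustrative, 'vec0'/'vec1' holding interleaved sample pairs
 * and 'filt' the matching coefficient pairs): each DOTP_* output is an
 * independent dot product of adjacent element pairs; longer filters chain
 * these with the accumulating DPADD_* variants below:
 *
 *     v8i16 sum0, sum1;
 *     DOTP_SB2_SH(vec0, vec1, filt, filt, sum0, sum1);
 */
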
/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied by
                 signed byte elements from 'cnst0', producing a result
                 twice the size of the input, i.e. signed halfword.
                 The multiplication results of adjacent odd-even element
                 pairs are then added to the out vector
                 (two vectors of signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied by
                 unsigned byte elements from 'cnst0', producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The multiplication results of adjacent odd-even element
                 pairs are then added to the out vector
                 (two vectors of unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied by
                 signed halfword elements from 'cnst0', producing a result
                 twice the size of the input, i.e. signed word.
                 The multiplication results of adjacent odd-even element
                 pairs are then added to the out vector
                 (two vectors of signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

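/* Usage sketch (illustrative, 'vecN'/'coefN' assumed caller-prepared): DOTP
 * starts an accumulator and DPADD folds in the remaining coefficient pairs,
 * e.g. a 4-tap vertical filter accumulated in words:
 *
 *     v4i32 acc0, acc1;
 *     DOTP_SH2_SW(vec0, vec1, coef0, coef0, acc0, acc1);   // taps 0/1
 *     DPADD_SH2_SW(vec2, vec3, coef1, coef1, acc0, acc1);  // + taps 2/3
 */
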
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : The minimum of the unsigned halfword element values from
                 'in0' and 'min_vec' is written to output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)               \
{                                                       \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in     (input vector)
                         - min    (min threshold)
                         - max    (max threshold)
                 Outputs - in     (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                     \
{                                                 \
    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
}

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in     (input vector)
                 Outputs - in     (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_h((v8i16) in, 0);         \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);  \
}

#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    CLIP_SH_0_255(in0);           \
    CLIP_SH_0_255(in1);           \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH4_0_255(in0, in1, in2, in3);     \
    CLIP_SH4_0_255(in4, in5, in6, in7);     \
}

/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in     (input vector)
                 Outputs - in     (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_w((v4i32) in, 0);         \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
}

#define CLIP_SW2_0_255(in0, in1)  \
{                                 \
    CLIP_SW_0_255(in0);           \
    CLIP_SW_0_255(in1);           \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW4_0_255(in0, in1, in2, in3);     \
    CLIP_SW4_0_255(in4, in5, in6, in7);     \
}

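/* Usage sketch (illustrative): after filtering at 16-bit precision, results
 * are clipped back to the 8-bit pixel range before packing to bytes:
 *
 *     CLIP_SH2_0_255(res0, res1);  // res0/res1 now hold values 0..255
 */
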
/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (signed word vector)
                 Outputs - sum_m    (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in       (unsigned halfword vector)
                 Outputs - sum_m    (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the halfword result is stored in 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is stored in 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each even unsigned byte element from 'in0' is subtracted
                 from the adjacent odd unsigned byte element (pairwise) and
                 the halfword result is stored in 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Differences)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m                (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute differences of all byte elements from 'in0'
                 with 'ref0' are calculated and preserved in 'diff0'. The 16
                 unsigned absolute-difference values are then added together
                 in even-odd pairs to generate 8 halfword results.
*/
#if HAVE_MSA2
#define SAD_UB2_UH(in0, in1, ref0, ref1)                                  \
( {                                                                       \
    v8u16 sad_m = { 0 };                                                  \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0);  \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1);  \
    sad_m;                                                                \
} )
#else
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
#endif  // #if HAVE_MSA2

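/* Usage sketch (illustrative): a 16x2 SAD reduces to a single scalar by
 * combining the SAD and horizontal-add helpers:
 *
 *     v8u16 sad_v = SAD_UB2_UH(src0, src1, ref0, ref1);
 *     uint32_t sad = HADD_UH_U32(sad_v);
 */
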
/* Description : Insert specified word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input vectors)
                 Outputs - out                (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Insert specified double word elements from input vectors to 1
                 destination vector
   Arguments   : Inputs  - in0, in1 (2 input vectors)
                 Outputs - out      (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)

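/* Usage sketch (illustrative): two unaligned 8-byte rows gathered into one
 * vector via the scalar LD plus INSERT_D2:
 *
 *     uint64_t row0 = LD(src);
 *     uint64_t row1 = LD(src + stride);
 *     v16u8 pix = { 0 };
 *     INSERT_D2_UB(row0, row1, pix);
 */
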
1188 /* Description : Interleave even byte elements from vectors
1189  Arguments : Inputs - in0, in1, in2, in3
1190  Outputs - out0, out1
1191  Return Type - as per RTYPE
1192  Details : Even byte elements of 'in0' and even byte
1193  elements of 'in1' are interleaved and copied to 'out0'
1194  Even byte elements of 'in2' and even byte
1195  elements of 'in3' are interleaved and copied to 'out1'
1196 */
1197 #define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1198 { \
1199  out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0); \
1200  out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2); \
1201 }
1202 #define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
1203 #define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
1204 #define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
1205 #define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)
1206 
1207 /* Description : Interleave even halfword elements from vectors
1208  Arguments : Inputs - in0, in1, in2, in3
1209  Outputs - out0, out1
1210  Return Type - as per RTYPE
1211  Details : Even halfword elements of 'in0' and even halfword
1212  elements of 'in1' are interleaved and copied to 'out0'
1213  Even halfword elements of 'in2' and even halfword
1214  elements of 'in3' are interleaved and copied to 'out1'
1215 */
1216 #define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1217 { \
1218  out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0); \
1219  out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2); \
1220 }
1221 #define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
1222 #define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
1223 #define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)
1224 
1225 /* Description : Interleave even word elements from vectors
1226  Arguments : Inputs - in0, in1, in2, in3
1227  Outputs - out0, out1
1228  Return Type - as per RTYPE
1229  Details : Even word elements of 'in0' and even word
1230  elements of 'in1' are interleaved and copied to 'out0'
1231  Even word elements of 'in2' and even word
1232  elements of 'in3' are interleaved and copied to 'out1'
1233 */
1234 #define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1235 { \
1236  out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0); \
1237  out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2); \
1238 }
1239 #define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
1240 #define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
1241 #define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
1242 #define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)
1243 
1244 /* Description : Interleave even double word elements from vectors
1245  Arguments : Inputs - in0, in1, in2, in3
1246  Outputs - out0, out1
1247  Return Type - as per RTYPE
1248  Details : Even double word elements of 'in0' and even double word
1249  elements of 'in1' are interleaved and copied to 'out0'
1250  Even double word elements of 'in2' and even double word
1251  elements of 'in3' are interleaved and copied to 'out1'
1252 */
1253 #define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1254 { \
1255  out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0); \
1256  out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2); \
1257 }
1258 #define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
1259 #define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
1260 #define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)
1261 
1262 /* Description : Interleave left half of byte elements from vectors
1263  Arguments : Inputs - in0, in1, in2, in3
1264  Outputs - out0, out1
1265  Return Type - as per RTYPE
1266  Details : Left half of byte elements of in0 and left half of byte
1267  elements of in1 are interleaved and copied to out0.
1268  Left half of byte elements of in2 and left half of byte
1269  elements of in3 are interleaved and copied to out1.
1270 */
1271 #define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1272 { \
1273  out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1274  out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3); \
1275 }
1276 #define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
1277 #define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
1278 #define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
1279 #define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)
1280 
1281 #define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1282  out0, out1, out2, out3) \
1283 { \
1284  ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1285  ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1286 }
1287 #define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
1288 #define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
1289 #define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
1290 #define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)
1291 
1292 /* Description : Interleave left half of halfword elements from vectors
1293  Arguments : Inputs - in0, in1, in2, in3
1294  Outputs - out0, out1
1295  Return Type - as per RTYPE
1296  Details : Left half of halfword elements of in0 and left half of halfword
1297  elements of in1 are interleaved and copied to out0.
1298  Left half of halfword elements of in2 and left half of halfword
1299  elements of in3 are interleaved and copied to out1.
1300 */
1301 #define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1302 { \
1303  out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1304  out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3); \
1305 }
1306 #define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
1307 #define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)
1308 
1309 #define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1310  out0, out1, out2, out3) \
1311 { \
1312  ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1313  ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1314 }
1315 #define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
1316 #define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)
1317 
1318 /* Description : Interleave left half of word elements from vectors
1319  Arguments : Inputs - in0, in1, in2, in3
1320  Outputs - out0, out1
1321  Return Type - as per RTYPE
1322  Details : Left half of word elements of in0 and left half of word
1323  elements of in1 are interleaved and copied to out0.
1324  Left half of word elements of in2 and left half of word
1325  elements of in3 are interleaved and copied to out1.
1326 */
1327 #define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1328 { \
1329  out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1330  out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3); \
1331 }
1332 #define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
1333 #define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
1334 #define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)
1335 
1336 /* Description : Interleave right half of byte elements from vectors
1337  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1338  Outputs - out0, out1, out2, out3
1339  Return Type - as per RTYPE
1340  Details : Right half of byte elements of in0 and right half of byte
1341  elements of in1 are interleaved and copied to out0.
1342  Right half of byte elements of in2 and right half of byte
1343  elements of in3 are interleaved and copied to out1.
1344  Similar for other pairs
1345 */
1346 #define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1347 { \
1348  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1349  out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3); \
1350 }
1351 #define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
1352 #define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
1353 #define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
1354 #define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
1355 #define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)
1356 
1357 #define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1358 { \
1359  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1360  out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5); \
1361 }
1362 #define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
1363 #define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
1364 #define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
1365 #define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)
1366 
1367 #define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1368  out0, out1, out2, out3) \
1369 { \
1370  ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1371  ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1372 }
1373 #define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
1374 #define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
1375 #define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
1376 #define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
1377 #define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)
1378 
1379 #define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1380  in8, in9, in10, in11, in12, in13, in14, in15, \
1381  out0, out1, out2, out3, out4, out5, out6, out7) \
1382 { \
1383  ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1384  out0, out1, out2, out3); \
1385  ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15, \
1386  out4, out5, out6, out7); \
1387 }
1388 #define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)
1389 #define ILVR_B8_SW(...) ILVR_B8(v4i32, __VA_ARGS__)
1390 
1391 /* Description : Interleave right half of halfword elements from vectors
1392  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1393  Outputs - out0, out1, out2, out3
1394  Return Type - as per RTYPE
1395  Details : Right half of halfword elements of in0 and right half of
1396  halfword elements of in1 are interleaved and copied to out0.
1397  Right half of halfword elements of in2 and right half of
1398  halfword elements of in3 are interleaved and copied to out1.
1399  Similar for other pairs
1400 */
1401 #define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1402 { \
1403  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1404  out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3); \
1405 }
1406 #define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
1407 #define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)
1408 
1409 #define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1410 { \
1411  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1412  out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5); \
1413 }
1414 #define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)
1415 
1416 #define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1417  out0, out1, out2, out3) \
1418 { \
1419  ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1420  ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1421 }
1422 #define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
1423 #define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)
1424 
1425 #define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1) \
1426 { \
1427  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1428  out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3); \
1429 }
1430 #define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
1431 #define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
1432 #define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)
1433 
1434 #define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1435  out0, out1, out2, out3) \
1436 { \
1437  ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1); \
1438  ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3); \
1439 }
1440 #define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
1441 #define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)
1442 
1443 /* Description : Interleave right half of double word elements from vectors
1444  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
1445  Outputs - out0, out1, out2, out3
1446  Return Type - as per RTYPE
1447  Details : Right half of double word elements of in0 and right half of
1448  double word elements of in1 are interleaved and copied to out0.
1449  Right half of double word elements of in2 and right half of
1450  double word elements of in3 are interleaved and copied to out1.
1451 */
1452 #define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1453 { \
1454  out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
1455  out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3); \
1456 }
1457 #define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
1458 #define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
1459 #define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)
1460 
1461 #define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1462 { \
1463  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1464  out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5); \
1465 }
1466 #define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)
1467 
1468 #define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1469  out0, out1, out2, out3) \
1470 { \
1471  ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1472  ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1473 }
1474 #define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
1475 #define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)
1476 
1477 /* Description : Interleave left half of double word elements from vectors
1478  Arguments : Inputs - in0, in1, in2, in3
1479  Outputs - out0, out1
1480  Return Type - as per RTYPE
1481  Details : Left half of double word elements of in0 and left half of
1482  double word elements of in1 are interleaved and copied to out0.
1483  Left half of double word elements of in2 and left half of
1484  double word elements of in3 are interleaved and copied to out1.
1485 */
1486 #define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1487 { \
1488  out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1); \
1489  out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3); \
1490 }
1491 #define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
1492 #define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
1493 #define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)
1494 
1495 /* Description : Interleave both left and right half of input vectors
1496  Arguments : Inputs - in0, in1
1497  Outputs - out0, out1
1498  Return Type - as per RTYPE
1499  Details : Right half of byte elements from 'in0' and 'in1' are
1500  interleaved and stored to 'out0'
1501  Left half of byte elements from 'in0' and 'in1' are
1502  interleaved and stored to 'out1'
1503 */
1504 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1505 { \
1506  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1507  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1508 }
1509 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1510 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1511 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1512 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1513 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1514 
1515 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1516 { \
1517  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1518  out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1519 }
1520 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
1521 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1522 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1523 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1524 
1525 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1526 { \
1527  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1528  out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1529 }
1530 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1531 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1532 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
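/* Editor's usage sketch (hypothetical names): ILVRL_B2 produces both halves
 * of a full byte interleave at once; paired with a zero vector it widens all
 * 16 bytes to halfwords, which is the pattern UNPCK_UB_SH below is built on:
 *
 *     v16i8 zero = { 0 };
 *     v16u8 src;
 *     v8i16 lo, hi;
 *     ILVRL_B2_SH(zero, src, lo, hi);
 *     // lo = src bytes 0..7 zero-extended, hi = src bytes 8..15
 */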
1533 
1534 /* Description : Maximum of signed halfword elements of a vector and a
1535  5-bit signed immediate value is written to the output vector
1536  Arguments : Inputs - in0, in1, in2, in3, max_val
1537  Outputs - in0, in1, in2, in3 (in place)
1538  Return Type - as per RTYPE
1539  Details : The maximum of each signed halfword element of 'in0' and
1540  'max_val' is written back to 'in0'
1541 */
1542 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1543 { \
1544  in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
1545  in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
1546 }
1547 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1548 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1549 
1550 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1551 { \
1552  MAXI_SH2(RTYPE, in0, in1, max_val); \
1553  MAXI_SH2(RTYPE, in2, in3, max_val); \
1554 }
1555 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1556 #define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1557 
1558 #define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
1559 { \
1560  MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
1561  MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
1562 }
1563 #define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
1564 #define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
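/* Editor's usage sketch (hypothetical names): clamping from below with a
 * constant, e.g. forcing halfword results to be non-negative:
 *
 *     v8i16 res0, res1;
 *     MAXI_SH2_SH(res0, res1, 0);   // each element becomes max(element, 0)
 */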
1565 
1566 /* Description : Saturate the halfword element values to the maximum
1567  unsigned value representable in (sat_val + 1) bits
1568  The element data width remains unchanged
1569  Arguments : Inputs - in0, in1, in2, in3, sat_val
1570  Outputs - in0, in1, in2, in3 (in place)
1571  Return Type - as per RTYPE
1572  Details : Each unsigned halfword element from 'in0' is saturated to the
1573  value generated with (sat_val + 1) bit range
1574  Results are written in place to the original vectors
1575 */
1576 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1577 { \
1578  in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1579  in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1580 }
1581 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1582 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1583 
1584 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1585 { \
1586  SAT_UH2(RTYPE, in0, in1, sat_val); \
1587  SAT_UH2(RTYPE, in2, in3, sat_val); \
1588 }
1589 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1590 #define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1591 
1592 #define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
1593 { \
1594  SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
1595  SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
1596 }
1597 #define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
1598 #define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
1599 
1600 /* Description : Saturate the halfword element values to the maximum
1601  signed value representable in (sat_val + 1) bits
1602  The element data width remains unchanged
1603  Arguments : Inputs - in0, in1, in2, in3, sat_val
1604  Outputs - in0, in1, in2, in3 (in place)
1605  Return Type - as per RTYPE
1606  Details : Each signed halfword element from 'in0' is saturated to the
1607  value generated with (sat_val + 1) bit range
1608  Results are written in place to the original vectors
1609 */
1610 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1611 { \
1612  in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1613  in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1614 }
1615 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1616 
1617 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1618 { \
1619  SAT_SH2(RTYPE, in0, in1, sat_val); \
1620  in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1621 }
1622 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1623 
1624 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1625 { \
1626  SAT_SH2(RTYPE, in0, in1, sat_val); \
1627  SAT_SH2(RTYPE, in2, in3, sat_val); \
1628 }
1629 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1630 
1631 /* Description : Saturate the word element values to the maximum
1632  signed value representable in (sat_val + 1) bits
1633  The element data width remains unchanged
1634  Arguments : Inputs - in0, in1, in2, in3, sat_val
1635  Outputs - in0, in1, in2, in3 (in place)
1636  Return Type - as per RTYPE
1637  Details : Each signed word element from 'in0' is saturated to the
1638  value generated with (sat_val + 1) bit range
1639  Results are written in place to the original vectors
1640 */
1641 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1642 { \
1643  in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1644  in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1645 }
1646 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1647 
1648 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1649 { \
1650  SAT_SW2(RTYPE, in0, in1, sat_val); \
1651  SAT_SW2(RTYPE, in2, in3, sat_val); \
1652 }
1653 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
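/* Editor's note with a worked example: __msa_sat_u_h(x, n) clamps each
 * unsigned halfword to [0, (1 << (n + 1)) - 1] and __msa_sat_s_h(x, n) clamps
 * each signed halfword to [-(1 << n), (1 << n) - 1]. So, with hypothetical
 * names:
 *
 *     v8u16 v0, v1;
 *     SAT_UH2_UH(v0, v1, 7);   // clamp every element to the byte range 0..255
 */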
1654 
1655 /* Description : Indexed halfword element values are replicated to all
1656  elements in output vector
1657  Arguments : Inputs - in, idx0, idx1
1658  Outputs - out0, out1
1659  Return Type - as per RTYPE
1660  Details : 'idx0' element value from 'in' vector is replicated to all
1661  elements in 'out0' vector
1662  Valid index range for halfword operation is 0-7
1663 */
1664 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1665 { \
1666  out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1667  out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1668 }
1669 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1670 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1671 
1672 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1673  out0, out1, out2) \
1674 { \
1675  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1676  out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1677 }
1678 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1679 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1680 
1681 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1682  out0, out1, out2, out3) \
1683 { \
1684  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1685  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1686 }
1687 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1688 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
1689 
1690 /* Description : Indexed word element values are replicated to all
1691  elements in output vector
1692  Arguments : Inputs - in, stidx
1693  Outputs - out0, out1
1694  Return Type - as per RTYPE
1695  Details : 'stidx' element value from 'in' vector is replicated to all
1696  elements in 'out0' vector
1697  'stidx + 1' element value from 'in' vector is replicated to all
1698  elements in 'out1' vector
1699  Valid index range for word operation is 0-3
1700 */
1701 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1702 { \
1703  out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1704  out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1705 }
1706 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1707 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1708 
1709 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1710 { \
1711  SPLATI_W2(RTYPE, in, 0, out0, out1); \
1712  SPLATI_W2(RTYPE, in, 2, out2, out3); \
1713 }
1714 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1715 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
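/* Editor's usage sketch (hypothetical names): broadcasting filter taps from
 * one coefficient vector so each tap can multiply whole vectors:
 *
 *     v8i16 filt, tap0, tap1, tap2, tap3;
 *     SPLATI_H4_SH(filt, 0, 1, 2, 3, tap0, tap1, tap2, tap3);
 *     // tap0 = { filt[0] x 8 }, tap1 = { filt[1] x 8 }, ...
 */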
1716 
1717 /* Description : Pack even byte elements of vector pairs
1718  Arguments : Inputs - in0, in1, in2, in3
1719  Outputs - out0, out1
1720  Return Type - as per RTYPE
1721  Details : Even byte elements of in0 are copied to the left half of
1722  out0 & even byte elements of in1 are copied to the right
1723  half of out0.
1724  Even byte elements of in2 are copied to the left half of
1725  out1 & even byte elements of in3 are copied to the right
1726  half of out1.
1727 */
1728 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1729 { \
1730  out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1731  out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1732 }
1733 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1734 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1735 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1736 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1737 
1738 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1739 { \
1740  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1741  out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1742 }
1743 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1744 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1745 
1746 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1747  out0, out1, out2, out3) \
1748 { \
1749  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1750  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1751 }
1752 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1753 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1754 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1755 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
1756 
1757 /* Description : Pack even halfword elements of vector pairs
1758  Arguments : Inputs - in0, in1, in2, in3
1759  Outputs - out0, out1
1760  Return Type - as per RTYPE
1761  Details : Even halfword elements of in0 are copied to the left half of
1762  out0 & even halfword elements of in1 are copied to the right
1763  half of out0.
1764  Even halfword elements of in2 are copied to the left half of
1765  out1 & even halfword elements of in3 are copied to the right
1766  half of out1.
1767 */
1768 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1769 { \
1770  out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1771  out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1772 }
1773 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1774 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1775 
1776 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1777  out0, out1, out2, out3) \
1778 { \
1779  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1780  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1781 }
1782 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1783 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1784 
1785 /* Description : Pack even double word elements of vector pairs
1786  Arguments : Inputs - in0, in1, in2, in3
1787  Outputs - out0, out1
1788  Return Type - as per RTYPE
1789  Details : Even double word elements of in0 are copied to the left half
1790  of out0 & even double word elements of in1 are copied to the
1791  right half of out0.
1792  Even double word elements of in2 are copied to the left half
1793  of out1 & even double word elements of in3 are copied to the
1794  right half of out1.
1795 */
1796 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1797 { \
1798  out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1799  out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1800 }
1801 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1802 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1803 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1804 
1805 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1806  out0, out1, out2, out3) \
1807 { \
1808  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1809  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1810 }
1811 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
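/* Editor's usage sketch (hypothetical names): PCKEV_B is the usual way to
 * narrow halfword results back to bytes, since on a little-endian target the
 * even byte of each halfword is its low byte:
 *
 *     v8i16 res0, res1;
 *     v16u8 out;
 *     out = (v16u8) __msa_pckev_b((v16i8) res1, (v16i8) res0);
 *     // out bytes 0..7 come from res0, bytes 8..15 from res1
 */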
1812 
1813 /* Description : Pack odd double word elements of vector pairs
1814  Arguments : Inputs - in0, in1, in2, in3
1815  Outputs - out0, out1
1816  Return Type - as per RTYPE
1817  Details : Odd (index 1) double word elements of 'in0' and 'in1' are
1818  packed together and written to out0
1819  Odd (index 1) double word elements of 'in2' and 'in3' are
1820  packed together and written to out1
1821 */
1822 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1823 { \
1824  out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1825  out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1826 }
1827 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1828 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1829 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1830 
1831 /* Description : Each byte element is logically xor'ed with immediate 128
1832  Arguments : Inputs - in0, in1
1833  Outputs - in0, in1 (in-place)
1834  Return Type - as per RTYPE
1835  Details : Each unsigned byte element from input vector 'in0' is
1836  logically xor'ed with 128 and result is in-place stored in
1837  'in0' vector
1838  Each unsigned byte element from input vector 'in1' is
1839  logically xor'ed with 128 and result is in-place stored in
1840  'in1' vector
1841  Similar for other pairs
1842 */
1843 #define XORI_B2_128(RTYPE, in0, in1) \
1844 { \
1845  in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
1846  in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
1847 }
1848 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1849 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1850 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1851 
1852 #define XORI_B3_128(RTYPE, in0, in1, in2) \
1853 { \
1854  XORI_B2_128(RTYPE, in0, in1); \
1855  in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \
1856 }
1857 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1858 
1859 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
1860 { \
1861  XORI_B2_128(RTYPE, in0, in1); \
1862  XORI_B2_128(RTYPE, in2, in3); \
1863 }
1864 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
1865 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1866 #define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
1867 
1868 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
1869 { \
1870  XORI_B3_128(RTYPE, in0, in1, in2); \
1871  XORI_B2_128(RTYPE, in3, in4); \
1872 }
1873 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1874 
1875 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
1876 { \
1877  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1878  XORI_B2_128(RTYPE, in4, in5); \
1879 }
1880 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
1881 
1882 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
1883 { \
1884  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1885  XORI_B3_128(RTYPE, in4, in5, in6); \
1886 }
1887 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1888 
1889 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
1890 { \
1891  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1892  XORI_B4_128(RTYPE, in4, in5, in6, in7); \
1893 }
1894 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1895 #define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
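/* Editor's note: xori with 128 simply flips the MSB of every byte, mapping
 * unsigned pixels [0..255] to signed [-128..127]; applying it a second time
 * undoes the mapping. Hypothetical flow around a signed 8-bit filter:
 *
 *     v16i8 src0, src1;
 *     XORI_B2_128_SB(src0, src1);   // to signed range for signed dot products
 *     // ... signed filtering ...
 *     XORI_B2_128_SB(src0, src1);   // flip the MSBs back
 */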
1896 
1897 /* Description : Addition of signed halfword elements and signed saturation
1898  Arguments : Inputs - in0, in1, in2, in3
1899  Outputs - out0, out1
1900  Return Type - as per RTYPE
1901  Details : Signed halfword elements from 'in0' are added to signed
1902  halfword elements of 'in1'. The result is then signed saturated
1903  between -32768 and +32767 (as per halfword data type)
1904  Similar for other pairs
1905 */
1906 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
1907 { \
1908  out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
1909  out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
1910 }
1911 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1912 
1913 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1914  out0, out1, out2, out3) \
1915 { \
1916  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
1917  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
1918 }
1919 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
1920 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1921 
1922 /* Description : Shift left all elements of vector (generic for all data types)
1923  Arguments : Inputs - in0, in1, in2, in3, shift
1924  Outputs - in0, in1, in2, in3 (in place)
1925  Return Type - as per input vector RTYPE
1926  Details : Each element of vector 'in0' is left shifted by 'shift' and
1927  the result is written in place to 'in0'
1928  Similar for other pairs
1929 */
1930 #define SLLI_2V(in0, in1, shift) \
1931 { \
1932  in0 = in0 << shift; \
1933  in1 = in1 << shift; \
1934 }
1935 #define SLLI_4V(in0, in1, in2, in3, shift) \
1936 { \
1937  in0 = in0 << shift; \
1938  in1 = in1 << shift; \
1939  in2 = in2 << shift; \
1940  in3 = in3 << shift; \
1941 }
1942 
1943 /* Description : Arithmetic shift right all elements of vector
1944  (generic for all data types)
1945  Arguments : Inputs - in0, in1, in2, in3, shift
1946  Outputs - in0, in1, in2, in3 (in place)
1947  Return Type - as per input vector RTYPE
1948  Details : Each element of vector 'in0' is right shifted by 'shift' and
1949  the result is written in place to 'in0'
1950  Here, 'shift' is a GP variable passed in
1951  Similar for other pairs
1952 */
1953 #define SRA_4V(in0, in1, in2, in3, shift) \
1954 { \
1955  in0 = in0 >> shift; \
1956  in1 = in1 >> shift; \
1957  in2 = in2 >> shift; \
1958  in3 = in3 >> shift; \
1959 }
1960 
1961 /* Description : Shift right logical all halfword elements of vector
1962  Arguments : Inputs - in0, in1, in2, in3, shift
1963  Outputs - in0, in1, in2, in3 (in place)
1964  Return Type - as per RTYPE
1965  Details : Each element of vector 'in0' is shifted right logically by
1966  the number of bits held in the respective element of vector
1967  'shift' and the result is written in place to 'in0'
1968  Here, 'shift' is a vector passed in
1969  Similar for other pairs
1970 */
1971 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
1972 { \
1973  in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \
1974  in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \
1975  in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \
1976  in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \
1977 }
1978 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1979 
1980 #define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
1981 { \
1982  in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift); \
1983  in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift); \
1984  in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift); \
1985  in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift); \
1986 }
1987 #define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
1988 #define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
1989 
1990 #define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
1991 { \
1992  SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
1993  SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
1994 }
1995 #define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
1996 #define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1997 
1998 /* Description : Shift right arithmetic rounded halfwords
1999  Arguments : Inputs - in0, in1, shift
2000  Outputs - in0, in1, (in place)
2001  Return Type - as per RTYPE
2002  Details : Each element of vector 'in0' is shifted right arithmetically
2003  by the number of bits held in the respective element of 'shift'.
2004  The last discarded bit is added to the shifted value for
2005  rounding and the result is written in place to 'in0'
2006  Here, 'shift' is a vector passed in
2007  Similar for other pairs
2008 */
2009 #define SRAR_H2(RTYPE, in0, in1, shift) \
2010 { \
2011  in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
2012  in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
2013 }
2014 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
2015 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2016 
2017 #define SRAR_H3(RTYPE, in0, in1, in2, shift) \
2018 { \
2019  SRAR_H2(RTYPE, in0, in1, shift); \
2020  in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
2021 }
2022 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2023 
2024 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
2025 { \
2026  SRAR_H2(RTYPE, in0, in1, shift); \
2027  SRAR_H2(RTYPE, in2, in3, shift); \
2028 }
2029 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
2030 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2031 
2032 /* Description : Shift right arithmetic rounded words
2033  Arguments : Inputs - in0, in1, shift
2034  Outputs - in0, in1, (in place)
2035  Return Type - as per RTYPE
2036  Details : Each element of vector 'in0' is shifted right arithmetically
2037  by the number of bits held in the respective element of 'shift'.
2038  The last discarded bit is added to the shifted value for
2039  rounding and the result is written in place to 'in0'
2040  Here, 'shift' is a vector passed in
2041  Similar for other pairs
2042 */
2043 #define SRAR_W2(RTYPE, in0, in1, shift) \
2044 { \
2045  in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
2046  in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
2047 }
2048 #define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2049 
2050 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
2051 { \
2052  SRAR_W2(RTYPE, in0, in1, shift); \
2053  SRAR_W2(RTYPE, in2, in3, shift); \
2054 }
2055 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2056 
2057 /* Description : Shift right arithmetic rounded (immediate)
2058  Arguments : Inputs - in0, in1, in2, in3, shift
2059  Outputs - in0, in1, in2, in3 (in place)
2060  Return Type - as per RTYPE
2061  Details : Each element of vector 'in0' is shifted right arithmetically
2062  by the immediate value 'shift'.
2063  The last discarded bit is added to the shifted value for
2064  rounding and the result is written in place to 'in0'
2065  Similar for other pairs
2066 */
2067 #define SRARI_H2(RTYPE, in0, in1, shift) \
2068 { \
2069  in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
2070  in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
2071 }
2072 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
2073 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2074 
2075 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
2076 { \
2077  SRARI_H2(RTYPE, in0, in1, shift); \
2078  SRARI_H2(RTYPE, in2, in3, shift); \
2079 }
2080 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
2081 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2082 
2083 /* Description : Shift right arithmetic rounded (immediate)
2084  Arguments : Inputs - in0, in1, shift
2085  Outputs - in0, in1 (in place)
2086  Return Type - as per RTYPE
2087  Details : Each element of vector 'in0' is shifted right arithmetically
2088  by the immediate value 'shift'.
2089  The last discarded bit is added to the shifted value for
2090  rounding and the result is written in place to 'in0'
2091  Similar for other pairs
2092 */
2093 #define SRARI_W2(RTYPE, in0, in1, shift) \
2094 { \
2095  in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
2096  in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
2097 }
2098 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2099 
2100 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
2101 { \
2102  SRARI_W2(RTYPE, in0, in1, shift); \
2103  SRARI_W2(RTYPE, in2, in3, shift); \
2104 }
2105 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
2106 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
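/* Editor's note with a worked example: for shift > 0 the rounded right
 * shifts compute (x + (1 << (shift - 1))) >> shift, e.g. srari(14, 2) gives
 * (14 + 2) >> 2 = 4 while a plain arithmetic shift gives 14 >> 2 = 3.
 */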
2107 
2108 /* Description : Multiplication of pairs of vectors
2109  Arguments : Inputs - in0, in1, in2, in3
2110  Outputs - out0, out1
2111  Details : Each element from 'in0' is multiplied by the corresponding
2112  element from 'in1' and the result is written to 'out0'
2113  Similar for other pairs
2114 */
2115 #define MUL2(in0, in1, in2, in3, out0, out1) \
2116 { \
2117  out0 = in0 * in1; \
2118  out1 = in2 * in3; \
2119 }
2120 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2121 { \
2122  MUL2(in0, in1, in2, in3, out0, out1); \
2123  MUL2(in4, in5, in6, in7, out2, out3); \
2124 }
2125 
2126 /* Description : Addition of 2 pairs of vectors
2127  Arguments : Inputs - in0, in1, in2, in3
2128  Outputs - out0, out1
2129  Details : Corresponding elements of the two input vector pairs are
2130  added and two result vectors are produced
2131 */
2132 #define ADD2(in0, in1, in2, in3, out0, out1) \
2133 { \
2134  out0 = in0 + in1; \
2135  out1 = in2 + in3; \
2136 }
2137 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2138 { \
2139  ADD2(in0, in1, in2, in3, out0, out1); \
2140  ADD2(in4, in5, in6, in7, out2, out3); \
2141 }
2142 
2143 /* Description : Subtraction of 2 pairs of vectors
2144  Arguments : Inputs - in0, in1, in2, in3
2145  Outputs - out0, out1
2146  Details : Corresponding elements of the two input vector pairs are
2147  subtracted and two result vectors are produced
2148 */
2149 #define SUB2(in0, in1, in2, in3, out0, out1) \
2150 { \
2151  out0 = in0 - in1; \
2152  out1 = in2 - in3; \
2153 }
2154 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2155 { \
2156  out0 = in0 - in1; \
2157  out1 = in2 - in3; \
2158  out2 = in4 - in5; \
2159  out3 = in6 - in7; \
2160 }
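/* Editor's note: MUL2/ADD2/SUB2 use plain GCC vector operators, so they are
 * element-wise for whatever vector type the arguments have. Hypothetical use:
 *
 *     v8i16 a0, a1, b0, b1, s0, s1, d0, d1;
 *     ADD2(a0, b0, a1, b1, s0, s1);   // s0 = a0 + b0, s1 = a1 + b1
 *     SUB2(a0, b0, a1, b1, d0, d1);   // d0 = a0 - b0, d1 = a1 - b1
 */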
2161 
2162 /* Description : Sign extend byte elements from right half of the vector
2163  Arguments : Input - in (byte vector)
2164  Output - out (sign extended halfword vector)
2165  Return Type - signed halfword
2166  Details : Sign bit of byte elements from input vector 'in' is
2167  extracted and interleaved with same vector 'in' to generate
2168  8 halfword elements keeping sign intact
2169 */
2170 #define UNPCK_R_SB_SH(in, out) \
2171 { \
2172  v16i8 sign_m; \
2173  \
2174  sign_m = __msa_clti_s_b((v16i8) in, 0); \
2175  out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in); \
2176 }
2177 
2178 /* Description : Sign extend halfword elements from right half of the vector
2179  Arguments : Inputs - in (input halfword vector)
2180  Outputs - out (sign extended word vectors)
2181  Return Type - signed word
2182  Details : Sign bit of halfword elements from input vector 'in' is
2183  extracted and interleaved with the same vector 'in' to generate
2184  4 word elements keeping sign intact
2185 */
2186 #if HAVE_MSA2
2187 #define UNPCK_R_SH_SW(in, out) \
2188 { \
2189  out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
2190 }
2191 #else
2192 #define UNPCK_R_SH_SW(in, out) \
2193 { \
2194  v8i16 sign_m; \
2195  \
2196  sign_m = __msa_clti_s_h((v8i16) in, 0); \
2197  out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
2198 }
2199 #endif // #if HAVE_MSA2
2200 
2201 /* Description : Sign extend byte elements from input vector and return
2202  halfword results in pair of vectors
2203  Arguments : Inputs - in (1 input byte vector)
2204  Outputs - out0, out1 (sign extended 2 halfword vectors)
2205  Return Type - signed halfword
2206  Details : Sign bit of byte elements from input vector 'in' is
2207  extracted and interleaved right with the same vector 'in' to
2208  generate 8 signed halfword elements in 'out0'
2209  Then interleaved left with the same vector 'in' to
2210  generate 8 signed halfword elements in 'out1'
2211 */
2212 #if HAVE_MSA2
2213 #define UNPCK_SB_SH(in, out0, out1) \
2214 { \
2215  out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
2216  out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
2217 }
2218 #else
2219 #define UNPCK_SB_SH(in, out0, out1) \
2220 { \
2221  v16i8 tmp_m; \
2222  \
2223  tmp_m = __msa_clti_s_b((v16i8) in, 0); \
2224  ILVRL_B2_SH(tmp_m, in, out0, out1); \
2225 }
2226 #endif // #if HAVE_MSA2
2227 
2228 /* Description : Zero extend unsigned byte elements to halfword elements
2229  Arguments : Inputs - in (1 input unsigned byte vector)
2230  Outputs - out0, out1 (unsigned 2 halfword vectors)
2231  Return Type - signed halfword
2232  Details : Zero extended right half of vector is returned in 'out0'
2233  Zero extended left half of vector is returned in 'out1'
2234 */
2235 #define UNPCK_UB_SH(in, out0, out1) \
2236 { \
2237  v16i8 zero_m = { 0 }; \
2238  \
2239  ILVRL_B2_SH(zero_m, in, out0, out1); \
2240 }
2241 
2242 /* Description : Sign extend halfword elements from input vector and return
2243  result in pair of vectors
2244  Arguments : Inputs - in (1 input halfword vector)
2245  Outputs - out0, out1 (sign extended 2 word vectors)
2246  Return Type - signed word
2247  Details : Sign bit of halfword elements from input vector 'in' is
2248  extracted and interleaved right with the same vector 'in' to
2249  generate 4 signed word elements in 'out0'
2250  Then interleaved left with the same vector 'in' to
2251  generate 4 signed word elements in 'out1'
2252 */
2253 #if HAVE_MSA2
2254 #define UNPCK_SH_SW(in, out0, out1) \
2255 { \
2256  out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
2257  out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
2258 }
2259 #else
2260 #define UNPCK_SH_SW(in, out0, out1) \
2261 { \
2262  v8i16 tmp_m; \
2263  \
2264  tmp_m = __msa_clti_s_h((v8i16) in, 0); \
2265  ILVRL_H2_SW(tmp_m, in, out0, out1); \
2266 }
2267 #endif // #if HAVE_MSA2
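/* Editor's usage sketch (hypothetical names): widening halfwords to words
 * before accumulating in 32 bits:
 *
 *     v8i16 in;
 *     v4i32 lo, hi;
 *     UNPCK_SH_SW(in, lo, hi);
 *     // lo = sign-extended in[0..3], hi = sign-extended in[4..7]
 */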
2268 
2269 /* Description : Swap two variables
2270  Arguments : Inputs - in0, in1
2271  Outputs - in0, in1 (in-place)
2272  Details : Swapping of two input variables using xor
2273 */
2274 #define SWAP(in0, in1) \
2275 { \
2276  in0 = in0 ^ in1; \
2277  in1 = in0 ^ in1; \
2278  in0 = in0 ^ in1; \
2279 }
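/* Editor's note: as with any xor swap, SWAP zeroes both values if the two
 * arguments alias the same variable, so they must be distinct.
 */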
2280 
2281 /* Description : Butterfly of 4 input vectors
2282  Arguments : Inputs - in0, in1, in2, in3
2283  Outputs - out0, out1, out2, out3
2284  Details : Butterfly operation
2285 */
2286 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
2287 { \
2288  out0 = in0 + in3; \
2289  out1 = in1 + in2; \
2290  \
2291  out2 = in1 - in2; \
2292  out3 = in0 - in3; \
2293 }
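/* Editor's note: this is the classic add/subtract stage of 4-point
 * transforms. With hypothetical rows a, b, c, d:
 *
 *     BUTTERFLY_4(a, b, c, d, s0, s1, d1, d0);
 *     // s0 = a + d, s1 = b + c, d1 = b - c, d0 = a - d
 */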
2294 
2295 /* Description : Butterfly of 8 input vectors
2296  Arguments : Inputs - in0 ... in7
2297  Outputs - out0 .. out7
2298  Details : Butterfly operation
2299 */
2300 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
2301  out0, out1, out2, out3, out4, out5, out6, out7) \
2302 { \
2303  out0 = in0 + in7; \
2304  out1 = in1 + in6; \
2305  out2 = in2 + in5; \
2306  out3 = in3 + in4; \
2307  \
2308  out4 = in3 - in4; \
2309  out5 = in2 - in5; \
2310  out6 = in1 - in6; \
2311  out7 = in0 - in7; \
2312 }
2313 
2314 /* Description : Butterfly of 16 input vectors
2315  Arguments : Inputs - in0 ... in15
2316  Outputs - out0 .. out15
2317  Details : Butterfly operation
2318 */
2319 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
2320  in8, in9, in10, in11, in12, in13, in14, in15, \
2321  out0, out1, out2, out3, out4, out5, out6, out7, \
2322  out8, out9, out10, out11, out12, out13, out14, out15) \
2323 { \
2324  out0 = in0 + in15; \
2325  out1 = in1 + in14; \
2326  out2 = in2 + in13; \
2327  out3 = in3 + in12; \
2328  out4 = in4 + in11; \
2329  out5 = in5 + in10; \
2330  out6 = in6 + in9; \
2331  out7 = in7 + in8; \
2332  \
2333  out8 = in7 - in8; \
2334  out9 = in6 - in9; \
2335  out10 = in5 - in10; \
2336  out11 = in4 - in11; \
2337  out12 = in3 - in12; \
2338  out13 = in2 - in13; \
2339  out14 = in1 - in14; \
2340  out15 = in0 - in15; \
2341 }
2342 
2343 /* Description : Transposes input 4x4 byte block
2344  Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
2345  Outputs - out0, out1, out2, out3 (output 4x4 byte block)
2346  Return Type - unsigned byte
2347  Details :
2348 */
2349 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
2350 { \
2351  v16i8 zero_m = { 0 }; \
2352  v16i8 s0_m, s1_m, s2_m, s3_m; \
2353  \
2354  ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
2355  ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
2356  \
2357  out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
2358  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
2359  out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
2360  out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
2361 }
2362 
2363 /* Description : Transposes input 8x4 byte block into 4x8
2364  Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block)
2365  Outputs - out0, out1, out2, out3 (output 4x8 byte block)
2366  Return Type - as per RTYPE
2367  Details :
2368 */
2369 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2370  out0, out1, out2, out3) \
2371 { \
2372  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2373  \
2374  ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
2375  tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2376  ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
2377  \
2378  tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2379  ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
2380  \
2381  ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
2382  out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
2383  out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2384 }
2385 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2386 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2387 
2388 /* Description : Transposes input 8x8 byte block
2389  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2390  (input 8x8 byte block)
2391  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2392  (output 8x8 byte block)
2393  Return Type - as per RTYPE
2394  Details :
2395 */
2396 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2397  out0, out1, out2, out3, out4, out5, out6, out7) \
2398 { \
2399  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2400  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2401  v16i8 zeros = { 0 }; \
2402  \
2403  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
2404  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2405  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
2406  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
2407  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
2408  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
2409  SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \
2410  8, out1, out3, out5, out7); \
2411 }
2412 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2413 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
2414 
2415 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2416  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2417  in8, in9, in10, in11, in12, in13, in14, in15
2418  Outputs - out0, out1, out2, out3
2419  Return Type - unsigned byte
2420  Details :
2421 */
2422 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2423  in8, in9, in10, in11, in12, in13, in14, in15, \
2424  out0, out1, out2, out3) \
2425 { \
2426  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2427  \
2428  ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
2429  out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2430  \
2431  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
2432  out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2433  \
2434  ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
2435  \
2436  tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2437  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
2438  \
2439  tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2440  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
2441  out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2442  out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2443  \
2444  tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
2445  tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
2446  out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2447  out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2448 }
2449 
2450 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2451  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2452  in8, in9, in10, in11, in12, in13, in14, in15
2453  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2454  Return Type - unsigned byte
2455  Details :
2456 */
2457 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2458  in8, in9, in10, in11, in12, in13, in14, in15, \
2459  out0, out1, out2, out3, out4, out5, out6, out7) \
2460 { \
2461  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2462  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2463  \
2464  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2465  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2466  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2467  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2468  \
2469  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2470  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2471  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2472  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2473  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2474  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2475  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2476  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2477  \
2478  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2479  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2480  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2481  \
2482  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2483  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2484  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2485  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2486  \
2487  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2488  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2489  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2490  \
2491  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2492  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2493  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2494  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2495 }
2496 
2497 /* Description : Transposes 4x4 block with halfword elements in vectors
2498  Arguments : Inputs - in0, in1, in2, in3
2499  Outputs - out0, out1, out2, out3
2500  Return Type - signed halfword
2501  Details :
2502 */
2503 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2504 { \
2505  v8i16 s0_m, s1_m; \
2506  \
2507  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2508  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2509  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
2510  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2511 }
2512 
2513 /* Description : Transposes 8x8 block with halfword elements in vectors
2514  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2515  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2516  Return Type - as per RTYPE
2517  Details :
2518 */
2519 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2520  out0, out1, out2, out3, out4, out5, out6, out7) \
2521 { \
2522  v8i16 s0_m, s1_m; \
2523  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2524  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2525  \
2526  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2527  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2528  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2529  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2530  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2531  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2532  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2533  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2534  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2535  tmp3_m, tmp7_m, out0, out2, out4, out6); \
2536  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
2537  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2538  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2539  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2540 }
2541 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2542 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2543 
2544 /* Description : Transposes 4x4 block with word elements in vectors
2545  Arguments : Inputs - in0, in1, in2, in3
2546  Outputs - out0, out1, out2, out3
2547  Return Type - signed word
2548  Details :
2549 */
2550 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2551 { \
2552  v4i32 s0_m, s1_m, s2_m, s3_m; \
2553  \
2554  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2555  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2556  \
2557  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2558  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2559  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2560  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2561 }
2562 
2563 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2564  block in destination memory
2565  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2566  Details : Each byte element from input vector pair 'in0' and 'in1' is
2567  averaged (a + b)/2 and stored in 'tmp0_m'
2568  Each byte element from input vector pair 'in2' and 'in3' is
2569  averaged (a + b)/2 and stored in 'tmp1_m'
2570  Each byte element from input vector pair 'in4' and 'in5' is
2571  averaged (a + b)/2 and stored in 'tmp2_m'
2572  Each byte element from input vector pair 'in6' and 'in7' is
2573  averaged (a + b)/2 and stored in 'tmp3_m'
2574  The half vector results from all 4 vectors are stored in
2575  destination memory as 8x4 byte block
2576 */
2577 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2578 { \
2579  uint64_t out0_m, out1_m, out2_m, out3_m; \
2580  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2581  \
2582  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2583  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2584  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2585  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2586  \
2587  out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2588  out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2589  out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2590  out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2591  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2592 }
2593 
2594 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2595  block in destination memory
2596  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2597  Details : Each byte element from input vector pair 'in0' and 'in1' is
2598  averaged (a + b)/2 and stored in 'tmp0_m'
2599  Each byte element from input vector pair 'in2' and 'in3' is
2600  averaged (a + b)/2 and stored in 'tmp1_m'
2601  Each byte element from input vector pair 'in4' and 'in5' is
2602  averaged (a + b)/2 and stored in 'tmp2_m'
2603  Each byte element from input vector pair 'in6' and 'in7' is
2604  averaged (a + b)/2 and stored in 'tmp3_m'
2605  The results from all 4 vectors are stored in destination
2606  memory as 16x4 byte block
2607 */
2608 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2609 { \
2610  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2611  \
2612  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2613  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2614  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2615  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2616  \
2617  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2618 }
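/* Editor's note with a worked example: AVE_* truncates while the AVER_*
 * macros below round up, e.g. for bytes 5 and 6, ave gives (5 + 6) / 2 = 5
 * but aver gives (5 + 6 + 1) / 2 = 6.
 */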
2619 
2620 /* Description : Average rounded byte elements from pair of vectors and store
2621  8x4 byte block in destination memory
2622  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2623  Details : Each byte element from input vector pair 'in0' and 'in1' is
2624  average rounded (a + b + 1)/2 and stored in 'tp0_m'
2625  Each byte element from input vector pair 'in2' and 'in3' is
2626  average rounded (a + b + 1)/2 and stored in 'tp1_m'
2627  Each byte element from input vector pair 'in4' and 'in5' is
2628  average rounded (a + b + 1)/2 and stored in 'tp2_m'
2629  Each byte element from input vector pair 'in6' and 'in7' is
2630  average rounded (a + b + 1)/2 and stored in 'tp3_m'
2631  The half vector results from all 4 vectors are stored in
2632  destination memory as 8x4 byte block
2633 */
2634 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2635 { \
2636  uint64_t out0_m, out1_m, out2_m, out3_m; \
2637  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2638  \
2639  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2640  tp0_m, tp1_m, tp2_m, tp3_m); \
2641  \
2642  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2643  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2644  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2645  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2646  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2647 }
2648 
2649 /* Description : Average rounded byte elements from pair of vectors and store
2650  16x4 byte block in destination memory
2651  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2652  Details : Each byte element from input vector pair 'in0' and 'in1' is
2653  average rounded (a + b + 1)/2 and stored in 't0_m'
2654  Each byte element from input vector pair 'in2' and 'in3' is
2655  average rounded (a + b + 1)/2 and stored in 't1_m'
2656  Each byte element from input vector pair 'in4' and 'in5' is
2657  average rounded (a + b + 1)/2 and stored in 't2_m'
2658  Each byte element from input vector pair 'in6' and 'in7' is
2659  average rounded (a + b + 1)/2 and stored in 't3_m'
2660  The vector results from all 4 vectors are stored in
2661  destination memory as 16x4 byte block
2662 */
2663 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2664 { \
2665  v16u8 t0_m, t1_m, t2_m, t3_m; \
2666  \
2667  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2668  t0_m, t1_m, t2_m, t3_m); \
2669  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2670 }
2671 
2672 /* Description : Average rounded byte elements from pair of vectors,
2673  average rounded with destination and store 8x4 byte block
2674  in destination memory
2675  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2676  Details : Each byte element from input vector pair 'in0' and 'in1' is
2677  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2678  Each byte element from input vector pair 'in2' and 'in3' is
2679  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2680  Each byte element from input vector pair 'in4' and 'in5' is
2681  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2682  Each byte element from input vector pair 'in6' and 'in7' is
2683  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2684  The half vector results from all 4 vectors are stored in
2685  destination memory as 8x4 byte block
2686 */
2687 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2688  pdst, stride) \
2689 { \
2690  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2691  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2692  \
2693  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2694  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2695  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2696  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2697  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2698 }
2699 
2700 /* Description : Average rounded byte elements from pair of vectors,
2701  average rounded with destination and store 16x4 byte block
2702  in destination memory
2703  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2704  Details : Each byte element from input vector pair 'in0' and 'in1' is
2705  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2706  Each byte element from input vector pair 'in2' and 'in3' is
2707  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2708  Each byte element from input vector pair 'in4' and 'in5' is
2709  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2710  Each byte element from input vector pair 'in6' and 'in7' is
2711  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2712  The vector results from all 4 vectors are stored in
2713  destination memory as 16x4 byte block
2714 */
2715 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2716  pdst, stride) \
2717 { \
2718  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2719  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2720  \
2721  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2722  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2723  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2724  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2725  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2726 }
2727 
2728 /* Description : Add block 4x4
2729  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2730  Details : Least significant 4 bytes from each input vector are added to
2731  the destination bytes, clipped to the range 0..255 and then stored.
2732 */
2733 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2734 { \
2735  uint32_t src0_m, src1_m, src2_m, src3_m; \
2736  uint32_t out0_m, out1_m, out2_m, out3_m; \
2737  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2738  v16i8 dst0_m = { 0 }; \
2739  v16i8 dst1_m = { 0 }; \
2740  v16i8 zero_m = { 0 }; \
2741  \
2742  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
2743  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2744  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2745  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2746  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2747  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2748  CLIP_SH2_0_255(res0_m, res1_m); \
2749  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2750  \
2751  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2752  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2753  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2754  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2755  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2756 }
2757 
2758 /* Description : Dot product and addition of 3 signed halfword input vectors
2759  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
2760  Outputs - out0_m
2761  Return Type - signed halfword
2762  Details : Dot product of 'in0' with 'coeff0'
2763  Dot product of 'in1' with 'coeff1'
2764  Dot product of 'in2' with 'coeff2'
2765  Addition of all the 3 vector results
2766 
2767  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2768 */
2769 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2770 ( { \
2771  v8i16 out0_m; \
2772  \
2773  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2774  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2775  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
2776  \
2777  out0_m; \
2778 } )
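/* Editor's usage sketch (hypothetical names): a 6-tap filter expressed as
 * three byte-pair dot products accumulated into halfwords:
 *
 *     v16i8 vec0, vec1, vec2, filt0, filt1, filt2;
 *     v8i16 sum;
 *     sum = DPADD_SH3_SH(vec0, vec1, vec2, filt0, filt1, filt2);
 */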
2779 
2780 /* Description : Pack even elements of input vectors & xor with 128
2781  Arguments : Inputs - in0, in1
2782  Outputs - out_m
2783  Return Type - unsigned byte
2784  Details : Signed byte even elements from 'in0' and 'in1' are packed
2785  together in one vector and the resulting vector is xor'ed with
2786  128 to shift the range from signed to unsigned byte
2787 */
2788 #define PCKEV_XORI128_UB(in0, in1) \
2789 ( { \
2790  v16u8 out_m; \
2791  out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2792  out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
2793  out_m; \
2794 } )
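/* Editor's usage sketch (hypothetical names): the natural counterpart of the
 * XORI_B*_128 macros above, packing signed filter output back to unsigned
 * pixels:
 *
 *     v8i16 res0, res1;
 *     v16u8 out = PCKEV_XORI128_UB(res0, res1);
 */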
2795 
2796 /* Description : Converts inputs to unsigned bytes, averages them with the
2797  destination & stores the result as an 8x4 unsigned byte block
2798  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
2799 */
2800 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2801  dst0, dst1, pdst, stride) \
2802 { \
2803  v16u8 tmp0_m, tmp1_m; \
2804  uint8_t *pdst_m = (uint8_t *) (pdst); \
2805  \
2806  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2807  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2808  AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
2809  ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
2810 }
2811 
2812 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2813  of results and store 4 words in destination memory as per
2814  stride
2815  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2816 */
2817 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2818 { \
2819  uint32_t out0_m, out1_m, out2_m, out3_m; \
2820  v16i8 tmp0_m, tmp1_m; \
2821  \
2822  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2823  \
2824  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2825  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2826  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2827  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2828  \
2829  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2830 }
2831 
2832 /* Description : Pack even byte elements and store byte vector in destination
2833  memory
2834  Arguments : Inputs - in0, in1, pdst
2835 */
2836 #define PCKEV_ST_SB(in0, in1, pdst) \
2837 { \
2838  v16i8 tmp_m; \
2839  tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2840  ST_SB(tmp_m, (pdst)); \
2841 }
2842 
2843 /* Description : Horizontal 2 tap filter kernel code
2844  Arguments : Inputs - in0, in1, mask, coeff, shift
2845 */
2846 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2847 ( { \
2848  v16i8 tmp0_m; \
2849  v8u16 tmp1_m; \
2850  \
2851  tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2852  tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2853  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2854  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2855  \
2856  tmp1_m; \
2857 } )
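/* Editor's note on the kernel above: __msa_vshf_b gathers the 2-tap input
 * pairs selected by 'mask', __msa_dotp_u_h multiplies each byte pair by the
 * two taps in 'coeff' and sums into halfwords, then the sum is rounded and
 * shifted right by 'shift' bits and saturated to (shift + 1) bits. A
 * hypothetical bilinear use with a 7-bit filter:
 *
 *     v16u8 src0, src1, filt;
 *     v16i8 mask;
 *     v8u16 out = HORIZ_2TAP_FILT_UH(src0, src1, mask, filt, 7);
 */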
2858 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */