/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#ifndef AVUTIL_MIPS_GENERIC_MACROS_MSA_H
#define AVUTIL_MIPS_GENERIC_MACROS_MSA_H

#include <stdint.h>
#include <msa.h>
#include <config.h>

#if HAVE_MSA2
#include <msa2.h>
#endif

#define ALIGNMENT 16
#define ALLOC_ALIGNED(align) __attribute__ ((aligned((align) << 1)))

#define LD_V(RTYPE, psrc) *((RTYPE *)(psrc))
#define LD_UB(...) LD_V(v16u8, __VA_ARGS__)
#define LD_SB(...) LD_V(v16i8, __VA_ARGS__)
#define LD_UH(...) LD_V(v8u16, __VA_ARGS__)
#define LD_SH(...) LD_V(v8i16, __VA_ARGS__)
#define LD_UW(...) LD_V(v4u32, __VA_ARGS__)
#define LD_SW(...) LD_V(v4i32, __VA_ARGS__)

#define ST_V(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
#define ST_UB(...) ST_V(v16u8, __VA_ARGS__)
#define ST_SB(...) ST_V(v16i8, __VA_ARGS__)
#define ST_UH(...) ST_V(v8u16, __VA_ARGS__)
#define ST_SH(...) ST_V(v8i16, __VA_ARGS__)
#define ST_UW(...) ST_V(v4u32, __VA_ARGS__)
#define ST_SW(...) ST_V(v4i32, __VA_ARGS__)

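/* Usage sketch (illustrative, not part of the original header): copy one
 * 16-byte row with a vector load and store. The pointer names 'src' and
 * 'dst' are assumptions made for the example.
 *
 *     v16u8 row = LD_UB(src);   // load 16 unsigned bytes
 *     ST_UB(row, dst);          // store them back out
 */
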
#if (__mips_isa_rev >= 6)
    #define LH(psrc)                              \
    ( {                                           \
        uint16_t val_lh_m = *(uint16_t *)(psrc);  \
        val_lh_m;                                 \
    } )

    #define LW(psrc)                              \
    ( {                                           \
        uint32_t val_lw_m = *(uint32_t *)(psrc);  \
        val_lw_m;                                 \
    } )

    #if (__mips == 64)
        #define LD(psrc)                              \
        ( {                                           \
            uint64_t val_ld_m = *(uint64_t *)(psrc);  \
            val_ld_m;                                 \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst) *(uint16_t *)(pdst) = (val);
    #define SW(val, pdst) *(uint32_t *)(pdst) = (val);
    #define SD(val, pdst) *(uint64_t *)(pdst) = (val);

#else  // !(__mips_isa_rev >= 6)
    #define LH(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lh_m = (uint8_t *) (psrc);     \
        uint16_t val_lh_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulh  %[val_lh_m],  %[psrc_lh_m]  \n\t"  \
                                                     \
            : [val_lh_m] "=r" (val_lh_m)             \
            : [psrc_lh_m] "m" (*psrc_lh_m)           \
        );                                           \
                                                     \
        val_lh_m;                                    \
    } )

    #define LW(psrc)                                 \
    ( {                                              \
        uint8_t *psrc_lw_m = (uint8_t *) (psrc);     \
        uint32_t val_lw_m;                           \
                                                     \
        __asm__ volatile (                           \
            "ulw  %[val_lw_m],  %[psrc_lw_m]  \n\t"  \
                                                     \
            : [val_lw_m] "=r" (val_lw_m)             \
            : [psrc_lw_m] "m" (*psrc_lw_m)           \
        );                                           \
                                                     \
        val_lw_m;                                    \
    } )

    #if (__mips == 64)
        #define LD(psrc)                                 \
        ( {                                              \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);     \
            uint64_t val_ld_m = 0;                       \
                                                         \
            __asm__ volatile (                           \
                "uld  %[val_ld_m],  %[psrc_ld_m]  \n\t"  \
                                                         \
                : [val_ld_m] "=r" (val_ld_m)             \
                : [psrc_ld_m] "m" (*psrc_ld_m)           \
            );                                           \
                                                         \
            val_ld_m;                                    \
        } )
    #else  // !(__mips == 64)
        #define LD(psrc)                                                    \
        ( {                                                                 \
            uint8_t *psrc_ld_m = (uint8_t *) (psrc);                        \
            uint32_t val0_ld_m, val1_ld_m;                                  \
            uint64_t val_ld_m = 0;                                          \
                                                                            \
            val0_ld_m = LW(psrc_ld_m);                                      \
            val1_ld_m = LW(psrc_ld_m + 4);                                  \
                                                                            \
            val_ld_m = (uint64_t) (val1_ld_m);                              \
            val_ld_m = (uint64_t) ((val_ld_m << 32) & 0xFFFFFFFF00000000);  \
            val_ld_m = (uint64_t) (val_ld_m | (uint64_t) val0_ld_m);        \
                                                                            \
            val_ld_m;                                                       \
        } )
    #endif  // (__mips == 64)

    #define SH(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sh_m = (uint8_t *) (pdst);     \
        uint16_t val_sh_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "ush  %[val_sh_m],  %[pdst_sh_m]  \n\t"  \
                                                     \
            : [pdst_sh_m] "=m" (*pdst_sh_m)          \
            : [val_sh_m] "r" (val_sh_m)              \
        );                                           \
    }

    #define SW(val, pdst)                            \
    {                                                \
        uint8_t *pdst_sw_m = (uint8_t *) (pdst);     \
        uint32_t val_sw_m = (val);                   \
                                                     \
        __asm__ volatile (                           \
            "usw  %[val_sw_m],  %[pdst_sw_m]  \n\t"  \
                                                     \
            : [pdst_sw_m] "=m" (*pdst_sw_m)          \
            : [val_sw_m] "r" (val_sw_m)              \
        );                                           \
    }

    #define SD(val, pdst)                                            \
    {                                                                \
        uint8_t *pdst_sd_m = (uint8_t *) (pdst);                     \
        uint32_t val0_sd_m, val1_sd_m;                               \
                                                                     \
        val0_sd_m = (uint32_t) ((val) & 0x00000000FFFFFFFF);         \
        val1_sd_m = (uint32_t) (((val) >> 32) & 0x00000000FFFFFFFF); \
                                                                     \
        SW(val0_sd_m, pdst_sd_m);                                    \
        SW(val1_sd_m, pdst_sd_m + 4);                                \
    }
#endif  // (__mips_isa_rev >= 6)

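/* Usage sketch (illustrative, not part of the original header): move a
 * 64-bit value between possibly unaligned addresses. On MIPS r6 the LD/SD
 * macros expand to plain loads/stores (the ISA handles unaligned access);
 * on older ISAs they expand to ulw/usw-based sequences.
 *
 *     uint64_t v = LD(src);  // 'src' need not be 8-byte aligned
 *     SD(v, dst);            // likewise for 'dst'
 */
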
/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Loads word in 'out0' from (psrc)
                 Loads word in 'out1' from (psrc + stride)
                 Loads word in 'out2' from (psrc + 2 * stride)
                 Loads word in 'out3' from (psrc + 3 * stride)
*/
#define LW4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    out0 = LW((psrc));                             \
    out1 = LW((psrc) + stride);                    \
    out2 = LW((psrc) + 2 * stride);                \
    out3 = LW((psrc) + 3 * stride);                \
}

#define LW2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LW((psrc));                 \
    out1 = LW((psrc) + stride);        \
}

/* Description : Load double words with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Loads double word in 'out0' from (psrc)
                 Loads double word in 'out1' from (psrc + stride)
*/
#define LD2(psrc, stride, out0, out1)  \
{                                      \
    out0 = LD((psrc));                 \
    out1 = LD((psrc) + stride);        \
}
#define LD4(psrc, stride, out0, out1, out2, out3)  \
{                                                  \
    LD2((psrc), stride, out0, out1);               \
    LD2((psrc) + 2 * stride, stride, out2, out3);  \
}

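/* Usage sketch (illustrative, not part of the original header): gather four
 * rows of an 8-byte-wide block whose rows are 'stride' bytes apart.
 *
 *     uint64_t r0, r1, r2, r3;
 *     LD4(src, stride, r0, r1, r2, r3);  // r0..r3 hold rows 0..3
 */
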
/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores word from 'in0' to (pdst)
                 Stores word from 'in1' to (pdst + stride)
                 Stores word from 'in2' to (pdst + 2 * stride)
                 Stores word from 'in3' to (pdst + 3 * stride)
*/
#define SW4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SW(in0, (pdst));                           \
    SW(in1, (pdst) + stride);                  \
    SW(in2, (pdst) + 2 * stride);              \
    SW(in3, (pdst) + 3 * stride);              \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Stores double word from 'in0' to (pdst)
                 Stores double word from 'in1' to (pdst + stride)
                 Stores double word from 'in2' to (pdst + 2 * stride)
                 Stores double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4(in0, in1, in2, in3, pdst, stride)  \
{                                              \
    SD(in0, (pdst));                           \
    SD(in1, (pdst) + stride);                  \
    SD(in2, (pdst) + 2 * stride);              \
    SD(in3, (pdst) + 3 * stride);              \
}

/* Description : Load vector elements with stride
   Arguments   : Inputs  - psrc    (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Loads elements in 'out0' from (psrc)
                 Loads elements in 'out1' from (psrc + stride)
*/
#define LD_V2(RTYPE, psrc, stride, out0, out1)  \
{                                               \
    out0 = LD_V(RTYPE, (psrc));                 \
    out1 = LD_V(RTYPE, (psrc) + stride);        \
}
#define LD_UB2(...) LD_V2(v16u8, __VA_ARGS__)
#define LD_SB2(...) LD_V2(v16i8, __VA_ARGS__)
#define LD_UH2(...) LD_V2(v8u16, __VA_ARGS__)
#define LD_SH2(...) LD_V2(v8i16, __VA_ARGS__)
#define LD_SW2(...) LD_V2(v4i32, __VA_ARGS__)

#define LD_V3(RTYPE, psrc, stride, out0, out1, out2)  \
{                                                     \
    LD_V2(RTYPE, (psrc), stride, out0, out1);         \
    out2 = LD_V(RTYPE, (psrc) + 2 * stride);          \
}
#define LD_UB3(...) LD_V3(v16u8, __VA_ARGS__)
#define LD_SB3(...) LD_V3(v16i8, __VA_ARGS__)

#define LD_V4(RTYPE, psrc, stride, out0, out1, out2, out3)  \
{                                                           \
    LD_V2(RTYPE, (psrc), stride, out0, out1);               \
    LD_V2(RTYPE, (psrc) + 2 * stride, stride, out2, out3);  \
}
#define LD_UB4(...) LD_V4(v16u8, __VA_ARGS__)
#define LD_SB4(...) LD_V4(v16i8, __VA_ARGS__)
#define LD_UH4(...) LD_V4(v8u16, __VA_ARGS__)
#define LD_SH4(...) LD_V4(v8i16, __VA_ARGS__)

#define LD_V5(RTYPE, psrc, stride, out0, out1, out2, out3, out4)  \
{                                                                 \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);         \
    out4 = LD_V(RTYPE, (psrc) + 4 * stride);                      \
}
#define LD_UB5(...) LD_V5(v16u8, __VA_ARGS__)
#define LD_SB5(...) LD_V5(v16i8, __VA_ARGS__)

#define LD_V6(RTYPE, psrc, stride, out0, out1, out2, out3, out4, out5)  \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V2(RTYPE, (psrc) + 4 * stride, stride, out4, out5);              \
}
#define LD_UB6(...) LD_V6(v16u8, __VA_ARGS__)
#define LD_SB6(...) LD_V6(v16i8, __VA_ARGS__)
#define LD_UH6(...) LD_V6(v8u16, __VA_ARGS__)
#define LD_SH6(...) LD_V6(v8i16, __VA_ARGS__)

#define LD_V7(RTYPE, psrc, stride,                               \
              out0, out1, out2, out3, out4, out5, out6)          \
{                                                                \
    LD_V5(RTYPE, (psrc), stride, out0, out1, out2, out3, out4);  \
    LD_V2(RTYPE, (psrc) + 5 * stride, stride, out5, out6);       \
}
#define LD_UB7(...) LD_V7(v16u8, __VA_ARGS__)
#define LD_SB7(...) LD_V7(v16i8, __VA_ARGS__)

#define LD_V8(RTYPE, psrc, stride,                                      \
              out0, out1, out2, out3, out4, out5, out6, out7)           \
{                                                                       \
    LD_V4(RTYPE, (psrc), stride, out0, out1, out2, out3);               \
    LD_V4(RTYPE, (psrc) + 4 * stride, stride, out4, out5, out6, out7);  \
}
#define LD_UB8(...) LD_V8(v16u8, __VA_ARGS__)
#define LD_SB8(...) LD_V8(v16i8, __VA_ARGS__)
#define LD_UH8(...) LD_V8(v8u16, __VA_ARGS__)
#define LD_SH8(...) LD_V8(v8i16, __VA_ARGS__)

#define LD_V16(RTYPE, psrc, stride,                                   \
               out0, out1, out2, out3, out4, out5, out6, out7,        \
               out8, out9, out10, out11, out12, out13, out14, out15)  \
{                                                                     \
    LD_V8(RTYPE, (psrc), stride,                                      \
          out0, out1, out2, out3, out4, out5, out6, out7);            \
    LD_V8(RTYPE, (psrc) + 8 * stride, stride,                         \
          out8, out9, out10, out11, out12, out13, out14, out15);      \
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)

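/* Usage sketch (illustrative, not part of the original header): load eight
 * 16-byte rows of a block into vector registers in one statement.
 *
 *     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
 *     LD_UB8(src, stride, r0, r1, r2, r3, r4, r5, r6, r7);
 */
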
/* Description : Store vectors with stride
   Arguments   : Inputs  - in0, in1, stride
                 Outputs - pdst (destination pointer to store to)
   Details     : Stores elements from 'in0' to (pdst)
                 Stores elements from 'in1' to (pdst + stride)
*/
#define ST_V2(RTYPE, in0, in1, pdst, stride)  \
{                                             \
    ST_V(RTYPE, in0, (pdst));                 \
    ST_V(RTYPE, in1, (pdst) + stride);        \
}
#define ST_UB2(...) ST_V2(v16u8, __VA_ARGS__)
#define ST_SB2(...) ST_V2(v16i8, __VA_ARGS__)
#define ST_UH2(...) ST_V2(v8u16, __VA_ARGS__)
#define ST_SH2(...) ST_V2(v8i16, __VA_ARGS__)
#define ST_SW2(...) ST_V2(v4i32, __VA_ARGS__)

#define ST_V4(RTYPE, in0, in1, in2, in3, pdst, stride)    \
{                                                         \
    ST_V2(RTYPE, in0, in1, (pdst), stride);               \
    ST_V2(RTYPE, in2, in3, (pdst) + 2 * stride, stride);  \
}
#define ST_UB4(...) ST_V4(v16u8, __VA_ARGS__)
#define ST_SB4(...) ST_V4(v16i8, __VA_ARGS__)
#define ST_SH4(...) ST_V4(v8i16, __VA_ARGS__)
#define ST_SW4(...) ST_V4(v4i32, __VA_ARGS__)

#define ST_V6(RTYPE, in0, in1, in2, in3, in4, in5, pdst, stride)  \
{                                                                 \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);             \
    ST_V2(RTYPE, in4, in5, (pdst) + 4 * stride, stride);          \
}
#define ST_SH6(...) ST_V6(v8i16, __VA_ARGS__)

#define ST_V8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                           \
    ST_V4(RTYPE, in0, in1, in2, in3, (pdst), stride);                       \
    ST_V4(RTYPE, in4, in5, in6, in7, (pdst) + 4 * stride, stride);          \
}
#define ST_UB8(...) ST_V8(v16u8, __VA_ARGS__)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)

/* Description : Store half word elements of vector with stride
 * Arguments   : Inputs - in      (source vector)
 *                      - pdst    (destination pointer to store to)
 *                      - stride
 * Details     : Stores half word 'idx0' from 'in' to (pdst)
 *               Stores half word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_H1(in, idx, pdst)                   \
{                                              \
    uint16_t out0_m;                           \
    out0_m = __msa_copy_u_h((v8i16) in, idx);  \
    SH(out0_m, (pdst));                        \
}
#define ST_H2(in, idx0, idx1, pdst, stride)     \
{                                               \
    uint16_t out0_m, out1_m;                    \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);  \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);  \
    SH(out0_m, (pdst));                         \
    SH(out1_m, (pdst) + stride);                \
}
#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint16_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_h((v8i16) in, idx0);           \
    out1_m = __msa_copy_u_h((v8i16) in, idx1);           \
    out2_m = __msa_copy_u_h((v8i16) in, idx2);           \
    out3_m = __msa_copy_u_h((v8i16) in, idx3);           \
    SH(out0_m, (pdst));                                  \
    SH(out1_m, (pdst) + stride);                         \
    SH(out2_m, (pdst) + 2 * stride);                     \
    SH(out3_m, (pdst) + 3 * stride);                     \
}
#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5,               \
              idx6, idx7, pdst, stride)                             \
{                                                                   \
    ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)                 \
    ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride)  \
}

/* Description : Store word elements of vector with stride
 * Arguments   : Inputs - in      (source vector)
 *                      - pdst    (destination pointer to store to)
 *                      - stride
 * Details     : Stores word 'idx0' from 'in' to (pdst)
 *               Stores word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_W1(in, idx, pdst)                   \
{                                              \
    uint32_t out0_m;                           \
    out0_m = __msa_copy_u_w((v4i32) in, idx);  \
    SW(out0_m, (pdst));                        \
}
#define ST_W2(in, idx0, idx1, pdst, stride)     \
{                                               \
    uint32_t out0_m, out1_m;                    \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);  \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);  \
    SW(out0_m, (pdst));                         \
    SW(out1_m, (pdst) + stride);                \
}
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;             \
    out0_m = __msa_copy_u_w((v4i32) in, idx0);           \
    out1_m = __msa_copy_u_w((v4i32) in, idx1);           \
    out2_m = __msa_copy_u_w((v4i32) in, idx2);           \
    out3_m = __msa_copy_u_w((v4i32) in, idx3);           \
    SW(out0_m, (pdst));                                  \
    SW(out1_m, (pdst) + stride);                         \
    SW(out2_m, (pdst) + 2 * stride);                     \
    SW(out3_m, (pdst) + 3 * stride);                     \
}
#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                   \
              idx4, idx5, idx6, idx7, pdst, stride)               \
{                                                                 \
    ST_W4(in0, idx0, idx1, idx2, idx3, pdst, stride)              \
    ST_W4(in1, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}

/* Description : Store double word elements of vector with stride
 * Arguments   : Inputs - in      (source vector)
 *                      - pdst    (destination pointer to store to)
 *                      - stride
 * Details     : Stores double word 'idx0' from 'in' to (pdst)
 *               Stores double word 'idx1' from 'in' to (pdst + stride)
 *               Similar for other elements
 */
#define ST_D1(in, idx, pdst)                   \
{                                              \
    uint64_t out0_m;                           \
    out0_m = __msa_copy_u_d((v2i64) in, idx);  \
    SD(out0_m, (pdst));                        \
}
#define ST_D2(in, idx0, idx1, pdst, stride)     \
{                                               \
    uint64_t out0_m, out1_m;                    \
    out0_m = __msa_copy_u_d((v2i64) in, idx0);  \
    out1_m = __msa_copy_u_d((v2i64) in, idx1);  \
    SD(out0_m, (pdst));                         \
    SD(out1_m, (pdst) + stride);                \
}
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)  \
{                                                              \
    uint64_t out0_m, out1_m, out2_m, out3_m;                   \
    out0_m = __msa_copy_u_d((v2i64) in0, idx0);                \
    out1_m = __msa_copy_u_d((v2i64) in0, idx1);                \
    out2_m = __msa_copy_u_d((v2i64) in1, idx2);                \
    out3_m = __msa_copy_u_d((v2i64) in1, idx3);                \
    SD(out0_m, (pdst));                                        \
    SD(out1_m, (pdst) + stride);                               \
    SD(out2_m, (pdst) + 2 * stride);                           \
    SD(out3_m, (pdst) + 3 * stride);                           \
}
#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,            \
              idx4, idx5, idx6, idx7, pdst, stride)                  \
{                                                                    \
    ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)            \
    ST_D4(in2, in3, idx4, idx5, idx6, idx7, pdst + 4 * stride, stride) \
}

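/* Usage sketch (illustrative, not part of the original header): write the
 * two 64-bit halves of a vector to consecutive rows of an 8-byte-wide block.
 *
 *     ST_D2(px, 0, 1, dst, stride);
 *     // low half of px to dst, high half to dst + stride
 */
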
/* Description : Store 12x8 byte block to destination memory from
                 input vectors
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
   Details     : Index 0 double word element from input vector 'in0' is copied
                 and stored to destination memory at (pblk_12x8_m), followed
                 by index 2 word element from the same input vector 'in0' at
                 (pblk_12x8_m + 8)
                 Similar for the remaining rows
*/
#define ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                        \
    uint64_t out0_m, out1_m, out2_m, out3_m;                             \
    uint64_t out4_m, out5_m, out6_m, out7_m;                             \
    uint32_t out8_m, out9_m, out10_m, out11_m;                           \
    uint32_t out12_m, out13_m, out14_m, out15_m;                         \
    uint8_t *pblk_12x8_m = (uint8_t *) (pdst);                           \
                                                                         \
    out0_m = __msa_copy_u_d((v2i64) in0, 0);                             \
    out1_m = __msa_copy_u_d((v2i64) in1, 0);                             \
    out2_m = __msa_copy_u_d((v2i64) in2, 0);                             \
    out3_m = __msa_copy_u_d((v2i64) in3, 0);                             \
    out4_m = __msa_copy_u_d((v2i64) in4, 0);                             \
    out5_m = __msa_copy_u_d((v2i64) in5, 0);                             \
    out6_m = __msa_copy_u_d((v2i64) in6, 0);                             \
    out7_m = __msa_copy_u_d((v2i64) in7, 0);                             \
                                                                         \
    out8_m = __msa_copy_u_w((v4i32) in0, 2);                             \
    out9_m = __msa_copy_u_w((v4i32) in1, 2);                             \
    out10_m = __msa_copy_u_w((v4i32) in2, 2);                            \
    out11_m = __msa_copy_u_w((v4i32) in3, 2);                            \
    out12_m = __msa_copy_u_w((v4i32) in4, 2);                            \
    out13_m = __msa_copy_u_w((v4i32) in5, 2);                            \
    out14_m = __msa_copy_u_w((v4i32) in6, 2);                            \
    out15_m = __msa_copy_u_w((v4i32) in7, 2);                            \
                                                                         \
    SD(out0_m, pblk_12x8_m);                                             \
    SW(out8_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out1_m, pblk_12x8_m);                                             \
    SW(out9_m, pblk_12x8_m + 8);                                         \
    pblk_12x8_m += stride;                                               \
    SD(out2_m, pblk_12x8_m);                                             \
    SW(out10_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out3_m, pblk_12x8_m);                                             \
    SW(out11_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out4_m, pblk_12x8_m);                                             \
    SW(out12_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out5_m, pblk_12x8_m);                                             \
    SW(out13_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out6_m, pblk_12x8_m);                                             \
    SW(out14_m, pblk_12x8_m + 8);                                        \
    pblk_12x8_m += stride;                                               \
    SD(out7_m, pblk_12x8_m);                                             \
    SW(out15_m, pblk_12x8_m + 8);                                        \
}

/* Description : Average with rounding: (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each byte element from the 'in0' vector is added to the
                 corresponding byte element from the 'in1' vector. The
                 addition of the elements plus 1 (for rounding) is done
                 unsigned with full precision, i.e. the result has one extra
                 bit. Unsigned division by 2 (logical shift right by one bit)
                 is performed before writing the result to vector 'out0'
                 Similar for the pair of 'in2' and 'in3'
*/
#define AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)       \
{                                                             \
    out0 = (RTYPE) __msa_aver_u_b((v16u8) in0, (v16u8) in1);  \
    out1 = (RTYPE) __msa_aver_u_b((v16u8) in2, (v16u8) in3);  \
}
#define AVER_UB2_UB(...) AVER_UB2(v16u8, __VA_ARGS__)

#define AVER_UB4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                 out0, out1, out2, out3)                         \
{                                                                \
    AVER_UB2(RTYPE, in0, in1, in2, in3, out0, out1)              \
    AVER_UB2(RTYPE, in4, in5, in6, in7, out2, out3)              \
}
#define AVER_UB4_UB(...) AVER_UB4(v16u8, __VA_ARGS__)

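/* Usage sketch (illustrative, not part of the original header): rounded
 * averaging of two prediction blocks, as used in bi-directional motion
 * compensation.
 *
 *     v16u8 avg0, avg1;
 *     AVER_UB2_UB(pred0, pred1, pred2, pred3, avg0, avg1);
 *     // avg0 = (pred0 + pred1 + 1) >> 1 per byte; likewise avg1
 */
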
/* Description : Immediate number of columns to slide
   Arguments   : Inputs  - s, d, slide_val
                 Outputs - out
                 Return Type - as per RTYPE
   Details     : Byte elements from the 'd' vector are slid into 's' by
                 the number of elements specified by 'slide_val'
*/
#define SLDI_B(RTYPE, d, s, slide_val, out)                        \
{                                                                  \
    out = (RTYPE) __msa_sldi_b((v16i8) d, (v16i8) s, slide_val);   \
}

#define SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
{                                                              \
    SLDI_B(RTYPE, d0, s0, slide_val, out0)                     \
    SLDI_B(RTYPE, d1, s1, slide_val, out1)                     \
}
#define SLDI_B2_UB(...) SLDI_B2(v16u8, __VA_ARGS__)
#define SLDI_B2_SB(...) SLDI_B2(v16i8, __VA_ARGS__)
#define SLDI_B2_SH(...) SLDI_B2(v8i16, __VA_ARGS__)
#define SLDI_B2_SW(...) SLDI_B2(v4i32, __VA_ARGS__)

#define SLDI_B3(RTYPE, d0, s0, d1, s1, d2, s2, slide_val,  \
                out0, out1, out2)                          \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B(RTYPE, d2, s2, slide_val, out2)                 \
}
#define SLDI_B3_UB(...) SLDI_B3(v16u8, __VA_ARGS__)
#define SLDI_B3_SB(...) SLDI_B3(v16i8, __VA_ARGS__)
#define SLDI_B3_UH(...) SLDI_B3(v8u16, __VA_ARGS__)

#define SLDI_B4(RTYPE, d0, s0, d1, s1, d2, s2, d3, s3,     \
                slide_val, out0, out1, out2, out3)         \
{                                                          \
    SLDI_B2(RTYPE, d0, s0, d1, s1, slide_val, out0, out1)  \
    SLDI_B2(RTYPE, d2, s2, d3, s3, slide_val, out2, out3)  \
}
#define SLDI_B4_UB(...) SLDI_B4(v16u8, __VA_ARGS__)
#define SLDI_B4_SB(...) SLDI_B4(v16i8, __VA_ARGS__)
#define SLDI_B4_SH(...) SLDI_B4(v8i16, __VA_ARGS__)

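/* Usage sketch (illustrative, not part of the original header): form a
 * vector starting 2 bytes into 'cur', pulling the top bytes from 'next';
 * with d == s this acts as a byte-wise rotate of a single vector.
 *
 *     v16i8 out;
 *     SLDI_B(v16i8, next, cur, 2, out);
 *     // out = cur bytes 2..15 followed by next bytes 0..1
 */
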
/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective byte elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_b((v16i8) mask0, (v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_vshf_b((v16i8) mask1, (v16i8) in3, (v16i8) in2);  \
}
#define VSHF_B2_UB(...) VSHF_B2(v16u8, __VA_ARGS__)
#define VSHF_B2_SB(...) VSHF_B2(v16i8, __VA_ARGS__)
#define VSHF_B2_UH(...) VSHF_B2(v8u16, __VA_ARGS__)
#define VSHF_B2_SH(...) VSHF_B2(v8i16, __VA_ARGS__)

#define VSHF_B3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_B2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_b((v16i8) mask2, (v16i8) in5, (v16i8) in4);  \
}
#define VSHF_B3_SB(...) VSHF_B3(v16i8, __VA_ARGS__)

#define VSHF_B4(RTYPE, in0, in1, mask0, mask1, mask2, mask3,       \
                out0, out1, out2, out3)                            \
{                                                                  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask0, mask1, out0, out1);  \
    VSHF_B2(RTYPE, in0, in1, in0, in1, mask2, mask3, out2, out3);  \
}
#define VSHF_B4_SB(...) VSHF_B4(v16i8, __VA_ARGS__)
#define VSHF_B4_SH(...) VSHF_B4(v8i16, __VA_ARGS__)

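/* Usage sketch (illustrative, not part of the original header): in VSHF_B2,
 * mask byte values 0..15 select from the first vector of each pair and
 * 16..31 from the second. A mask such as { 0, 1, 1, 2, 2, 3, ... } gathers
 * the overlapping byte pairs consumed by a horizontal filter.
 *
 *     v16i8 pairs0, pairs1;
 *     VSHF_B2_SB(src0, src0, src1, src1, mask, mask, pairs0, pairs1);
 */
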
/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from in0 & in1 are copied to out0
                 as per control vector mask0
                 Selective halfword elements from in2 & in3 are copied to out1
                 as per control vector mask1
*/
#define VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_h((v8i16) mask0, (v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_vshf_h((v8i16) mask1, (v8i16) in3, (v8i16) in2);  \
}
#define VSHF_H2_SH(...) VSHF_H2(v8i16, __VA_ARGS__)

#define VSHF_H3(RTYPE, in0, in1, in2, in3, in4, in5, mask0, mask1, mask2,  \
                out0, out1, out2)                                          \
{                                                                          \
    VSHF_H2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1);          \
    out2 = (RTYPE) __msa_vshf_h((v8i16) mask2, (v8i16) in5, (v8i16) in4);  \
}
#define VSHF_H3_SH(...) VSHF_H3(v8i16, __VA_ARGS__)

/* Description : Shuffle word vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective word elements from in0 & in1 are copied to out0 as
                 per control vector mask0
                 Selective word elements from in2 & in3 are copied to out1 as
                 per control vector mask1
*/
#define VSHF_W2(RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1)       \
{                                                                          \
    out0 = (RTYPE) __msa_vshf_w((v4i32) mask0, (v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_vshf_w((v4i32) mask1, (v4i32) in3, (v4i32) in2);  \
}
#define VSHF_W2_SB(...) VSHF_W2(v16i8, __VA_ARGS__)

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0, producing results
                 twice the size of the input, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (two vectors of unsigned halfword results)
*/
#define DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_u_h((v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_u_h((v16u8) mult1, (v16u8) cnst1);  \
}
#define DOTP_UB2_UH(...) DOTP_UB2(v8u16, __VA_ARGS__)

#define DOTP_UB4(RTYPE, mult0, mult1, mult2, mult3,            \
                 cnst0, cnst1, cnst2, cnst3,                   \
                 out0, out1, out2, out3)                       \
{                                                              \
    DOTP_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);   \
    DOTP_UB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);   \
}
#define DOTP_UB4_UH(...) DOTP_UB4(v8u16, __VA_ARGS__)

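/* Usage sketch (illustrative, not part of the original header): one step of
 * an unsigned 2-tap filter. Each halfword lane k of 'out0' becomes
 * src0[2k] * coef[2k] + src0[2k+1] * coef[2k+1].
 *
 *     v8u16 out0, out1;
 *     DOTP_UB2_UH(src0, src1, coef, coef, out0, out1);
 */
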
/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0, producing results
                 twice the size of the input, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (two vectors of signed halfword results)
*/
#define DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_h((v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_h((v16i8) mult1, (v16i8) cnst1);  \
}
#define DOTP_SB2_SH(...) DOTP_SB2(v8i16, __VA_ARGS__)

#define DOTP_SB3(RTYPE, mult0, mult1, mult2, cnst0, cnst1, cnst2,  \
                 out0, out1, out2)                                 \
{                                                                  \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);       \
    out2 = (RTYPE) __msa_dotp_s_h((v16i8) mult2, (v16i8) cnst2);   \
}
#define DOTP_SB3_SH(...) DOTP_SB3(v8i16, __VA_ARGS__)

#define DOTP_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                 cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                     \
    DOTP_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DOTP_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DOTP_SB4_SH(...) DOTP_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0, producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 added together and stored to the out vector
                 (two vectors of signed word results)
*/
#define DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                 \
    out0 = (RTYPE) __msa_dotp_s_w((v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dotp_s_w((v8i16) mult1, (v8i16) cnst1);  \
}
#define DOTP_SH2_SW(...) DOTP_SH2(v4i32, __VA_ARGS__)

#define DOTP_SH4(RTYPE, mult0, mult1, mult2, mult3,           \
                 cnst0, cnst1, cnst2, cnst3,                  \
                 out0, out1, out2, out3)                      \
{                                                             \
    DOTP_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);  \
    DOTP_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);  \
}
#define DOTP_SH4_SW(...) DOTP_SH4(v4i32, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from mult0 are multiplied with
                 signed byte elements from cnst0, producing results
                 twice the size of the input, i.e. signed halfwords.
                 The products of adjacent odd-even element pairs are then
                 accumulated into the out vector
                 (two vectors of signed halfword results)
*/
#define DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_h((v8i16) out0,                   \
                                   (v16i8) mult0, (v16i8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_h((v8i16) out1,                   \
                                   (v16i8) mult1, (v16i8) cnst1);  \
}
#define DPADD_SB2_SH(...) DPADD_SB2(v8i16, __VA_ARGS__)

#define DPADD_SB4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SB2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SB4_SH(...) DPADD_SB4(v8i16, __VA_ARGS__)

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from mult0 are multiplied with
                 unsigned byte elements from cnst0, producing results
                 twice the size of the input, i.e. unsigned halfwords.
                 The products of adjacent odd-even element pairs are then
                 accumulated into the out vector
                 (two vectors of unsigned halfword results)
*/
#define DPADD_UB2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_u_h((v8u16) out0,                   \
                                   (v16u8) mult0, (v16u8) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_u_h((v8u16) out1,                   \
                                   (v16u8) mult1, (v16u8) cnst1);  \
}
#define DPADD_UB2_UH(...) DPADD_UB2(v8u16, __VA_ARGS__)

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1, cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from mult0 are multiplied with
                 signed halfword elements from cnst0, producing results
                 twice the size of the input, i.e. signed words.
                 The products of adjacent odd-even element pairs are then
                 accumulated into the out vector
                 (two vectors of signed word results)
*/
#define DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1)   \
{                                                                  \
    out0 = (RTYPE) __msa_dpadd_s_w((v4i32) out0,                   \
                                   (v8i16) mult0, (v8i16) cnst0);  \
    out1 = (RTYPE) __msa_dpadd_s_w((v4i32) out1,                   \
                                   (v8i16) mult1, (v8i16) cnst1);  \
}
#define DPADD_SH2_SW(...) DPADD_SH2(v4i32, __VA_ARGS__)

#define DPADD_SH4(RTYPE, mult0, mult1, mult2, mult3,                   \
                  cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3)  \
{                                                                      \
    DPADD_SH2(RTYPE, mult0, mult1, cnst0, cnst1, out0, out1);          \
    DPADD_SH2(RTYPE, mult2, mult3, cnst2, cnst3, out2, out3);          \
}
#define DPADD_SH4_SW(...) DPADD_SH4(v4i32, __VA_ARGS__)

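/* Usage sketch (illustrative, not part of the original header): accumulate
 * two dot products into running sums, e.g. successive tap pairs of a 4-tap
 * filter. The accumulators must be initialized before the first use.
 *
 *     v8i16 sum0 = __msa_ldi_h(0), sum1 = __msa_ldi_h(0);
 *     DPADD_SB2_SH(vec0, vec1, filt0, filt1, sum0, sum1);
 *     DPADD_SB2_SH(vec2, vec3, filt2, filt3, sum0, sum1);
 */
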
/* Description : Minimum values between unsigned elements of
                 either vector are copied to the output vector
   Arguments   : Inputs  - in0, in1, min_vec
                 Outputs - in0, in1 (in place)
                 Return Type - as per RTYPE
   Details     : Minimum of the unsigned halfword element values from 'in0'
                 and 'min_vec' is written to the output vector 'in0'
*/
#define MIN_UH2(RTYPE, in0, in1, min_vec)             \
{                                                     \
    in0 = (RTYPE) __msa_min_u_h((v8u16) in0, min_vec);  \
    in1 = (RTYPE) __msa_min_u_h((v8u16) in1, min_vec);  \
}
#define MIN_UH2_UH(...) MIN_UH2(v8u16, __VA_ARGS__)

#define MIN_UH4(RTYPE, in0, in1, in2, in3, min_vec)  \
{                                                    \
    MIN_UH2(RTYPE, in0, in1, min_vec);               \
    MIN_UH2(RTYPE, in2, in3, min_vec);               \
}
#define MIN_UH4_UH(...) MIN_UH4(v8u16, __VA_ARGS__)

/* Description : Clips all halfword elements of input vector between min & max
                 out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
   Arguments   : Inputs  - in  (input vector)
                         - min (min threshold)
                         - max (max threshold)
                 Outputs - in  (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH(in, min, max)                     \
{                                                 \
    in = __msa_max_s_h((v8i16) min, (v8i16) in);  \
    in = __msa_min_s_h((v8i16) max, (v8i16) in);  \
}

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255(in)                        \
{                                                \
    in = __msa_maxi_s_h((v8i16) in, 0);          \
    in = (v8i16) __msa_sat_u_h((v8u16) in, 7);   \
}

#define CLIP_SH2_0_255(in0, in1)  \
{                                 \
    CLIP_SH_0_255(in0);           \
    CLIP_SH_0_255(in1);           \
}

#define CLIP_SH4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SH2_0_255(in0, in1);               \
    CLIP_SH2_0_255(in2, in3);               \
}

#define CLIP_SH8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SH4_0_255(in0, in1, in2, in3);     \
    CLIP_SH4_0_255(in4, in5, in6, in7);     \
}

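/* Usage sketch (illustrative, not part of the original header): clamp two
 * vectors of filter results to the valid 8-bit pixel range before packing.
 *
 *     CLIP_SH2_0_255(res0, res1);
 *     // every signed halfword lane of res0/res1 now lies in [0, 255]
 */
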
/* Description : Clips all signed word elements of input vector
                 between 0 & 255
   Arguments   : Inputs  - in (input vector)
                 Outputs - in (output vector with clipped elements)
                 Return Type - signed word
*/
#define CLIP_SW_0_255(in)                       \
{                                               \
    in = __msa_maxi_s_w((v4i32) in, 0);         \
    in = (v4i32) __msa_sat_u_w((v4u32) in, 7);  \
}

#define CLIP_SW2_0_255(in0, in1)  \
{                                 \
    CLIP_SW_0_255(in0);           \
    CLIP_SW_0_255(in1);           \
}

#define CLIP_SW4_0_255(in0, in1, in2, in3)  \
{                                           \
    CLIP_SW2_0_255(in0, in1);               \
    CLIP_SW2_0_255(in2, in3);               \
}

#define CLIP_SW8_0_255(in0, in1, in2, in3,  \
                       in4, in5, in6, in7)  \
{                                           \
    CLIP_SW4_0_255(in0, in1, in2, in3);     \
    CLIP_SW4_0_255(in4, in5, in6, in7);     \
}

/* Description : Addition of 4 signed word elements
                 The 4 signed word elements of the input vector are added
                 together and the resulting integer sum is returned
   Arguments   : Inputs  - in (signed word vector)
                 Outputs - sum_m (i32 sum)
                 Return Type - signed word
*/
#define HADD_SW_S32(in)                               \
( {                                                   \
    v2i64 res0_m, res1_m;                             \
    int32_t sum_m;                                    \
                                                      \
    res0_m = __msa_hadd_s_d((v4i32) in, (v4i32) in);  \
    res1_m = __msa_splati_d(res0_m, 1);               \
    res0_m += res1_m;                                 \
    sum_m = __msa_copy_s_w((v4i32) res0_m, 0);        \
    sum_m;                                            \
} )

/* Description : Addition of 8 unsigned halfword elements
                 The 8 unsigned halfword elements of the input vector are
                 added together and the resulting integer sum is returned
   Arguments   : Inputs  - in (unsigned halfword vector)
                 Outputs - sum_m (u32 sum)
                 Return Type - unsigned word
*/
#define HADD_UH_U32(in)                                  \
( {                                                      \
    v4u32 res_m;                                         \
    v2u64 res0_m, res1_m;                                \
    uint32_t sum_m;                                      \
                                                         \
    res_m = __msa_hadd_u_w((v8u16) in, (v8u16) in);      \
    res0_m = __msa_hadd_u_d(res_m, res_m);               \
    res1_m = (v2u64) __msa_splati_d((v2i64) res0_m, 1);  \
    res0_m += res1_m;                                    \
    sum_m = __msa_copy_u_w((v4i32) res0_m, 0);           \
    sum_m;                                               \
} )

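/* Usage sketch (illustrative, not part of the original header): reduce a
 * v4i32 accumulator to a single scalar sum.
 *
 *     int32_t sum = HADD_SW_S32(acc);  // acc[0] + acc[1] + acc[2] + acc[3]
 */
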
/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd signed byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
*/
#define HADD_SB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_s_h((v16i8) in0, (v16i8) in0);  \
    out1 = (RTYPE) __msa_hadd_s_h((v16i8) in1, (v16i8) in1);  \
}
#define HADD_SB2_SH(...) HADD_SB2(v8i16, __VA_ARGS__)

#define HADD_SB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_SB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_SB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_SB4_UH(...) HADD_SB4(v8u16, __VA_ARGS__)
#define HADD_SB4_SH(...) HADD_SB4(v8i16, __VA_ARGS__)

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
*/
#define HADD_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hadd_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hadd_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HADD_UB2_UH(...) HADD_UB2(v8u16, __VA_ARGS__)

#define HADD_UB3(RTYPE, in0, in1, in2, out0, out1, out2)      \
{                                                             \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                    \
    out2 = (RTYPE) __msa_hadd_u_h((v16u8) in2, (v16u8) in2);  \
}
#define HADD_UB3_UH(...) HADD_UB3(v8u16, __VA_ARGS__)

#define HADD_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HADD_UB2(RTYPE, in0, in1, out0, out1);                           \
    HADD_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HADD_UB4_UB(...) HADD_UB4(v16u8, __VA_ARGS__)
#define HADD_UB4_UH(...) HADD_UB4(v8u16, __VA_ARGS__)
#define HADD_UB4_SH(...) HADD_UB4(v8i16, __VA_ARGS__)

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each odd unsigned byte element from 'in0' is subtracted from
                 the adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is written to 'out0'
*/
#define HSUB_UB2(RTYPE, in0, in1, out0, out1)                 \
{                                                             \
    out0 = (RTYPE) __msa_hsub_u_h((v16u8) in0, (v16u8) in0);  \
    out1 = (RTYPE) __msa_hsub_u_h((v16u8) in1, (v16u8) in1);  \
}
#define HSUB_UB2_UH(...) HSUB_UB2(v8u16, __VA_ARGS__)
#define HSUB_UB2_SH(...) HSUB_UB2(v8i16, __VA_ARGS__)

#define HSUB_UB4(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                    \
    HSUB_UB2(RTYPE, in0, in1, out0, out1);                           \
    HSUB_UB2(RTYPE, in2, in3, out2, out3);                           \
}
#define HSUB_UB4_UH(...) HSUB_UB4(v8u16, __VA_ARGS__)
#define HSUB_UB4_SH(...) HSUB_UB4(v8i16, __VA_ARGS__)

/* Description : SAD (Sum of Absolute Differences)
   Arguments   : Inputs  - in0, in1, ref0, ref1 (unsigned byte src & ref)
                 Outputs - sad_m (halfword vector with sad)
                 Return Type - unsigned halfword
   Details     : The absolute difference of all the byte elements from 'in0'
                 with 'ref0' is calculated and preserved in 'diff0'. From the
                 16 unsigned absolute diff values, even-odd pairs are added
                 together to generate 8 halfword results.
*/
#if HAVE_MSA2
#define SAD_UB2_UH(in0, in1, ref0, ref1)                                  \
( {                                                                       \
    v8u16 sad_m = { 0 };                                                  \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in0, (v16u8) ref0);  \
    sad_m += __builtin_msa2_sad_adj2_u_w2x_b((v16u8) in1, (v16u8) ref1);  \
    sad_m;                                                                \
} )
#else
#define SAD_UB2_UH(in0, in1, ref0, ref1)                        \
( {                                                             \
    v16u8 diff0_m, diff1_m;                                     \
    v8u16 sad_m = { 0 };                                        \
                                                                \
    diff0_m = __msa_asub_u_b((v16u8) in0, (v16u8) ref0);        \
    diff1_m = __msa_asub_u_b((v16u8) in1, (v16u8) ref1);        \
                                                                \
    sad_m += __msa_hadd_u_h((v16u8) diff0_m, (v16u8) diff0_m);  \
    sad_m += __msa_hadd_u_h((v16u8) diff1_m, (v16u8) diff1_m);  \
                                                                \
    sad_m;                                                      \
} )
#endif  // #if HAVE_MSA2

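/* Usage sketch (illustrative, not part of the original header): SAD of two
 * 16-byte rows against a reference, reduced to a scalar with the
 * HADD_UH_U32 macro defined above.
 *
 *     v8u16 sad_vec = SAD_UB2_UH(src0, src1, ref0, ref1);
 *     uint32_t sad  = HADD_UH_U32(sad_vec);
 */
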
/* Description : Insert specified word elements into one destination vector
   Arguments   : Inputs  - in0, in1, in2, in3 (4 input words)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_W2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
}
#define INSERT_W2_UB(...) INSERT_W2(v16u8, __VA_ARGS__)
#define INSERT_W2_SB(...) INSERT_W2(v16i8, __VA_ARGS__)

#define INSERT_W4(RTYPE, in0, in1, in2, in3, out)       \
{                                                       \
    out = (RTYPE) __msa_insert_w((v4i32) out, 0, in0);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 1, in1);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 2, in2);  \
    out = (RTYPE) __msa_insert_w((v4i32) out, 3, in3);  \
}
#define INSERT_W4_UB(...) INSERT_W4(v16u8, __VA_ARGS__)
#define INSERT_W4_SB(...) INSERT_W4(v16i8, __VA_ARGS__)
#define INSERT_W4_SH(...) INSERT_W4(v8i16, __VA_ARGS__)
#define INSERT_W4_SW(...) INSERT_W4(v4i32, __VA_ARGS__)

/* Description : Insert specified double word elements into one destination
                 vector
   Arguments   : Inputs  - in0, in1 (2 input double words)
                 Outputs - out (output vector)
                 Return Type - as per RTYPE
*/
#define INSERT_D2(RTYPE, in0, in1, out)                 \
{                                                       \
    out = (RTYPE) __msa_insert_d((v2i64) out, 0, in0);  \
    out = (RTYPE) __msa_insert_d((v2i64) out, 1, in1);  \
}
#define INSERT_D2_UB(...) INSERT_D2(v16u8, __VA_ARGS__)
#define INSERT_D2_SB(...) INSERT_D2(v16i8, __VA_ARGS__)
#define INSERT_D2_SH(...) INSERT_D2(v8i16, __VA_ARGS__)
#define INSERT_D2_SD(...) INSERT_D2(v2i64, __VA_ARGS__)

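/* Usage sketch (illustrative, not part of the original header): the common
 * pattern of gathering four 4-byte rows into one vector register, using the
 * LW4 macro defined earlier in this header.
 *
 *     uint32_t w0, w1, w2, w3;
 *     v16u8 px = { 0 };
 *     LW4(src, stride, w0, w1, w2, w3);
 *     INSERT_W4_UB(w0, w1, w2, w3, px);  // px now holds the 4x4 block
 */
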
/* Description : Interleave even byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' and even byte
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even byte elements of 'in2' and even byte
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_B2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_b((v16i8) in1, (v16i8) in0);  \
    out1 = (RTYPE) __msa_ilvev_b((v16i8) in3, (v16i8) in2);  \
}
#define ILVEV_B2_UB(...) ILVEV_B2(v16u8, __VA_ARGS__)
#define ILVEV_B2_SB(...) ILVEV_B2(v16i8, __VA_ARGS__)
#define ILVEV_B2_SH(...) ILVEV_B2(v8i16, __VA_ARGS__)
#define ILVEV_B2_SD(...) ILVEV_B2(v2i64, __VA_ARGS__)

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and even halfword
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even halfword elements of 'in2' and even halfword
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_h((v8i16) in1, (v8i16) in0);  \
    out1 = (RTYPE) __msa_ilvev_h((v8i16) in3, (v8i16) in2);  \
}
#define ILVEV_H2_UB(...) ILVEV_H2(v16u8, __VA_ARGS__)
#define ILVEV_H2_SH(...) ILVEV_H2(v8i16, __VA_ARGS__)
#define ILVEV_H2_SW(...) ILVEV_H2(v4i32, __VA_ARGS__)

/* Description : Interleave even word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even word elements of 'in0' and even word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even word elements of 'in2' and even word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_W2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_w((v4i32) in1, (v4i32) in0);  \
    out1 = (RTYPE) __msa_ilvev_w((v4i32) in3, (v4i32) in2);  \
}
#define ILVEV_W2_UB(...) ILVEV_W2(v16u8, __VA_ARGS__)
#define ILVEV_W2_SB(...) ILVEV_W2(v16i8, __VA_ARGS__)
#define ILVEV_W2_UH(...) ILVEV_W2(v8u16, __VA_ARGS__)
#define ILVEV_W2_SD(...) ILVEV_W2(v2i64, __VA_ARGS__)

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and even double word
                 elements of 'in1' are interleaved and copied to 'out0'
                 Even double word elements of 'in2' and even double word
                 elements of 'in3' are interleaved and copied to 'out1'
*/
#define ILVEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                          \
    out0 = (RTYPE) __msa_ilvev_d((v2i64) in1, (v2i64) in0);  \
    out1 = (RTYPE) __msa_ilvev_d((v2i64) in3, (v2i64) in2);  \
}
#define ILVEV_D2_UB(...) ILVEV_D2(v16u8, __VA_ARGS__)
#define ILVEV_D2_SB(...) ILVEV_D2(v16i8, __VA_ARGS__)
#define ILVEV_D2_SW(...) ILVEV_D2(v4i32, __VA_ARGS__)

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of in0 and left half of byte
                 elements of in1 are interleaved and copied to out0.
                 Left half of byte elements of in2 and left half of byte
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvl_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVL_B2_UB(...) ILVL_B2(v16u8, __VA_ARGS__)
#define ILVL_B2_SB(...) ILVL_B2(v16i8, __VA_ARGS__)
#define ILVL_B2_UH(...) ILVL_B2(v8u16, __VA_ARGS__)
#define ILVL_B2_SH(...) ILVL_B2(v8i16, __VA_ARGS__)

#define ILVL_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_B4_UB(...) ILVL_B4(v16u8, __VA_ARGS__)
#define ILVL_B4_SB(...) ILVL_B4(v16i8, __VA_ARGS__)
#define ILVL_B4_UH(...) ILVL_B4(v8u16, __VA_ARGS__)
#define ILVL_B4_SH(...) ILVL_B4(v8i16, __VA_ARGS__)

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of in0 and left half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Left half of halfword elements of in2 and left half of
                 halfword elements of in3 are interleaved and copied to out1.
*/
#define ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvl_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVL_H2_SH(...) ILVL_H2(v8i16, __VA_ARGS__)
#define ILVL_H2_SW(...) ILVL_H2(v4i32, __VA_ARGS__)

#define ILVL_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVL_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVL_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVL_H4_SH(...) ILVL_H4(v8i16, __VA_ARGS__)
#define ILVL_H4_SW(...) ILVL_H4(v4i32, __VA_ARGS__)

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of in0 and left half of word
                 elements of in1 are interleaved and copied to out0.
                 Left half of word elements of in2 and left half of word
                 elements of in3 are interleaved and copied to out1.
*/
#define ILVL_W2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVL_W2_UB(...) ILVL_W2(v16u8, __VA_ARGS__)
#define ILVL_W2_SB(...) ILVL_W2(v16i8, __VA_ARGS__)
#define ILVL_W2_SH(...) ILVL_W2(v8i16, __VA_ARGS__)

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of in0 and right half of byte
                 elements of in1 are interleaved and copied to out0.
                 Right half of byte elements of in2 and right half of byte
                 elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1);  \
    out1 = (RTYPE) __msa_ilvr_b((v16i8) in2, (v16i8) in3);  \
}
#define ILVR_B2_UB(...) ILVR_B2(v16u8, __VA_ARGS__)
#define ILVR_B2_SB(...) ILVR_B2(v16i8, __VA_ARGS__)
#define ILVR_B2_UH(...) ILVR_B2(v8u16, __VA_ARGS__)
#define ILVR_B2_SH(...) ILVR_B2(v8i16, __VA_ARGS__)
#define ILVR_B2_SW(...) ILVR_B2(v4i32, __VA_ARGS__)

#define ILVR_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_b((v16i8) in4, (v16i8) in5);              \
}
#define ILVR_B3_UB(...) ILVR_B3(v16u8, __VA_ARGS__)
#define ILVR_B3_SB(...) ILVR_B3(v16i8, __VA_ARGS__)
#define ILVR_B3_UH(...) ILVR_B3(v8u16, __VA_ARGS__)
#define ILVR_B3_SH(...) ILVR_B3(v8i16, __VA_ARGS__)

#define ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_B2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_B2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_B4_UB(...) ILVR_B4(v16u8, __VA_ARGS__)
#define ILVR_B4_SB(...) ILVR_B4(v16i8, __VA_ARGS__)
#define ILVR_B4_UH(...) ILVR_B4(v8u16, __VA_ARGS__)
#define ILVR_B4_SH(...) ILVR_B4(v8i16, __VA_ARGS__)
#define ILVR_B4_SW(...) ILVR_B4(v4i32, __VA_ARGS__)

#define ILVR_B8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                in8, in9, in10, in11, in12, in13, in14, in15,     \
                out0, out1, out2, out3, out4, out5, out6, out7)   \
{                                                                 \
    ILVR_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,        \
            out0, out1, out2, out3);                              \
    ILVR_B4(RTYPE, in8, in9, in10, in11, in12, in13, in14, in15,  \
            out4, out5, out6, out7);                              \
}
#define ILVR_B8_UH(...) ILVR_B8(v8u16, __VA_ARGS__)

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of in0 and right half of
                 halfword elements of in1 are interleaved and copied to out0.
                 Right half of halfword elements of in2 and right half of
                 halfword elements of in3 are interleaved and copied to out1.
                 Similar for other pairs
*/
#define ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1);  \
    out1 = (RTYPE) __msa_ilvr_h((v8i16) in2, (v8i16) in3);  \
}
#define ILVR_H2_SH(...) ILVR_H2(v8i16, __VA_ARGS__)
#define ILVR_H2_SW(...) ILVR_H2(v4i32, __VA_ARGS__)

#define ILVR_H3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_h((v8i16) in4, (v8i16) in5);              \
}
#define ILVR_H3_SH(...) ILVR_H3(v8i16, __VA_ARGS__)

#define ILVR_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_H2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_H2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_H4_SH(...) ILVR_H4(v8i16, __VA_ARGS__)
#define ILVR_H4_SW(...) ILVR_H4(v4i32, __VA_ARGS__)

#define ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1);  \
    out1 = (RTYPE) __msa_ilvr_w((v4i32) in2, (v4i32) in3);  \
}
#define ILVR_W2_UB(...) ILVR_W2(v16u8, __VA_ARGS__)
#define ILVR_W2_SB(...) ILVR_W2(v16i8, __VA_ARGS__)
#define ILVR_W2_SH(...) ILVR_W2(v8i16, __VA_ARGS__)

#define ILVR_W4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_W2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_W2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_W4_SB(...) ILVR_W4(v16i8, __VA_ARGS__)
#define ILVR_W4_UB(...) ILVR_W4(v16u8, __VA_ARGS__)

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of in0 and right half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Right half of double word elements of in2 and right half of
                 double word elements of in3 are interleaved and copied to
                 out1.
*/
#define ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvr_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVR_D2_UB(...) ILVR_D2(v16u8, __VA_ARGS__)
#define ILVR_D2_SB(...) ILVR_D2(v16i8, __VA_ARGS__)
#define ILVR_D2_SH(...) ILVR_D2(v8i16, __VA_ARGS__)

#define ILVR_D3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2)  \
{                                                                       \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);                     \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) in4, (v2i64) in5);              \
}
#define ILVR_D3_SB(...) ILVR_D3(v16i8, __VA_ARGS__)

#define ILVR_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,  \
                out0, out1, out2, out3)                         \
{                                                               \
    ILVR_D2(RTYPE, in0, in1, in2, in3, out0, out1);             \
    ILVR_D2(RTYPE, in4, in5, in6, in7, out2, out3);             \
}
#define ILVR_D4_SB(...) ILVR_D4(v16i8, __VA_ARGS__)
#define ILVR_D4_UB(...) ILVR_D4(v16u8, __VA_ARGS__)

/* Description : Interleave left half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of double word elements of in0 and left half of
                 double word elements of in1 are interleaved and copied to
                 out0.
                 Left half of double word elements of in2 and left half of
                 double word elements of in3 are interleaved and copied to
                 out1.
*/
#define ILVL_D2(RTYPE, in0, in1, in2, in3, out0, out1)    \
{                                                         \
    out0 = (RTYPE) __msa_ilvl_d((v2i64) in0, (v2i64) in1);  \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) in2, (v2i64) in3);  \
}
#define ILVL_D2_UB(...) ILVL_D2(v16u8, __VA_ARGS__)
#define ILVL_D2_SB(...) ILVL_D2(v16i8, __VA_ARGS__)
#define ILVL_D2_SH(...) ILVL_D2(v8i16, __VA_ARGS__)

1490 /* Description : Interleave both left and right half of input vectors
1491  Arguments : Inputs - in0, in1
1492  Outputs - out0, out1
1493  Return Type - as per RTYPE
1494  Details : Right half of byte elements from 'in0' and 'in1' are
1495  interleaved and stored to 'out0'
1496  Left half of byte elements from 'in0' and 'in1' are
1497  interleaved and stored to 'out1'
1498 */
1499 #define ILVRL_B2(RTYPE, in0, in1, out0, out1) \
1500 { \
1501  out0 = (RTYPE) __msa_ilvr_b((v16i8) in0, (v16i8) in1); \
1502  out1 = (RTYPE) __msa_ilvl_b((v16i8) in0, (v16i8) in1); \
1503 }
1504 #define ILVRL_B2_UB(...) ILVRL_B2(v16u8, __VA_ARGS__)
1505 #define ILVRL_B2_SB(...) ILVRL_B2(v16i8, __VA_ARGS__)
1506 #define ILVRL_B2_UH(...) ILVRL_B2(v8u16, __VA_ARGS__)
1507 #define ILVRL_B2_SH(...) ILVRL_B2(v8i16, __VA_ARGS__)
1508 #define ILVRL_B2_SW(...) ILVRL_B2(v4i32, __VA_ARGS__)
1509 
1510 #define ILVRL_H2(RTYPE, in0, in1, out0, out1) \
1511 { \
1512  out0 = (RTYPE) __msa_ilvr_h((v8i16) in0, (v8i16) in1); \
1513  out1 = (RTYPE) __msa_ilvl_h((v8i16) in0, (v8i16) in1); \
1514 }
1515 #define ILVRL_H2_UB(...) ILVRL_H2(v16u8, __VA_ARGS__)
1516 #define ILVRL_H2_SB(...) ILVRL_H2(v16i8, __VA_ARGS__)
1517 #define ILVRL_H2_SH(...) ILVRL_H2(v8i16, __VA_ARGS__)
1518 #define ILVRL_H2_SW(...) ILVRL_H2(v4i32, __VA_ARGS__)
1519 
1520 #define ILVRL_W2(RTYPE, in0, in1, out0, out1) \
1521 { \
1522  out0 = (RTYPE) __msa_ilvr_w((v4i32) in0, (v4i32) in1); \
1523  out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
1524 }
1525 #define ILVRL_W2_UB(...) ILVRL_W2(v16u8, __VA_ARGS__)
1526 #define ILVRL_W2_SH(...) ILVRL_W2(v8i16, __VA_ARGS__)
1527 #define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
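
/* Editorial usage sketch (not part of the original header): interleaving
   against a zero vector is how this file widens unsigned bytes to
   halfwords (see UNPCK_UB_SH further below). 'pix' is a hypothetical
   v16u8 pixel vector:

       v16i8 zero_v = { 0 };
       v8i16 pix_r, pix_l;

       ILVRL_B2_SH(zero_v, pix, pix_r, pix_l);
       // pix_r/pix_l = the 8 low/high bytes of 'pix', zero-extended
*/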
1528 
1529 /* Description : Maximum of signed vector elements and a 5-bit signed
1530  immediate value is copied to the output vector
1531  Arguments : Inputs - in0, in1, in2, in3, max_val
1532  Outputs - in0, in1, in2, in3 (in place)
1533  Return Type - as per RTYPE
1534  Details : The maximum of each signed halfword element of 'in0' and
1535  'max_val' is written in place to 'in0'
1536 */
1537 #define MAXI_SH2(RTYPE, in0, in1, max_val) \
1538 { \
1539  in0 = (RTYPE) __msa_maxi_s_h((v8i16) in0, max_val); \
1540  in1 = (RTYPE) __msa_maxi_s_h((v8i16) in1, max_val); \
1541 }
1542 #define MAXI_SH2_UH(...) MAXI_SH2(v8u16, __VA_ARGS__)
1543 #define MAXI_SH2_SH(...) MAXI_SH2(v8i16, __VA_ARGS__)
1544 
1545 #define MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val) \
1546 { \
1547  MAXI_SH2(RTYPE, in0, in1, max_val); \
1548  MAXI_SH2(RTYPE, in2, in3, max_val); \
1549 }
1550 #define MAXI_SH4_UH(...) MAXI_SH4(v8u16, __VA_ARGS__)
1551 #define MAXI_SH4_SH(...) MAXI_SH4(v8i16, __VA_ARGS__)
1552 
1553 #define MAXI_SH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, max_val) \
1554 { \
1555  MAXI_SH4(RTYPE, in0, in1, in2, in3, max_val); \
1556  MAXI_SH4(RTYPE, in4, in5, in6, in7, max_val); \
1557 }
1558 #define MAXI_SH8_UH(...) MAXI_SH8(v8u16, __VA_ARGS__)
1559 #define MAXI_SH8_SH(...) MAXI_SH8(v8i16, __VA_ARGS__)
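
/* Editorial usage sketch (not part of the original header): a common use of
   MAXI_SH* is clamping signed filter results at zero before they are
   narrowed back to unsigned pixels. 'res0'/'res1' are hypothetical v8i16
   result vectors:

       MAXI_SH2_SH(res0, res1, 0);   // res = max(res, 0), element-wise
*/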
1560 
1561 /* Description : Saturate the halfword element values to the max
1562  unsigned value representable in (sat_val+1) bits
1563  The element data width remains unchanged
1564  Arguments : Inputs - in0, in1, in2, in3, sat_val
1565  Outputs - in0, in1, in2, in3 (in place)
1566  Return Type - as per RTYPE
1567  Details : Each unsigned halfword element from 'in0' is saturated to the
1568  range representable in (sat_val+1) bits
1569  Results are written in place to the original vectors
1570 */
1571 #define SAT_UH2(RTYPE, in0, in1, sat_val) \
1572 { \
1573  in0 = (RTYPE) __msa_sat_u_h((v8u16) in0, sat_val); \
1574  in1 = (RTYPE) __msa_sat_u_h((v8u16) in1, sat_val); \
1575 }
1576 #define SAT_UH2_UH(...) SAT_UH2(v8u16, __VA_ARGS__)
1577 #define SAT_UH2_SH(...) SAT_UH2(v8i16, __VA_ARGS__)
1578 
1579 #define SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val) \
1580 { \
1581  SAT_UH2(RTYPE, in0, in1, sat_val); \
1582  SAT_UH2(RTYPE, in2, in3, sat_val); \
1583 }
1584 #define SAT_UH4_UH(...) SAT_UH4(v8u16, __VA_ARGS__)
1585 #define SAT_UH4_SH(...) SAT_UH4(v8i16, __VA_ARGS__)
1586 
1587 #define SAT_UH8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, sat_val) \
1588 { \
1589  SAT_UH4(RTYPE, in0, in1, in2, in3, sat_val); \
1590  SAT_UH4(RTYPE, in4, in5, in6, in7, sat_val); \
1591 }
1592 #define SAT_UH8_UH(...) SAT_UH8(v8u16, __VA_ARGS__)
1593 #define SAT_UH8_SH(...) SAT_UH8(v8i16, __VA_ARGS__)
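
/* Editorial usage sketch (not part of the original header): together with
   MAXI_SH* above, SAT_UH* forms an unsigned clip: clamp at zero, then
   saturate to (sat_val+1) bits. 'res0'/'res1' are hypothetical v8i16
   filter results:

       MAXI_SH2_SH(res0, res1, 0);   // lower bound 0
       SAT_UH2_SH(res0, res1, 7);    // upper bound 255, i.e. 8-bit range
*/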
1594 
1595 /* Description : Saturate the halfword element values to the max
1596  signed value representable in (sat_val+1) bits
1597  The element data width remains unchanged
1598  Arguments : Inputs - in0, in1, in2, in3, sat_val
1599  Outputs - in0, in1, in2, in3 (in place)
1600  Return Type - as per RTYPE
1601  Details : Each signed halfword element from 'in0' is saturated to the
1602  range representable in (sat_val+1) bits
1603  Results are written in place to the original vectors
1604 */
1605 #define SAT_SH2(RTYPE, in0, in1, sat_val) \
1606 { \
1607  in0 = (RTYPE) __msa_sat_s_h((v8i16) in0, sat_val); \
1608  in1 = (RTYPE) __msa_sat_s_h((v8i16) in1, sat_val); \
1609 }
1610 #define SAT_SH2_SH(...) SAT_SH2(v8i16, __VA_ARGS__)
1611 
1612 #define SAT_SH3(RTYPE, in0, in1, in2, sat_val) \
1613 { \
1614  SAT_SH2(RTYPE, in0, in1, sat_val); \
1615  in2 = (RTYPE) __msa_sat_s_h((v8i16) in2, sat_val); \
1616 }
1617 #define SAT_SH3_SH(...) SAT_SH3(v8i16, __VA_ARGS__)
1618 
1619 #define SAT_SH4(RTYPE, in0, in1, in2, in3, sat_val) \
1620 { \
1621  SAT_SH2(RTYPE, in0, in1, sat_val); \
1622  SAT_SH2(RTYPE, in2, in3, sat_val); \
1623 }
1624 #define SAT_SH4_SH(...) SAT_SH4(v8i16, __VA_ARGS__)
1625 
1626 /* Description : Saturate the word element values to the max
1627  signed value representable in (sat_val+1) bits
1628  The element data width remains unchanged
1629  Arguments : Inputs - in0, in1, in2, in3, sat_val
1630  Outputs - in0, in1, in2, in3 (in place)
1631  Return Type - as per RTYPE
1632  Details : Each signed word element from 'in0' is saturated to the
1633  range representable in (sat_val+1) bits
1634  Results are written in place to the original vectors
1635 */
1636 #define SAT_SW2(RTYPE, in0, in1, sat_val) \
1637 { \
1638  in0 = (RTYPE) __msa_sat_s_w((v4i32) in0, sat_val); \
1639  in1 = (RTYPE) __msa_sat_s_w((v4i32) in1, sat_val); \
1640 }
1641 #define SAT_SW2_SW(...) SAT_SW2(v4i32, __VA_ARGS__)
1642 
1643 #define SAT_SW4(RTYPE, in0, in1, in2, in3, sat_val) \
1644 { \
1645  SAT_SW2(RTYPE, in0, in1, sat_val); \
1646  SAT_SW2(RTYPE, in2, in3, sat_val); \
1647 }
1648 #define SAT_SW4_SW(...) SAT_SW4(v4i32, __VA_ARGS__)
1649 
1650 /* Description : Indexed halfword element values are replicated to all
1651  elements in output vector
1652  Arguments : Inputs - in, idx0, idx1
1653  Outputs - out0, out1
1654  Return Type - as per RTYPE
1655  Details : 'idx0' element value from 'in' vector is replicated to all
1656  elements in 'out0' vector
1657  Valid index range for halfword operation is 0-7
1658 */
1659 #define SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1) \
1660 { \
1661  out0 = (RTYPE) __msa_splati_h((v8i16) in, idx0); \
1662  out1 = (RTYPE) __msa_splati_h((v8i16) in, idx1); \
1663 }
1664 #define SPLATI_H2_SB(...) SPLATI_H2(v16i8, __VA_ARGS__)
1665 #define SPLATI_H2_SH(...) SPLATI_H2(v8i16, __VA_ARGS__)
1666 
1667 #define SPLATI_H3(RTYPE, in, idx0, idx1, idx2, \
1668  out0, out1, out2) \
1669 { \
1670  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1671  out2 = (RTYPE) __msa_splati_h((v8i16) in, idx2); \
1672 }
1673 #define SPLATI_H3_SB(...) SPLATI_H3(v16i8, __VA_ARGS__)
1674 #define SPLATI_H3_SH(...) SPLATI_H3(v8i16, __VA_ARGS__)
1675 
1676 #define SPLATI_H4(RTYPE, in, idx0, idx1, idx2, idx3, \
1677  out0, out1, out2, out3) \
1678 { \
1679  SPLATI_H2(RTYPE, in, idx0, idx1, out0, out1); \
1680  SPLATI_H2(RTYPE, in, idx2, idx3, out2, out3); \
1681 }
1682 #define SPLATI_H4_SB(...) SPLATI_H4(v16i8, __VA_ARGS__)
1683 #define SPLATI_H4_SH(...) SPLATI_H4(v8i16, __VA_ARGS__)
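
/* Editorial usage sketch (not part of the original header): filter
   coefficients are typically loaded once and then broadcast per tap.
   'filt_ptr' is a hypothetical pointer to 4 halfword coefficients:

       v8i16 filt, filt0, filt1, filt2, filt3;

       filt = LD_SH(filt_ptr);
       SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
       // filtN now holds coefficient N in all 8 lanes
*/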
1684 
1685 /* Description : Indexed word element values are replicated to all
1686  elements in output vector
1687  Arguments : Inputs - in, stidx
1688  Outputs - out0, out1
1689  Return Type - as per RTYPE
1690  Details : 'stidx' element value from 'in' vector is replicated to all
1691  elements in 'out0' vector
1692  'stidx + 1' element value from 'in' vector is replicated to all
1693  elements in 'out1' vector
1694  Valid index range for word operation is 0-3
1695 */
1696 #define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
1697 { \
1698  out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
1699  out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
1700 }
1701 #define SPLATI_W2_SH(...) SPLATI_W2(v8i16, __VA_ARGS__)
1702 #define SPLATI_W2_SW(...) SPLATI_W2(v4i32, __VA_ARGS__)
1703 
1704 #define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
1705 { \
1706  SPLATI_W2(RTYPE, in, 0, out0, out1); \
1707  SPLATI_W2(RTYPE, in, 2, out2, out3); \
1708 }
1709 #define SPLATI_W4_SH(...) SPLATI_W4(v8i16, __VA_ARGS__)
1710 #define SPLATI_W4_SW(...) SPLATI_W4(v4i32, __VA_ARGS__)
1711 
1712 /* Description : Pack even byte elements of vector pairs
1713  Arguments : Inputs - in0, in1, in2, in3
1714  Outputs - out0, out1
1715  Return Type - as per RTYPE
1716  Details : Even byte elements of in0 are copied to the left half of
1717  out0 & even byte elements of in1 are copied to the right
1718  half of out0.
1719  Even byte elements of in2 are copied to the left half of
1720  out1 & even byte elements of in3 are copied to the right
1721  half of out1.
1722 */
1723 #define PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1) \
1724 { \
1725  out0 = (RTYPE) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
1726  out1 = (RTYPE) __msa_pckev_b((v16i8) in2, (v16i8) in3); \
1727 }
1728 #define PCKEV_B2_SB(...) PCKEV_B2(v16i8, __VA_ARGS__)
1729 #define PCKEV_B2_UB(...) PCKEV_B2(v16u8, __VA_ARGS__)
1730 #define PCKEV_B2_SH(...) PCKEV_B2(v8i16, __VA_ARGS__)
1731 #define PCKEV_B2_SW(...) PCKEV_B2(v4i32, __VA_ARGS__)
1732 
1733 #define PCKEV_B3(RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2) \
1734 { \
1735  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1736  out2 = (RTYPE) __msa_pckev_b((v16i8) in4, (v16i8) in5); \
1737 }
1738 #define PCKEV_B3_UB(...) PCKEV_B3(v16u8, __VA_ARGS__)
1739 #define PCKEV_B3_SB(...) PCKEV_B3(v16i8, __VA_ARGS__)
1740 
1741 #define PCKEV_B4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1742  out0, out1, out2, out3) \
1743 { \
1744  PCKEV_B2(RTYPE, in0, in1, in2, in3, out0, out1); \
1745  PCKEV_B2(RTYPE, in4, in5, in6, in7, out2, out3); \
1746 }
1747 #define PCKEV_B4_SB(...) PCKEV_B4(v16i8, __VA_ARGS__)
1748 #define PCKEV_B4_UB(...) PCKEV_B4(v16u8, __VA_ARGS__)
1749 #define PCKEV_B4_SH(...) PCKEV_B4(v8i16, __VA_ARGS__)
1750 #define PCKEV_B4_SW(...) PCKEV_B4(v4i32, __VA_ARGS__)
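
/* Editorial usage sketch (not part of the original header): PCKEV_B* is the
   usual way to narrow halfword results back to bytes, since the even byte
   of each little-endian halfword lane is its low byte. 'res0'..'res3' are
   hypothetical v8i16 vectors already clipped to 0..255:

       v16u8 out0, out1;

       PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
       // out0 = low bytes of res0 (right half) and res1 (left half)
*/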
1751 
1752 /* Description : Pack even halfword elements of vector pairs
1753  Arguments : Inputs - in0, in1, in2, in3
1754  Outputs - out0, out1
1755  Return Type - as per RTYPE
1756  Details : Even halfword elements of in0 are copied to the left half of
1757  out0 & even halfword elements of in1 are copied to the right
1758  half of out0.
1759  Even halfword elements of in2 are copied to the left half of
1760  out1 & even halfword elements of in3 are copied to the right
1761  half of out1.
1762 */
1763 #define PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1) \
1764 { \
1765  out0 = (RTYPE) __msa_pckev_h((v8i16) in0, (v8i16) in1); \
1766  out1 = (RTYPE) __msa_pckev_h((v8i16) in2, (v8i16) in3); \
1767 }
1768 #define PCKEV_H2_SH(...) PCKEV_H2(v8i16, __VA_ARGS__)
1769 #define PCKEV_H2_SW(...) PCKEV_H2(v4i32, __VA_ARGS__)
1770 
1771 #define PCKEV_H4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1772  out0, out1, out2, out3) \
1773 { \
1774  PCKEV_H2(RTYPE, in0, in1, in2, in3, out0, out1); \
1775  PCKEV_H2(RTYPE, in4, in5, in6, in7, out2, out3); \
1776 }
1777 #define PCKEV_H4_SH(...) PCKEV_H4(v8i16, __VA_ARGS__)
1778 #define PCKEV_H4_SW(...) PCKEV_H4(v4i32, __VA_ARGS__)
1779 
1780 /* Description : Pack even double word elements of vector pairs
1781  Arguments : Inputs - in0, in1, in2, in3
1782  Outputs - out0, out1
1783  Return Type - as per RTYPE
1784  Details : Even double elements of in0 are copied to the left half of
1785  out0 & even double elements of in1 are copied to the right
1786  half of out0.
1787  Even double elements of in2 are copied to the left half of
1788  out1 & even double elements of in3 are copied to the right
1789  half of out1.
1790 */
1791 #define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1792 { \
1793  out0 = (RTYPE) __msa_pckev_d((v2i64) in0, (v2i64) in1); \
1794  out1 = (RTYPE) __msa_pckev_d((v2i64) in2, (v2i64) in3); \
1795 }
1796 #define PCKEV_D2_UB(...) PCKEV_D2(v16u8, __VA_ARGS__)
1797 #define PCKEV_D2_SB(...) PCKEV_D2(v16i8, __VA_ARGS__)
1798 #define PCKEV_D2_SH(...) PCKEV_D2(v8i16, __VA_ARGS__)
1799 
1800 #define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1801  out0, out1, out2, out3) \
1802 { \
1803  PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1); \
1804  PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3); \
1805 }
1806 #define PCKEV_D4_UB(...) PCKEV_D4(v16u8, __VA_ARGS__)
1807 
1808 /* Description : Pack odd double word elements of vector pairs
1809  Arguments : Inputs - in0, in1, in2, in3
1810  Outputs - out0, out1
1811  Return Type - as per RTYPE
1812  Details : Odd double word elements of 'in0' and 'in1' are copied to
1813  the left & right halves of 'out0' respectively.
1814  Odd double word elements of 'in2' and 'in3' are copied to
1815  the left & right halves of 'out1' respectively.
1816 */
1817 #define PCKOD_D2(RTYPE, in0, in1, in2, in3, out0, out1) \
1818 { \
1819  out0 = (RTYPE) __msa_pckod_d((v2i64) in0, (v2i64) in1); \
1820  out1 = (RTYPE) __msa_pckod_d((v2i64) in2, (v2i64) in3); \
1821 }
1822 #define PCKOD_D2_UB(...) PCKOD_D2(v16u8, __VA_ARGS__)
1823 #define PCKOD_D2_SH(...) PCKOD_D2(v8i16, __VA_ARGS__)
1824 #define PCKOD_D2_SD(...) PCKOD_D2(v2i64, __VA_ARGS__)
1825 
1826 /* Description : Each byte element is logically xor'ed with immediate 128
1827  Arguments : Inputs - in0, in1
1828  Outputs - in0, in1 (in-place)
1829  Return Type - as per RTYPE
1830  Details : Each unsigned byte element from input vector 'in0' is
1831  logically xor'ed with 128 and result is in-place stored in
1832  'in0' vector
1833  Each unsigned byte element from input vector 'in1' is
1834  logically xor'ed with 128 and result is in-place stored in
1835  'in1' vector
1836  Similar for other pairs
1837 */
1838 #define XORI_B2_128(RTYPE, in0, in1) \
1839 { \
1840  in0 = (RTYPE) __msa_xori_b((v16u8) in0, 128); \
1841  in1 = (RTYPE) __msa_xori_b((v16u8) in1, 128); \
1842 }
1843 #define XORI_B2_128_UB(...) XORI_B2_128(v16u8, __VA_ARGS__)
1844 #define XORI_B2_128_SB(...) XORI_B2_128(v16i8, __VA_ARGS__)
1845 #define XORI_B2_128_SH(...) XORI_B2_128(v8i16, __VA_ARGS__)
1846 
1847 #define XORI_B3_128(RTYPE, in0, in1, in2) \
1848 { \
1849  XORI_B2_128(RTYPE, in0, in1); \
1850  in2 = (RTYPE) __msa_xori_b((v16u8) in2, 128); \
1851 }
1852 #define XORI_B3_128_SB(...) XORI_B3_128(v16i8, __VA_ARGS__)
1853 
1854 #define XORI_B4_128(RTYPE, in0, in1, in2, in3) \
1855 { \
1856  XORI_B2_128(RTYPE, in0, in1); \
1857  XORI_B2_128(RTYPE, in2, in3); \
1858 }
1859 #define XORI_B4_128_UB(...) XORI_B4_128(v16u8, __VA_ARGS__)
1860 #define XORI_B4_128_SB(...) XORI_B4_128(v16i8, __VA_ARGS__)
1861 #define XORI_B4_128_SH(...) XORI_B4_128(v8i16, __VA_ARGS__)
1862 
1863 #define XORI_B5_128(RTYPE, in0, in1, in2, in3, in4) \
1864 { \
1865  XORI_B3_128(RTYPE, in0, in1, in2); \
1866  XORI_B2_128(RTYPE, in3, in4); \
1867 }
1868 #define XORI_B5_128_SB(...) XORI_B5_128(v16i8, __VA_ARGS__)
1869 
1870 #define XORI_B6_128(RTYPE, in0, in1, in2, in3, in4, in5) \
1871 { \
1872  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1873  XORI_B2_128(RTYPE, in4, in5); \
1874 }
1875 #define XORI_B6_128_SB(...) XORI_B6_128(v16i8, __VA_ARGS__)
1876 
1877 #define XORI_B7_128(RTYPE, in0, in1, in2, in3, in4, in5, in6) \
1878 { \
1879  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1880  XORI_B3_128(RTYPE, in4, in5, in6); \
1881 }
1882 #define XORI_B7_128_SB(...) XORI_B7_128(v16i8, __VA_ARGS__)
1883 
1884 #define XORI_B8_128(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7) \
1885 { \
1886  XORI_B4_128(RTYPE, in0, in1, in2, in3); \
1887  XORI_B4_128(RTYPE, in4, in5, in6, in7); \
1888 }
1889 #define XORI_B8_128_SB(...) XORI_B8_128(v16i8, __VA_ARGS__)
1890 #define XORI_B8_128_UB(...) XORI_B8_128(v16u8, __VA_ARGS__)
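
/* Editorial usage sketch (not part of the original header): flipping bit 7
   maps unsigned pixels (0..255) onto signed bytes (-128..127) and back, so
   the signed dot-product instructions can be applied to pixel data.
   'src0'/'src1' are hypothetical vectors loaded with LD_SB:

       XORI_B2_128_SB(src0, src1);   // bias to signed; filter with
                                     // __msa_dotp_s_h / DPADD_SH3_SH, then
                                     // xor with 128 again after packing
*/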
1891 
1892 /* Description : Addition of signed halfword elements and signed saturation
1893  Arguments : Inputs - in0, in1, in2, in3
1894  Outputs - out0, out1
1895  Return Type - as per RTYPE
1896  Details : Signed halfword elements from 'in0' are added to signed
1897  halfword elements of 'in1'. The result is then signed saturated
1898  between -32768 to +32767 (as per halfword data type)
1899  Similar for other pairs
1900 */
1901 #define ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1) \
1902 { \
1903  out0 = (RTYPE) __msa_adds_s_h((v8i16) in0, (v8i16) in1); \
1904  out1 = (RTYPE) __msa_adds_s_h((v8i16) in2, (v8i16) in3); \
1905 }
1906 #define ADDS_SH2_SH(...) ADDS_SH2(v8i16, __VA_ARGS__)
1907 
1908 #define ADDS_SH4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
1909  out0, out1, out2, out3) \
1910 { \
1911  ADDS_SH2(RTYPE, in0, in1, in2, in3, out0, out1); \
1912  ADDS_SH2(RTYPE, in4, in5, in6, in7, out2, out3); \
1913 }
1914 #define ADDS_SH4_UH(...) ADDS_SH4(v8u16, __VA_ARGS__)
1915 #define ADDS_SH4_SH(...) ADDS_SH4(v8i16, __VA_ARGS__)
1916 
1917 /* Description : Shift left all elements of vector (generic for all data types)
1918  Arguments : Inputs - in0, in1, in2, in3, shift
1919  Outputs - in0, in1, in2, in3 (in place)
1920  Return Type - as per input vector RTYPE
1921  Details : Each element of vector 'in0' is left shifted by 'shift' and
1922  the result is written in place to 'in0'
1923  Similar for other pairs
1924 */
1925 #define SLLI_2V(in0, in1, shift) \
1926 { \
1927  in0 = in0 << shift; \
1928  in1 = in1 << shift; \
1929 }
1930 #define SLLI_4V(in0, in1, in2, in3, shift) \
1931 { \
1932  in0 = in0 << shift; \
1933  in1 = in1 << shift; \
1934  in2 = in2 << shift; \
1935  in3 = in3 << shift; \
1936 }
1937 
1938 /* Description : Arithmetic shift right all elements of vector
1939  (generic for all data types)
1940  Arguments : Inputs - in0, in1, in2, in3, shift
1941  Outputs - in0, in1, in2, in3 (in place)
1942  Return Type - as per input vector RTYPE
1943  Details : Each element of vector 'in0' is right shifted by 'shift' and
1944  the result is written in place to 'in0'
1945  Here, 'shift' is a general-purpose (GP) variable passed in
1946  Similar for other pairs
1947 */
1948 #define SRA_4V(in0, in1, in2, in3, shift) \
1949 { \
1950  in0 = in0 >> shift; \
1951  in1 = in1 >> shift; \
1952  in2 = in2 >> shift; \
1953  in3 = in3 >> shift; \
1954 }
1955 
1956 /* Description : Shift right logical all halfword elements of vector
1957  Arguments : Inputs - in0, in1, in2, in3, shift
1958  Outputs - in0, in1, in2, in3 (in place)
1959  Return Type - as per RTYPE
1960  Details : Each element of vector 'in0' is logically shifted right by
1961  the number of bits held in the corresponding element of
1962  vector 'shift' and the result is written in place to 'in0'
1963  Here, 'shift' is a vector passed in
1964  Similar for other pairs
1965 */
1966 #define SRL_H4(RTYPE, in0, in1, in2, in3, shift) \
1967 { \
1968  in0 = (RTYPE) __msa_srl_h((v8i16) in0, (v8i16) shift); \
1969  in1 = (RTYPE) __msa_srl_h((v8i16) in1, (v8i16) shift); \
1970  in2 = (RTYPE) __msa_srl_h((v8i16) in2, (v8i16) shift); \
1971  in3 = (RTYPE) __msa_srl_h((v8i16) in3, (v8i16) shift); \
1972 }
1973 #define SRL_H4_UH(...) SRL_H4(v8u16, __VA_ARGS__)
1974 
1975 #define SRLR_H4(RTYPE, in0, in1, in2, in3, shift) \
1976 { \
1977  in0 = (RTYPE) __msa_srlr_h((v8i16) in0, (v8i16) shift); \
1978  in1 = (RTYPE) __msa_srlr_h((v8i16) in1, (v8i16) shift); \
1979  in2 = (RTYPE) __msa_srlr_h((v8i16) in2, (v8i16) shift); \
1980  in3 = (RTYPE) __msa_srlr_h((v8i16) in3, (v8i16) shift); \
1981 }
1982 #define SRLR_H4_UH(...) SRLR_H4(v8u16, __VA_ARGS__)
1983 #define SRLR_H4_SH(...) SRLR_H4(v8i16, __VA_ARGS__)
1984 
1985 #define SRLR_H8(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, shift) \
1986 { \
1987  SRLR_H4(RTYPE, in0, in1, in2, in3, shift); \
1988  SRLR_H4(RTYPE, in4, in5, in6, in7, shift); \
1989 }
1990 #define SRLR_H8_UH(...) SRLR_H8(v8u16, __VA_ARGS__)
1991 #define SRLR_H8_SH(...) SRLR_H8(v8i16, __VA_ARGS__)
1992 
1993 /* Description : Shift right arithmetic rounded halfwords
1994  Arguments : Inputs - in0, in1, shift
1995  Outputs - in0, in1, (in place)
1996  Return Type - as per RTYPE
1997  Details : Each element of vector 'in0' is arithmetically shifted right
1998  by the number of bits held in the corresponding element of
1999  vector 'shift'. The last discarded bit is added to the shifted
2000  value for rounding and the result is written in place to 'in0'
2001  Here, 'shift' is a vector passed in
2002  Similar for other pairs
2003 */
2004 #define SRAR_H2(RTYPE, in0, in1, shift) \
2005 { \
2006  in0 = (RTYPE) __msa_srar_h((v8i16) in0, (v8i16) shift); \
2007  in1 = (RTYPE) __msa_srar_h((v8i16) in1, (v8i16) shift); \
2008 }
2009 #define SRAR_H2_UH(...) SRAR_H2(v8u16, __VA_ARGS__)
2010 #define SRAR_H2_SH(...) SRAR_H2(v8i16, __VA_ARGS__)
2011 
2012 #define SRAR_H3(RTYPE, in0, in1, in2, shift) \
2013 { \
2014  SRAR_H2(RTYPE, in0, in1, shift); \
2015  in2 = (RTYPE) __msa_srar_h((v8i16) in2, (v8i16) shift); \
2016 }
2017 #define SRAR_H3_SH(...) SRAR_H3(v8i16, __VA_ARGS__)
2018 
2019 #define SRAR_H4(RTYPE, in0, in1, in2, in3, shift) \
2020 { \
2021  SRAR_H2(RTYPE, in0, in1, shift); \
2022  SRAR_H2(RTYPE, in2, in3, shift); \
2023 }
2024 #define SRAR_H4_UH(...) SRAR_H4(v8u16, __VA_ARGS__)
2025 #define SRAR_H4_SH(...) SRAR_H4(v8i16, __VA_ARGS__)
2026 
2027 /* Description : Shift right arithmetic rounded words
2028  Arguments : Inputs - in0, in1, shift
2029  Outputs - in0, in1, (in place)
2030  Return Type - as per RTYPE
2031  Details : Each element of vector 'in0' is arithmetically shifted right
2032  by the number of bits held in the corresponding element of
2033  vector 'shift'. The last discarded bit is added to the shifted
2034  value for rounding and the result is written in place to 'in0'
2035  Here, 'shift' is a vector passed in
2036  Similar for other pairs
2037 */
2038 #define SRAR_W2(RTYPE, in0, in1, shift) \
2039 { \
2040  in0 = (RTYPE) __msa_srar_w((v4i32) in0, (v4i32) shift); \
2041  in1 = (RTYPE) __msa_srar_w((v4i32) in1, (v4i32) shift); \
2042 }
2043 #define SRAR_W2_SW(...) SRAR_W2(v4i32, __VA_ARGS__)
2044 
2045 #define SRAR_W4(RTYPE, in0, in1, in2, in3, shift) \
2046 { \
2047  SRAR_W2(RTYPE, in0, in1, shift); \
2048  SRAR_W2(RTYPE, in2, in3, shift); \
2049 }
2050 #define SRAR_W4_SW(...) SRAR_W4(v4i32, __VA_ARGS__)
2051 
2052 /* Description : Shift right arithmetic rounded (immediate)
2053  Arguments : Inputs - in0, in1, in2, in3, shift
2054  Outputs - in0, in1, in2, in3 (in place)
2055  Return Type - as per RTYPE
2056  Details : Each element of vector 'in0' is arithmetically shifted right
2057  by the immediate value in 'shift'.
2058  The last discarded bit is added to the shifted value for
2059  rounding and the result is written in place to 'in0'
2060  Similar for other pairs
2061 */
2062 #define SRARI_H2(RTYPE, in0, in1, shift) \
2063 { \
2064  in0 = (RTYPE) __msa_srari_h((v8i16) in0, shift); \
2065  in1 = (RTYPE) __msa_srari_h((v8i16) in1, shift); \
2066 }
2067 #define SRARI_H2_UH(...) SRARI_H2(v8u16, __VA_ARGS__)
2068 #define SRARI_H2_SH(...) SRARI_H2(v8i16, __VA_ARGS__)
2069 
2070 #define SRARI_H4(RTYPE, in0, in1, in2, in3, shift) \
2071 { \
2072  SRARI_H2(RTYPE, in0, in1, shift); \
2073  SRARI_H2(RTYPE, in2, in3, shift); \
2074 }
2075 #define SRARI_H4_UH(...) SRARI_H4(v8u16, __VA_ARGS__)
2076 #define SRARI_H4_SH(...) SRARI_H4(v8i16, __VA_ARGS__)
2077 
2078 /* Description : Shift right arithmetic rounded (immediate)
2079  Arguments : Inputs - in0, in1, shift
2080  Outputs - in0, in1 (in place)
2081  Return Type - as per RTYPE
2082  Details : Each element of vector 'in0' is arithmetically shifted right
2083  by the immediate value in 'shift'.
2084  The last discarded bit is added to the shifted value for
2085  rounding and the result is written in place to 'in0'
2086  Similar for other pairs
2087 */
2088 #define SRARI_W2(RTYPE, in0, in1, shift) \
2089 { \
2090  in0 = (RTYPE) __msa_srari_w((v4i32) in0, shift); \
2091  in1 = (RTYPE) __msa_srari_w((v4i32) in1, shift); \
2092 }
2093 #define SRARI_W2_SW(...) SRARI_W2(v4i32, __VA_ARGS__)
2094 
2095 #define SRARI_W4(RTYPE, in0, in1, in2, in3, shift) \
2096 { \
2097  SRARI_W2(RTYPE, in0, in1, shift); \
2098  SRARI_W2(RTYPE, in2, in3, shift); \
2099 }
2100 #define SRARI_W4_SH(...) SRARI_W4(v8i16, __VA_ARGS__)
2101 #define SRARI_W4_SW(...) SRARI_W4(v4i32, __VA_ARGS__)
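
/* Editorial worked example (not part of the original header): for an
   element x and an immediate s > 0, the SRARI macros compute
   (x + (1 << (s - 1))) >> s, i.e. a rounding arithmetic shift. With
   hypothetical 32-bit accumulators scaled by 64:

       SRARI_W2_SW(sum0, sum1, 6);   // sum = (sum + 32) >> 6
*/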
2102 
2103 /* Description : Multiplication of pairs of vectors
2104  Arguments : Inputs - in0, in1, in2, in3
2105  Outputs - out0, out1
2106  Details : Each element from 'in0' is multiplied with the corresponding
2107  element from 'in1' and the result is written to 'out0'
2108  Similar for other pairs
2109 */
2110 #define MUL2(in0, in1, in2, in3, out0, out1) \
2111 { \
2112  out0 = in0 * in1; \
2113  out1 = in2 * in3; \
2114 }
2115 #define MUL4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2116 { \
2117  MUL2(in0, in1, in2, in3, out0, out1); \
2118  MUL2(in4, in5, in6, in7, out2, out3); \
2119 }
2120 
2121 /* Description : Addition of 2 pairs of vectors
2122  Arguments : Inputs - in0, in1, in2, in3
2123  Outputs - out0, out1
2124  Details : The vectors of each of the 2 input pairs are added
2125  element-wise, producing 2 result vectors
2126 */
2127 #define ADD2(in0, in1, in2, in3, out0, out1) \
2128 { \
2129  out0 = in0 + in1; \
2130  out1 = in2 + in3; \
2131 }
2132 #define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2133 { \
2134  ADD2(in0, in1, in2, in3, out0, out1); \
2135  ADD2(in4, in5, in6, in7, out2, out3); \
2136 }
2137 
2138 /* Description : Subtraction of 2 pairs of vectors
2139  Arguments : Inputs - in0, in1, in2, in3
2140  Outputs - out0, out1
2141  Details : The vectors of each of the 2 input pairs are subtracted
2142  element-wise, producing 2 result vectors
2143 */
2144 #define SUB2(in0, in1, in2, in3, out0, out1) \
2145 { \
2146  out0 = in0 - in1; \
2147  out1 = in2 - in3; \
2148 }
2149 #define SUB4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3) \
2150 { \
2151  out0 = in0 - in1; \
2152  out1 = in2 - in3; \
2153  out2 = in4 - in5; \
2154  out3 = in6 - in7; \
2155 }
2156 
2157 /* Description : Sign extend byte elements from right half of the vector
2158  Arguments : Input - in (byte vector)
2159  Output - out (sign extended halfword vector)
2160  Return Type - signed halfword
2161  Details : Sign bit of byte elements from input vector 'in' is
2162  extracted and interleaved with same vector 'in' to generate
2163  8 halfword elements keeping sign intact
2164 */
2165 #define UNPCK_R_SB_SH(in, out) \
2166 { \
2167  v16i8 sign_m; \
2168  \
2169  sign_m = __msa_clti_s_b((v16i8) in, 0); \
2170  out = (v8i16) __msa_ilvr_b(sign_m, (v16i8) in); \
2171 }
2172 
2173 /* Description : Sign extend halfword elements from right half of the vector
2174  Arguments : Inputs - in (input halfword vector)
2175  Outputs - out (sign extended word vectors)
2176  Return Type - signed word
2177  Details : Sign bit of halfword elements from input vector 'in' is
2178  extracted and interleaved with the same vector 'in' to generate
2179  4 word elements keeping sign intact
2180 */
2181 #if HAVE_MSA2
2182 #define UNPCK_R_SH_SW(in, out) \
2183 { \
2184  out = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
2185 }
2186 #else
2187 #define UNPCK_R_SH_SW(in, out) \
2188 { \
2189  v8i16 sign_m; \
2190  \
2191  sign_m = __msa_clti_s_h((v8i16) in, 0); \
2192  out = (v4i32) __msa_ilvr_h(sign_m, (v8i16) in); \
2193 }
2194 #endif // #if HAVE_MSA2
2195 
2196 /* Description : Sign extend byte elements from input vector and return
2197  halfword results in pair of vectors
2198  Arguments : Inputs - in (1 input byte vector)
2199  Outputs - out0, out1 (sign extended 2 halfword vectors)
2200  Return Type - signed halfword
2201  Details : Sign bit of byte elements from input vector 'in' is
2202  extracted and interleaved right with the same vector 'in' to
2203  generate 8 signed halfword elements in 'out0'
2204  Then interleaved left with the same vector 'in' to
2205  generate 8 signed halfword elements in 'out1'
2206 */
2207 #if HAVE_MSA2
2208 #define UNPCK_SB_SH(in, out0, out1) \
2209 { \
2210  out0 = (v8i16) __builtin_msa2_w2x_lo_s_b((v16i8) in); \
2211  out1 = (v8i16) __builtin_msa2_w2x_hi_s_b((v16i8) in); \
2212 }
2213 #else
2214 #define UNPCK_SB_SH(in, out0, out1) \
2215 { \
2216  v16i8 tmp_m; \
2217  \
2218  tmp_m = __msa_clti_s_b((v16i8) in, 0); \
2219  ILVRL_B2_SH(tmp_m, in, out0, out1); \
2220 }
2221 #endif // #if HAVE_MSA2
2222 
2223 /* Description : Zero extend unsigned byte elements to halfword elements
2224  Arguments : Inputs - in (1 input unsigned byte vector)
2225  Outputs - out0, out1 (2 zero-extended halfword vectors)
2226  Return Type - signed halfword
2227  Details : Zero extended right half of vector is returned in 'out0'
2228  Zero extended left half of vector is returned in 'out1'
2229 */
2230 #define UNPCK_UB_SH(in, out0, out1) \
2231 { \
2232  v16i8 zero_m = { 0 }; \
2233  \
2234  ILVRL_B2_SH(zero_m, in, out0, out1); \
2235 }
2236 
2237 /* Description : Sign extend halfword elements from input vector and return
2238  result in pair of vectors
2239  Arguments : Inputs - in (1 input halfword vector)
2240  Outputs - out0, out1 (sign extended 2 word vectors)
2241  Return Type - signed word
2242  Details : Sign bit of halfword elements from input vector 'in' is
2243  extracted and interleaved right with the same vector 'in' to
2244  generate 4 signed word elements in 'out0'
2245  Then interleaved left with the same vector 'in' to
2246  generate 4 signed word elements in 'out1'
2247 */
2248 #if HAVE_MSA2
2249 #define UNPCK_SH_SW(in, out0, out1) \
2250 { \
2251  out0 = (v4i32) __builtin_msa2_w2x_lo_s_h((v8i16) in); \
2252  out1 = (v4i32) __builtin_msa2_w2x_hi_s_h((v8i16) in); \
2253 }
2254 #else
2255 #define UNPCK_SH_SW(in, out0, out1) \
2256 { \
2257  v8i16 tmp_m; \
2258  \
2259  tmp_m = __msa_clti_s_h((v8i16) in, 0); \
2260  ILVRL_H2_SW(tmp_m, in, out0, out1); \
2261 }
2262 #endif // #if HAVE_MSA2
2263 
2264 /* Description : Swap two variables
2265  Arguments : Inputs - in0, in1
2266  Outputs - in0, in1 (in-place)
2267  Details : Swapping of two input variables using xor
2268 */
2269 #define SWAP(in0, in1) \
2270 { \
2271  in0 = in0 ^ in1; \
2272  in1 = in0 ^ in1; \
2273  in0 = in0 ^ in1; \
2274 }
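
/* Editorial note (not part of the original header): the XOR trick requires
   two distinct lvalues of an integer (or integer vector) type; SWAP(x, x)
   zeroes 'x' instead of leaving it unchanged. Hypothetical use:

       SWAP(tmp0, tmp1);   // v8i16 temporaries exchanged without a spare
*/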
2275 
2276 /* Description : Butterfly of 4 input vectors
2277  Arguments : Inputs - in0, in1, in2, in3
2278  Outputs - out0, out1, out2, out3
2279  Details : Butterfly operation
2280 */
2281 #define BUTTERFLY_4(in0, in1, in2, in3, out0, out1, out2, out3) \
2282 { \
2283  out0 = in0 + in3; \
2284  out1 = in1 + in2; \
2285  \
2286  out2 = in1 - in2; \
2287  out3 = in0 - in3; \
2288 }
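
/* Editorial worked example (not part of the original header): with lanes
   in0..in3 = {1, 2, 3, 4}, BUTTERFLY_4 yields out0..out3 =
   {1+4, 2+3, 2-3, 1-4} = {5, 5, -1, -3}; a second butterfly stage over
   these outputs completes a 4-point Hadamard-style transform (sketch,
   output ordering varies per codec).
*/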
2289 
2290 /* Description : Butterfly of 8 input vectors
2291  Arguments : Inputs - in0 ... in7
2292  Outputs - out0 .. out7
2293  Details : Butterfly operation
2294 */
2295 #define BUTTERFLY_8(in0, in1, in2, in3, in4, in5, in6, in7, \
2296  out0, out1, out2, out3, out4, out5, out6, out7) \
2297 { \
2298  out0 = in0 + in7; \
2299  out1 = in1 + in6; \
2300  out2 = in2 + in5; \
2301  out3 = in3 + in4; \
2302  \
2303  out4 = in3 - in4; \
2304  out5 = in2 - in5; \
2305  out6 = in1 - in6; \
2306  out7 = in0 - in7; \
2307 }
2308 
2309 /* Description : Butterfly of 16 input vectors
2310  Arguments : Inputs - in0 ... in15
2311  Outputs - out0 .. out15
2312  Details : Butterfly operation
2313 */
2314 #define BUTTERFLY_16(in0, in1, in2, in3, in4, in5, in6, in7, \
2315  in8, in9, in10, in11, in12, in13, in14, in15, \
2316  out0, out1, out2, out3, out4, out5, out6, out7, \
2317  out8, out9, out10, out11, out12, out13, out14, out15) \
2318 { \
2319  out0 = in0 + in15; \
2320  out1 = in1 + in14; \
2321  out2 = in2 + in13; \
2322  out3 = in3 + in12; \
2323  out4 = in4 + in11; \
2324  out5 = in5 + in10; \
2325  out6 = in6 + in9; \
2326  out7 = in7 + in8; \
2327  \
2328  out8 = in7 - in8; \
2329  out9 = in6 - in9; \
2330  out10 = in5 - in10; \
2331  out11 = in4 - in11; \
2332  out12 = in3 - in12; \
2333  out13 = in2 - in13; \
2334  out14 = in1 - in14; \
2335  out15 = in0 - in15; \
2336 }
2337 
2338 /* Description : Transposes input 4x4 byte block
2339  Arguments : Inputs - in0, in1, in2, in3 (input 4x4 byte block)
2340  Outputs - out0, out1, out2, out3 (output 4x4 byte block)
2341  Return Type - unsigned byte
2342  Details : Rows of the 4x4 byte block become columns in the output
2343 */
2344 #define TRANSPOSE4x4_UB_UB(in0, in1, in2, in3, out0, out1, out2, out3) \
2345 { \
2346  v16i8 zero_m = { 0 }; \
2347  v16i8 s0_m, s1_m, s2_m, s3_m; \
2348  \
2349  ILVR_D2_SB(in1, in0, in3, in2, s0_m, s1_m); \
2350  ILVRL_B2_SB(s1_m, s0_m, s2_m, s3_m); \
2351  \
2352  out0 = (v16u8) __msa_ilvr_b(s3_m, s2_m); \
2353  out1 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out0, 4); \
2354  out2 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out1, 4); \
2355  out3 = (v16u8) __msa_sldi_b(zero_m, (v16i8) out2, 4); \
2356 }
2357 
2358 /* Description : Transposes input 8x4 byte block into 4x8
2359  Arguments : Inputs - in0, in1, in2, in3 (input 8x4 byte block)
2360  Outputs - out0, out1, out2, out3 (output 4x8 byte block)
2361  Return Type - as per RTYPE
2362  Details : 8 rows of 4 bytes each are transposed into 4 rows of 8 bytes
2363 */
2364 #define TRANSPOSE8x4_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2365  out0, out1, out2, out3) \
2366 { \
2367  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2368  \
2369  ILVEV_W2_SB(in0, in4, in1, in5, tmp0_m, tmp1_m); \
2370  tmp2_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2371  ILVEV_W2_SB(in2, in6, in3, in7, tmp0_m, tmp1_m); \
2372  \
2373  tmp3_m = __msa_ilvr_b(tmp1_m, tmp0_m); \
2374  ILVRL_H2_SB(tmp3_m, tmp2_m, tmp0_m, tmp1_m); \
2375  \
2376  ILVRL_W2(RTYPE, tmp1_m, tmp0_m, out0, out2); \
2377  out1 = (RTYPE) __msa_ilvl_d((v2i64) out2, (v2i64) out0); \
2378  out3 = (RTYPE) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2379 }
2380 #define TRANSPOSE8x4_UB_UB(...) TRANSPOSE8x4_UB(v16u8, __VA_ARGS__)
2381 #define TRANSPOSE8x4_UB_UH(...) TRANSPOSE8x4_UB(v8u16, __VA_ARGS__)
2382 
2383 /* Description : Transposes input 8x8 byte block
2384  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2385  (input 8x8 byte block)
2386  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2387  (output 8x8 byte block)
2388  Return Type - as per RTYPE
2389  Details : Rows of the 8x8 byte block become columns in the output
2390 */
2391 #define TRANSPOSE8x8_UB(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2392  out0, out1, out2, out3, out4, out5, out6, out7) \
2393 { \
2394  v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2395  v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2396  v16i8 zeros = { 0 }; \
2397  \
2398  ILVR_B4_SB(in2, in0, in3, in1, in6, in4, in7, in5, \
2399  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2400  ILVRL_B2_SB(tmp1_m, tmp0_m, tmp4_m, tmp5_m); \
2401  ILVRL_B2_SB(tmp3_m, tmp2_m, tmp6_m, tmp7_m); \
2402  ILVRL_W2(RTYPE, tmp6_m, tmp4_m, out0, out2); \
2403  ILVRL_W2(RTYPE, tmp7_m, tmp5_m, out4, out6); \
2404  SLDI_B4(RTYPE, zeros, out0, zeros, out2, zeros, out4, zeros, out6, \
2405  8, out1, out3, out5, out7); \
2406 }
2407 #define TRANSPOSE8x8_UB_UB(...) TRANSPOSE8x8_UB(v16u8, __VA_ARGS__)
2408 #define TRANSPOSE8x8_UB_UH(...) TRANSPOSE8x8_UB(v8u16, __VA_ARGS__)
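
/* Editorial usage sketch (not part of the original header): transposing
   lets column-oriented code (e.g. vertical loop filters) reuse row logic.
   'src' and 'stride' are hypothetical; only the low 8 bytes of each input
   row are used:

       v16u8 r0, r1, r2, r3, r4, r5, r6, r7;

       LD_UB4(src, stride, r0, r1, r2, r3);
       LD_UB4(src + 4 * stride, stride, r4, r5, r6, r7);
       TRANSPOSE8x8_UB_UB(r0, r1, r2, r3, r4, r5, r6, r7,
                          r0, r1, r2, r3, r4, r5, r6, r7);
*/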
2409 
2410 /* Description : Transposes 16x4 block into 4x16 with byte elements in vectors
2411  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2412  in8, in9, in10, in11, in12, in13, in14, in15
2413  Outputs - out0, out1, out2, out3
2414  Return Type - unsigned byte
2415  Details : 16 rows of 4 bytes each are transposed into 4 rows of 16 bytes
2416 */
2417 #define TRANSPOSE16x4_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2418  in8, in9, in10, in11, in12, in13, in14, in15, \
2419  out0, out1, out2, out3) \
2420 { \
2421  v2i64 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2422  \
2423  ILVEV_W2_SD(in0, in4, in8, in12, tmp0_m, tmp1_m); \
2424  out1 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2425  \
2426  ILVEV_W2_SD(in1, in5, in9, in13, tmp0_m, tmp1_m); \
2427  out3 = (v16u8) __msa_ilvev_d(tmp1_m, tmp0_m); \
2428  \
2429  ILVEV_W2_SD(in2, in6, in10, in14, tmp0_m, tmp1_m); \
2430  \
2431  tmp2_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2432  ILVEV_W2_SD(in3, in7, in11, in15, tmp0_m, tmp1_m); \
2433  \
2434  tmp3_m = __msa_ilvev_d(tmp1_m, tmp0_m); \
2435  ILVEV_B2_SD(out1, out3, tmp2_m, tmp3_m, tmp0_m, tmp1_m); \
2436  out0 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2437  out2 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2438  \
2439  tmp0_m = (v2i64) __msa_ilvod_b((v16i8) out3, (v16i8) out1); \
2440  tmp1_m = (v2i64) __msa_ilvod_b((v16i8) tmp3_m, (v16i8) tmp2_m); \
2441  out1 = (v16u8) __msa_ilvev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2442  out3 = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2443 }
2444 
2445 /* Description : Transposes 16x8 block into 8x16 with byte elements in vectors
2446  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7,
2447  in8, in9, in10, in11, in12, in13, in14, in15
2448  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2449  Return Type - unsigned byte
2450  Details : 16 rows of 8 bytes each are transposed into 8 rows of 16 bytes
2451 */
2452 #define TRANSPOSE16x8_UB_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2453  in8, in9, in10, in11, in12, in13, in14, in15, \
2454  out0, out1, out2, out3, out4, out5, out6, out7) \
2455 { \
2456  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2457  v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2458  \
2459  ILVEV_D2_UB(in0, in8, in1, in9, out7, out6); \
2460  ILVEV_D2_UB(in2, in10, in3, in11, out5, out4); \
2461  ILVEV_D2_UB(in4, in12, in5, in13, out3, out2); \
2462  ILVEV_D2_UB(in6, in14, in7, in15, out1, out0); \
2463  \
2464  tmp0_m = (v16u8) __msa_ilvev_b((v16i8) out6, (v16i8) out7); \
2465  tmp4_m = (v16u8) __msa_ilvod_b((v16i8) out6, (v16i8) out7); \
2466  tmp1_m = (v16u8) __msa_ilvev_b((v16i8) out4, (v16i8) out5); \
2467  tmp5_m = (v16u8) __msa_ilvod_b((v16i8) out4, (v16i8) out5); \
2468  out5 = (v16u8) __msa_ilvev_b((v16i8) out2, (v16i8) out3); \
2469  tmp6_m = (v16u8) __msa_ilvod_b((v16i8) out2, (v16i8) out3); \
2470  out7 = (v16u8) __msa_ilvev_b((v16i8) out0, (v16i8) out1); \
2471  tmp7_m = (v16u8) __msa_ilvod_b((v16i8) out0, (v16i8) out1); \
2472  \
2473  ILVEV_H2_UB(tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m); \
2474  out0 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2475  out4 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2476  \
2477  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
2478  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) out7, (v8i16) out5); \
2479  out2 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2480  out6 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2481  \
2482  ILVEV_H2_UB(tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m); \
2483  out1 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2484  out5 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2485  \
2486  tmp2_m = (v16u8) __msa_ilvod_h((v8i16) tmp5_m, (v8i16) tmp4_m); \
2487  tmp3_m = (v16u8) __msa_ilvod_h((v8i16) tmp7_m, (v8i16) tmp6_m); \
2488  out3 = (v16u8) __msa_ilvev_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2489  out7 = (v16u8) __msa_ilvod_w((v4i32) tmp3_m, (v4i32) tmp2_m); \
2490 }
2491 
2492 /* Description : Transposes 4x4 block with half word elements in vectors
2493  Arguments : Inputs - in0, in1, in2, in3
2494  Outputs - out0, out1, out2, out3
2495  Return Type - signed halfword
2496  Details : Rows of the 4x4 halfword block become columns in the output
2497 */
2498 #define TRANSPOSE4x4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3) \
2499 { \
2500  v8i16 s0_m, s1_m; \
2501  \
2502  ILVR_H2_SH(in1, in0, in3, in2, s0_m, s1_m); \
2503  ILVRL_W2_SH(s1_m, s0_m, out0, out2); \
2504  out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
2505  out3 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out2); \
2506 }
2507 
2508 /* Description : Transposes 8x8 block with half word elements in vectors
2509  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7
2510  Outputs - out0, out1, out2, out3, out4, out5, out6, out7
2511  Return Type - as per RTYPE
2512  Details : Rows of the 8x8 halfword block become columns in the output
2513 */
2514 #define TRANSPOSE8x8_H(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
2515  out0, out1, out2, out3, out4, out5, out6, out7) \
2516 { \
2517  v8i16 s0_m, s1_m; \
2518  v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2519  v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
2520  \
2521  ILVR_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2522  ILVRL_H2_SH(s1_m, s0_m, tmp0_m, tmp1_m); \
2523  ILVL_H2_SH(in6, in4, in7, in5, s0_m, s1_m); \
2524  ILVRL_H2_SH(s1_m, s0_m, tmp2_m, tmp3_m); \
2525  ILVR_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2526  ILVRL_H2_SH(s1_m, s0_m, tmp4_m, tmp5_m); \
2527  ILVL_H2_SH(in2, in0, in3, in1, s0_m, s1_m); \
2528  ILVRL_H2_SH(s1_m, s0_m, tmp6_m, tmp7_m); \
2529  PCKEV_D4(RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
2530  tmp3_m, tmp7_m, out0, out2, out4, out6); \
2531  out1 = (RTYPE) __msa_pckod_d((v2i64) tmp0_m, (v2i64) tmp4_m); \
2532  out3 = (RTYPE) __msa_pckod_d((v2i64) tmp1_m, (v2i64) tmp5_m); \
2533  out5 = (RTYPE) __msa_pckod_d((v2i64) tmp2_m, (v2i64) tmp6_m); \
2534  out7 = (RTYPE) __msa_pckod_d((v2i64) tmp3_m, (v2i64) tmp7_m); \
2535 }
2536 #define TRANSPOSE8x8_UH_UH(...) TRANSPOSE8x8_H(v8u16, __VA_ARGS__)
2537 #define TRANSPOSE8x8_SH_SH(...) TRANSPOSE8x8_H(v8i16, __VA_ARGS__)
2538 
2539 /* Description : Transposes 4x4 block with word elements in vectors
2540  Arguments : Inputs - in0, in1, in2, in3
2541  Outputs - out0, out1, out2, out3
2542  Return Type - signed word
2543  Details : Rows of the 4x4 word block become columns in the output
2544 */
2545 #define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3) \
2546 { \
2547  v4i32 s0_m, s1_m, s2_m, s3_m; \
2548  \
2549  ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
2550  ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
2551  \
2552  out0 = (v4i32) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
2553  out1 = (v4i32) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
2554  out2 = (v4i32) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
2555  out3 = (v4i32) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
2556 }
2557 
2558 /* Description : Average byte elements from pair of vectors and store 8x4 byte
2559  block in destination memory
2560  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2561  Details : Each byte element from input vector pair 'in0' and 'in1' are
2562  averaged (a + b)/2 and stored in 'tmp0_m'
2563  Each byte element from input vector pair 'in2' and 'in3' are
2564  averaged (a + b)/2 and stored in 'tmp1_m'
2565  Each byte element from input vector pair 'in4' and 'in5' are
2566  averaged (a + b)/2 and stored in 'tmp2_m'
2567  Each byte element from input vector pair 'in6' and 'in7' are
2568  averaged (a + b)/2 and stored in 'tmp3_m'
2569  The half vector results from all 4 vectors are stored in
2570  destination memory as 8x4 byte block
2571 */
2572 #define AVE_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2573 { \
2574  uint64_t out0_m, out1_m, out2_m, out3_m; \
2575  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2576  \
2577  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2578  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2579  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2580  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2581  \
2582  out0_m = __msa_copy_u_d((v2i64) tmp0_m, 0); \
2583  out1_m = __msa_copy_u_d((v2i64) tmp1_m, 0); \
2584  out2_m = __msa_copy_u_d((v2i64) tmp2_m, 0); \
2585  out3_m = __msa_copy_u_d((v2i64) tmp3_m, 0); \
2586  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2587 }
2588 
2589 /* Description : Average byte elements from pair of vectors and store 16x4 byte
2590  block in destination memory
2591  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2592  Details : Each byte element from input vector pair 'in0' and 'in1' are
2593  averaged (a + b)/2 and stored in 'tmp0_m'
2594  Each byte element from input vector pair 'in2' and 'in3' are
2595  averaged (a + b)/2 and stored in 'tmp1_m'
2596  Each byte element from input vector pair 'in4' and 'in5' are
2597  averaged (a + b)/2 and stored in 'tmp2_m'
2598  Each byte element from input vector pair 'in6' and 'in7' are
2599  averaged (a + b)/2 and stored in 'tmp3_m'
2600  The results from all 4 vectors are stored in destination
2601  memory as 16x4 byte block
2602 */
2603 #define AVE_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2604 { \
2605  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2606  \
2607  tmp0_m = __msa_ave_u_b((v16u8) in0, (v16u8) in1); \
2608  tmp1_m = __msa_ave_u_b((v16u8) in2, (v16u8) in3); \
2609  tmp2_m = __msa_ave_u_b((v16u8) in4, (v16u8) in5); \
2610  tmp3_m = __msa_ave_u_b((v16u8) in6, (v16u8) in7); \
2611  \
2612  ST_UB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst, stride); \
2613 }
2614 
2615 /* Description : Average rounded byte elements from pair of vectors and store
2616  8x4 byte block in destination memory
2617  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2618  Details : Each byte element from input vector pair 'in0' and 'in1' are
2619  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2620  Each byte element from input vector pair 'in2' and 'in3' are
2621  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2622  Each byte element from input vector pair 'in4' and 'in5' are
2623  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2624  Each byte element from input vector pair 'in6' and 'in7' are
2625  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2626  The half vector results from all 4 vectors are stored in
2627  destination memory as 8x4 byte block
2628 */
2629 #define AVER_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2630 { \
2631  uint64_t out0_m, out1_m, out2_m, out3_m; \
2632  v16u8 tp0_m, tp1_m, tp2_m, tp3_m; \
2633  \
2634  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2635  tp0_m, tp1_m, tp2_m, tp3_m); \
2636  \
2637  out0_m = __msa_copy_u_d((v2i64) tp0_m, 0); \
2638  out1_m = __msa_copy_u_d((v2i64) tp1_m, 0); \
2639  out2_m = __msa_copy_u_d((v2i64) tp2_m, 0); \
2640  out3_m = __msa_copy_u_d((v2i64) tp3_m, 0); \
2641  SD4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2642 }
2643 
2644 /* Description : Average rounded byte elements from pair of vectors and store
2645  16x4 byte block in destination memory
2646  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2647  Details : Each byte element from input vector pair 'in0' and 'in1' are
2648  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2649  Each byte element from input vector pair 'in2' and 'in3' are
2650  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2651  Each byte element from input vector pair 'in4' and 'in5' are
2652  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2653  Each byte element from input vector pair 'in6' and 'in7' are
2654  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2655  The vector results from all 4 vectors are stored in
2656  destination memory as 16x4 byte block
2657 */
2658 #define AVER_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
2659 { \
2660  v16u8 t0_m, t1_m, t2_m, t3_m; \
2661  \
2662  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2663  t0_m, t1_m, t2_m, t3_m); \
2664  ST_UB4(t0_m, t1_m, t2_m, t3_m, pdst, stride); \
2665 }
2666 
2667 /* Description : Average rounded byte elements from pair of vectors,
2668  average rounded with destination and store 8x4 byte block
2669  in destination memory
2670  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2671  Details : Each byte element from input vector pair 'in0' and 'in1' are
2672  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2673  Each byte element from input vector pair 'in2' and 'in3' are
2674  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2675  Each byte element from input vector pair 'in4' and 'in5' are
2676  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2677  Each byte element from input vector pair 'in6' and 'in7' are
2678  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2679  The half vector results from all 4 vectors are stored in
2680  destination memory as 8x4 byte block
2681 */
2682 #define AVER_DST_ST8x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2683  pdst, stride) \
2684 { \
2685  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2686  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2687  \
2688  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2689  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2690  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2691  AVER_ST8x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2692  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2693 }
2694 
2695 /* Description : Average rounded byte elements from pair of vectors,
2696  average rounded with destination and store 16x4 byte block
2697  in destination memory
2698  Arguments : Inputs - in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride
2699  Details : Each byte element from input vector pair 'in0' and 'in1' are
2700  average rounded (a + b + 1)/2 and stored in 'tmp0_m'
2701  Each byte element from input vector pair 'in2' and 'in3' are
2702  average rounded (a + b + 1)/2 and stored in 'tmp1_m'
2703  Each byte element from input vector pair 'in4' and 'in5' are
2704  average rounded (a + b + 1)/2 and stored in 'tmp2_m'
2705  Each byte element from input vector pair 'in6' and 'in7' are
2706  average rounded (a + b + 1)/2 and stored in 'tmp3_m'
2707  The vector results from all 4 vectors are stored in
2708  destination memory as 16x4 byte block
2709 */
2710 #define AVER_DST_ST16x4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2711  pdst, stride) \
2712 { \
2713  v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
2714  v16u8 dst0_m, dst1_m, dst2_m, dst3_m; \
2715  \
2716  LD_UB4(pdst, stride, dst0_m, dst1_m, dst2_m, dst3_m); \
2717  AVER_UB4_UB(in0, in1, in2, in3, in4, in5, in6, in7, \
2718  tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
2719  AVER_ST16x4_UB(dst0_m, tmp0_m, dst1_m, tmp1_m, \
2720  dst2_m, tmp2_m, dst3_m, tmp3_m, pdst, stride); \
2721 }
2722 
2723 /* Description : Add block 4x4
2724  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2725  Details : Least significant 4 bytes from each input vector are added to
2726  the destination bytes, clipped between 0-255 and then stored.
2727 */
2728 #define ADDBLK_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2729 { \
2730  uint32_t src0_m, src1_m, src2_m, src3_m; \
2731  uint32_t out0_m, out1_m, out2_m, out3_m; \
2732  v8i16 inp0_m, inp1_m, res0_m, res1_m; \
2733  v16i8 dst0_m = { 0 }; \
2734  v16i8 dst1_m = { 0 }; \
2735  v16i8 zero_m = { 0 }; \
2736  \
2737  ILVR_D2_SH(in1, in0, in3, in2, inp0_m, inp1_m); \
2738  LW4(pdst, stride, src0_m, src1_m, src2_m, src3_m); \
2739  INSERT_W2_SB(src0_m, src1_m, dst0_m); \
2740  INSERT_W2_SB(src2_m, src3_m, dst1_m); \
2741  ILVR_B2_SH(zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m); \
2742  ADD2(res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m); \
2743  CLIP_SH2_0_255(res0_m, res1_m); \
2744  PCKEV_B2_SB(res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m); \
2745  \
2746  out0_m = __msa_copy_u_w((v4i32) dst0_m, 0); \
2747  out1_m = __msa_copy_u_w((v4i32) dst0_m, 1); \
2748  out2_m = __msa_copy_u_w((v4i32) dst1_m, 0); \
2749  out3_m = __msa_copy_u_w((v4i32) dst1_m, 1); \
2750  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2751 }
2752 
2753 /* Description : Dot product and addition of 3 signed halfword input vectors
2754  Arguments : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
2755  Outputs - out0_m
2756  Return Type - signed halfword
2757  Details : Dot product of 'in0' with 'coeff0'
2758  Dot product of 'in1' with 'coeff1'
2759  Dot product of 'in2' with 'coeff2'
2760  Addition of all the 3 vector results
2761 
2762  out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
2763 */
2764 #define DPADD_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
2765 ( { \
2766  v8i16 out0_m; \
2767  \
2768  out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
2769  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
2770  out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
2771  \
2772  out0_m; \
2773 } )
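
/* Editorial usage sketch (not part of the original header): this is the
   core reduction of a 6-tap filter: three byte vectors, each shuffled to
   hold two taps per lane, are multiplied and accumulated in one pass.
   'vec0'..'vec2' and 'c0'..'c2' are hypothetical v16i8 data/coefficient
   vectors:

       v8i16 res = DPADD_SH3_SH(vec0, vec1, vec2, c0, c1, c2);
       res = __msa_srari_h(res, 5);   // e.g. round by 1/32 afterwards
*/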
2774 
2775 /* Description : Pack even elements of input vectors & xor with 128
2776  Arguments : Inputs - in0, in1
2777  Outputs - out_m
2778  Return Type - unsigned byte
2779  Details : Signed byte even elements from 'in0' and 'in1' are packed
2780  together in one vector and the resulting vector is xor'ed with
2781  128 to shift the range from signed to unsigned byte
2782 */
2783 #define PCKEV_XORI128_UB(in0, in1) \
2784 ( { \
2785  v16u8 out_m; \
2786  out_m = (v16u8) __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2787  out_m = (v16u8) __msa_xori_b((v16u8) out_m, 128); \
2788  out_m; \
2789 } )
2790 
2791 /* Description : Converts inputs to unsigned bytes, interleave, average & store
2792  as 8x4 unsigned byte block
2793  Arguments : Inputs - in0, in1, in2, in3, dst0, dst1, pdst, stride
2794 */
2795 #define CONVERT_UB_AVG_ST8x4_UB(in0, in1, in2, in3, \
2796  dst0, dst1, pdst, stride) \
2797 { \
2798  v16u8 tmp0_m, tmp1_m; \
2799  uint8_t *pdst_m = (uint8_t *) (pdst); \
2800  \
2801  tmp0_m = PCKEV_XORI128_UB(in0, in1); \
2802  tmp1_m = PCKEV_XORI128_UB(in2, in3); \
2803  AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
2804  ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
2805 }
2806 
2807 /* Description : Pack even byte elements, extract 0 & 2 index words from pair
2808  of results and store 4 words in destination memory as per
2809  stride
2810  Arguments : Inputs - in0, in1, in2, in3, pdst, stride
2811 */
2812 #define PCKEV_ST4x4_UB(in0, in1, in2, in3, pdst, stride) \
2813 { \
2814  uint32_t out0_m, out1_m, out2_m, out3_m; \
2815  v16i8 tmp0_m, tmp1_m; \
2816  \
2817  PCKEV_B2_SB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
2818  \
2819  out0_m = __msa_copy_u_w((v4i32) tmp0_m, 0); \
2820  out1_m = __msa_copy_u_w((v4i32) tmp0_m, 2); \
2821  out2_m = __msa_copy_u_w((v4i32) tmp1_m, 0); \
2822  out3_m = __msa_copy_u_w((v4i32) tmp1_m, 2); \
2823  \
2824  SW4(out0_m, out1_m, out2_m, out3_m, pdst, stride); \
2825 }
2826 
2827 /* Description : Pack even byte elements and store byte vector in destination
2828  memory
2829  Arguments : Inputs - in0, in1, pdst
2830 */
2831 #define PCKEV_ST_SB(in0, in1, pdst) \
2832 { \
2833  v16i8 tmp_m; \
2834  tmp_m = __msa_pckev_b((v16i8) in1, (v16i8) in0); \
2835  ST_SB(tmp_m, (pdst)); \
2836 }
2837 
2838 /* Description : Horizontal 2 tap filter kernel code
2839  Arguments : Inputs - in0, in1, mask, coeff, shift
2840  Details : vshf gather, 2-tap unsigned dot product, round & saturate */
2841 #define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift) \
2842 ( { \
2843  v16i8 tmp0_m; \
2844  v8u16 tmp1_m; \
2845  \
2846  tmp0_m = __msa_vshf_b((v16i8) mask, (v16i8) in1, (v16i8) in0); \
2847  tmp1_m = __msa_dotp_u_h((v16u8) tmp0_m, (v16u8) coeff); \
2848  tmp1_m = (v8u16) __msa_srari_h((v8i16) tmp1_m, shift); \
2849  tmp1_m = __msa_sat_u_h(tmp1_m, shift); \
2850  \
2851  tmp1_m; \
2852 } )
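
/* Editorial usage sketch (not part of the original header): typical 2-tap
   (bilinear) horizontal interpolation. 'src', 'mask' and 'coeff_vec' are
   hypothetical; LD_UB2 is the two-vector load helper defined earlier in
   this file:

       v16u8 in0, in1;
       v8u16 res;

       LD_UB2(src, 16, in0, in1);
       res = HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff_vec, 7);
*/
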
2853 #endif /* AVUTIL_MIPS_GENERIC_MACROS_MSA_H */