FFmpeg
loongson_intrinsics.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2021 Loongson Technology Corporation Limited
3  * All rights reserved.
4  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
5  * Xiwei Gu <guxiwei-hf@loongson.cn>
6  * Lu Wang <wanglu@loongson.cn>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  *
24  */
25 
26 #ifndef AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
27 #define AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H
28 
29 /*
30  * Copyright (c) 2021 Loongson Technology Corporation Limited
31  * All rights reserved.
32  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
33  * Xiwei Gu <guxiwei-hf@loongson.cn>
34  * Lu Wang <wanglu@loongson.cn>
35  *
36  * This file is a header file for loongarch builtin extension.
37  *
38  */
39 
40 #ifndef LOONGSON_INTRINSICS_H
41 #define LOONGSON_INTRINSICS_H
42 
43 /**
44  * MAJOR version: Macro usage changes.
45  * MINOR version: Add new functions, or bug fixes.
46  * MICRO version: Comment changes or implementation changes.
47  */
48 #define LSOM_VERSION_MAJOR 1
49 #define LSOM_VERSION_MINOR 1
50 #define LSOM_VERSION_MICRO 0
51 
/*
 * Apply the one-argument operation _INS to two independent inputs.
 *   _OUT0 = _INS(_IN0)
 *   _OUT1 = _INS(_IN1)
 * The assignments happen in the order shown.
 */
#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
    {                                             \
        _OUT0 = _INS(_IN0);                       \
        _OUT1 = _INS(_IN1);                       \
    }
57 
/*
 * Apply the two-argument operation _INS to two independent input pairs.
 *   _OUT0 = _INS(_IN0, _IN1)
 *   _OUT1 = _INS(_IN2, _IN3)
 * The assignments happen in the order shown.
 */
#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
    {                                                         \
        _OUT0 = _INS(_IN0, _IN1);                             \
        _OUT1 = _INS(_IN2, _IN3);                             \
    }
63 
/*
 * Apply the three-argument operation _INS to two independent input triples.
 *   _OUT0 = _INS(_IN0, _IN1, _IN2)
 *   _OUT1 = _INS(_IN3, _IN4, _IN5)
 * The assignments happen in the order shown.
 */
#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
    {                                                                     \
        _OUT0 = _INS(_IN0, _IN1, _IN2);                                   \
        _OUT1 = _INS(_IN3, _IN4, _IN5);                                   \
    }
69 
/*
 * Apply the one-argument operation _INS to four independent inputs.
 *   _OUTn = _INS(_INn)   for n = 0..3, assigned in ascending order.
 */
#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
    {                                                                       \
        _OUT0 = _INS(_IN0);                                                 \
        _OUT1 = _INS(_IN1);                                                 \
        _OUT2 = _INS(_IN2);                                                 \
        _OUT3 = _INS(_IN3);                                                 \
    }
75 
/*
 * Apply the two-argument operation _INS to four independent input pairs.
 *   _OUT0 = _INS(_IN0, _IN1)
 *   _OUT1 = _INS(_IN2, _IN3)
 *   _OUT2 = _INS(_IN4, _IN5)
 *   _OUT3 = _INS(_IN6, _IN7)
 * The assignments happen in the order shown.
 */
#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
                  _OUT0, _OUT1, _OUT2, _OUT3)                           \
    {                                                                   \
        _OUT0 = _INS(_IN0, _IN1);                                       \
        _OUT1 = _INS(_IN2, _IN3);                                       \
        _OUT2 = _INS(_IN4, _IN5);                                       \
        _OUT3 = _INS(_IN6, _IN7);                                       \
    }
82 
/*
 * Apply the three-argument operation _INS to four independent input triples.
 *   _OUT0 = _INS(_IN0, _IN1, _IN2)
 *   _OUT1 = _INS(_IN3, _IN4, _IN5)
 *   _OUT2 = _INS(_IN6, _IN7, _IN8)
 *   _OUT3 = _INS(_IN9, _IN10, _IN11)
 * The assignments happen in the order shown.
 */
#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, \
                  _IN8, _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3) \
    {                                                                   \
        _OUT0 = _INS(_IN0, _IN1, _IN2);                                 \
        _OUT1 = _INS(_IN3, _IN4, _IN5);                                 \
        _OUT2 = _INS(_IN6, _IN7, _IN8);                                 \
        _OUT3 = _INS(_IN9, _IN10, _IN11);                               \
    }
89 
90 #ifdef __loongarch_sx
91 #include <lsxintrin.h>
92 /*
93  * =============================================================================
94  * Description : Dot product & addition of byte vector elements
95  * Arguments : Inputs - in_c, in_h, in_l
96  * Outputs - out
97  * Return Type - halfword
98  * Details : Signed byte elements from in_h are multiplied by
99  * signed byte elements from in_l, and then added adjacent to
100  * each other to get results with the twice size of input.
101  * Then the results plus to signed half-word elements from in_c.
102  * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
103  * in_c : 1,2,3,4, 1,2,3,4
104  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
105  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
106  * out : 23,40,41,26, 23,40,41,26
107  * =============================================================================
108  */
109 static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
110  __m128i in_l) {
111  __m128i out;
112 
113  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
114  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
115  return out;
116 }
117 
118 /*
119  * =============================================================================
120  * Description : Dot product & addition of byte vector elements
121  * Arguments : Inputs - in_c, in_h, in_l
122  * Outputs - out
123  * Return Type - halfword
124  * Details : Unsigned byte elements from in_h are multiplied by
125  * unsigned byte elements from in_l, and then added adjacent to
126  * each other to get results with the twice size of input.
127  * The results plus to signed half-word elements from in_c.
128  * Example : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
129  * in_c : 1,2,3,4, 1,2,3,4
130  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
131  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
132  * out : 23,40,41,26, 23,40,41,26
133  * =============================================================================
134  */
135 static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
136  __m128i in_l) {
137  __m128i out;
138 
139  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
140  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
141  return out;
142 }
143 
144 /*
145  * =============================================================================
146  * Description : Dot product & addition of byte vector elements
147  * Arguments : Inputs - in_c, in_h, in_l
148  * Outputs - out
149  * Return Type - halfword
150  * Details : Unsigned byte elements from in_h are multiplied by
151  * signed byte elements from in_l, and then added adjacent to
152  * each other to get results with the twice size of input.
153  * The results plus to signed half-word elements from in_c.
154  * Example : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
155  * in_c : 1,1,1,1, 1,1,1,1
156  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
157  * in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
158  * out : -4,-24,-60,-112, 6,26,62,114
159  * =============================================================================
160  */
161 static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
162  __m128i in_l) {
163  __m128i out;
164 
165  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
166  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
167  return out;
168 }
169 
170 /*
171  * =============================================================================
172  * Description : Dot product & addition of half-word vector elements
173  * Arguments : Inputs - in_c, in_h, in_l
174  * Outputs - out
175  * Return Type - __m128i
176  * Details : Signed half-word elements from in_h are multiplied by
177  * signed half-word elements from in_l, and then added adjacent to
178  * each other to get results with the twice size of input.
179  * Then the results plus to signed word elements from in_c.
180  * Example : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
181  * in_c : 1,2,3,4
182  * in_h : 1,2,3,4, 5,6,7,8
183  * in_l : 8,7,6,5, 4,3,2,1
184  * out : 23,40,41,26
185  * =============================================================================
186  */
187 static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
188  __m128i in_l) {
189  __m128i out;
190 
191  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
192  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
193  return out;
194 }
195 
196 /*
197  * =============================================================================
198  * Description : Dot product of byte vector elements
199  * Arguments : Inputs - in_h, in_l
200  * Outputs - out
201  * Return Type - halfword
202  * Details : Signed byte elements from in_h are multiplied by
203  * signed byte elements from in_l, and then added adjacent to
204  * each other to get results with the twice size of input.
205  * Example : out = __lsx_vdp2_h_b(in_h, in_l)
206  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
207  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
208  * out : 22,38,38,22, 22,38,38,22
209  * =============================================================================
210  */
211 static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
212  __m128i out;
213 
214  out = __lsx_vmulwev_h_b(in_h, in_l);
215  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
216  return out;
217 }
218 
219 /*
220  * =============================================================================
221  * Description : Dot product of byte vector elements
222  * Arguments : Inputs - in_h, in_l
223  * Outputs - out
224  * Return Type - halfword
225  * Details : Unsigned byte elements from in_h are multiplied by
226  * unsigned byte elements from in_l, and then added adjacent to
227  * each other to get results with the twice size of input.
228  * Example : out = __lsx_vdp2_h_bu(in_h, in_l)
229  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
230  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
231  * out : 22,38,38,22, 22,38,38,22
232  * =============================================================================
233  */
234 static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
235  __m128i out;
236 
237  out = __lsx_vmulwev_h_bu(in_h, in_l);
238  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
239  return out;
240 }
241 
242 /*
243  * =============================================================================
244  * Description : Dot product of byte vector elements
245  * Arguments : Inputs - in_h, in_l
246  * Outputs - out
247  * Return Type - halfword
248  * Details : Unsigned byte elements from in_h are multiplied by
249  * signed byte elements from in_l, and then added adjacent to
250  * each other to get results with the twice size of input.
251  * Example : out = __lsx_vdp2_h_bu_b(in_h, in_l)
252  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
253  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
254  * out : 22,38,38,22, 22,38,38,6
255  * =============================================================================
256  */
257 static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
258  __m128i out;
259 
260  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
261  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
262  return out;
263 }
264 
265 /*
266  * =============================================================================
267  * Description : Dot product of byte vector elements
268  * Arguments : Inputs - in_h, in_l
269  * Outputs - out
270  * Return Type - halfword
271  * Details : Signed byte elements from in_h are multiplied by
272  * signed byte elements from in_l, and then added adjacent to
273  * each other to get results with the twice size of input.
274  * Example : out = __lsx_vdp2_w_h(in_h, in_l)
275  * in_h : 1,2,3,4, 5,6,7,8
276  * in_l : 8,7,6,5, 4,3,2,1
277  * out : 22,38,38,22
278  * =============================================================================
279  */
280 static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
281  __m128i out;
282 
283  out = __lsx_vmulwev_w_h(in_h, in_l);
284  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
285  return out;
286 }
287 
288 /*
289  * =============================================================================
290  * Description : Clip all halfword elements of input vector between min & max
291  * out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
292  * (_in))
293  * Arguments : Inputs - _in (input vector)
294  * - min (min threshold)
295  * - max (max threshold)
296  * Outputs - out (output vector with clipped elements)
297  * Return Type - signed halfword
298  * Example : out = __lsx_vclip_h(_in)
299  * _in : -8,2,280,249, -8,255,280,249
300  * min : 1,1,1,1, 1,1,1,1
301  * max : 9,9,9,9, 9,9,9,9
302  * out : 1,2,9,9, 1,9,9,9
303  * =============================================================================
304  */
305 static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
306  __m128i out;
307 
308  out = __lsx_vmax_h(min, _in);
309  out = __lsx_vmin_h(max, out);
310  return out;
311 }
312 
313 /*
314  * =============================================================================
315  * Description : Set each element of vector between 0 and 255
316  * Arguments : Inputs - _in
317  * Outputs - out
318  * Return Type - halfword
319  * Details : Signed byte elements from _in are clamped between 0 and 255.
320  * Example : out = __lsx_vclip255_h(_in)
321  * _in : -8,255,280,249, -8,255,280,249
322  * out : 0,255,255,249, 0,255,255,249
323  * =============================================================================
324  */
325 static inline __m128i __lsx_vclip255_h(__m128i _in) {
326  __m128i out;
327 
328  out = __lsx_vmaxi_h(_in, 0);
329  out = __lsx_vsat_hu(out, 7);
330  return out;
331 }
332 
333 /*
334  * =============================================================================
335  * Description : Set each element of vector between 0 and 255
336  * Arguments : Inputs - _in
337  * Outputs - out
338  * Return Type - word
339  * Details : Signed byte elements from _in are clamped between 0 and 255.
340  * Example : out = __lsx_vclip255_w(_in)
341  * _in : -8,255,280,249
342  * out : 0,255,255,249
343  * =============================================================================
344  */
345 static inline __m128i __lsx_vclip255_w(__m128i _in) {
346  __m128i out;
347 
348  out = __lsx_vmaxi_w(_in, 0);
349  out = __lsx_vsat_wu(out, 7);
350  return out;
351 }
352 
/*
 * =============================================================================
 * Description : Swap two variables
 * Arguments   : Inputs  - _in0, _in1 (modifiable vector lvalues)
 *               Outputs - _in0, _in1 (in-place)
 * Details     : The two values are exchanged through a temporary. Unlike an
 *               XOR-based exchange this stays correct when both arguments
 *               name the same object: the value is left unchanged instead
 *               of being cleared to zero.
 * Example     : LSX_SWAP(_in0, _in1)
 *         _in0 : 1,2,3,4
 *         _in1 : 5,6,7,8
 *    _in0(out) : 5,6,7,8
 *    _in1(out) : 1,2,3,4
 * =============================================================================
 */
#define LSX_SWAP(_in0, _in1)          \
    {                                 \
        __m128i _swap_tmp = (_in0);   \
                                      \
        (_in0) = (_in1);              \
        (_in1) = _swap_tmp;           \
    }
372 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Words of rows (0,1) and (2,3) are interleaved first; the
 *               matching double-words of both intermediate pairs are then
 *               combined into the output rows.
 * Example     :
 *               1, 2, 3, 4            1, 5, 9,13
 *               5, 6, 7, 8    to      2, 6,10,14
 *               9,10,11,12  =====>    3, 7,11,15
 *              13,14,15,16            4, 8,12,16
 * =============================================================================
 */
#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                          \
        __m128i _t0, _t1, _t2, _t3;                                            \
                                                                               \
        /* Word interleave: low halves -> _t0/_t2, high halves -> _t1/_t3. */  \
        DUP2_ARG2(__lsx_vilvl_w, _in1, _in0, _in3, _in2, _t0, _t2);            \
        DUP2_ARG2(__lsx_vilvh_w, _in1, _in0, _in3, _in2, _t1, _t3);            \
                                                                               \
        /* Double-word interleave produces the transposed rows. */             \
        _out0 = __lsx_vilvl_d(_t2, _t0);                                       \
        _out1 = __lsx_vilvh_d(_t2, _t0);                                       \
        _out2 = __lsx_vilvl_d(_t3, _t1);                                       \
        _out3 = __lsx_vilvh_d(_t3, _t1);                                       \
    }
399 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : The rows of the matrix become columns, and the columns
 *               become rows. Every macro-local variable carries a leading
 *               underscore so that caller variables such as `zero` or
 *               `shuf8` can be passed as arguments without being shadowed.
 *               NOTE(review): the odd outputs are produced by shuffling the
 *               upper half of the preceding even output against a zero
 *               vector, so the even outputs presumably still hold that data
 *               in their upper 8 bytes rather than the zeros shown in the
 *               example - confirm before relying on the upper half.
 * Example     : LSX_TRANSPOSE8x8_B
 *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
 *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
 *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
 *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
    {                                                                       \
        __m128i _zero = { 0 };                                              \
        /* Byte indices 8..15 followed by 16..23 for __lsx_vshuf_b. */      \
        __m128i _shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };        \
        __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                     \
                                                                            \
        _t0 = __lsx_vilvl_b(_in2, _in0);                                    \
        _t1 = __lsx_vilvl_b(_in3, _in1);                                    \
        _t2 = __lsx_vilvl_b(_in6, _in4);                                    \
        _t3 = __lsx_vilvl_b(_in7, _in5);                                    \
        _t4 = __lsx_vilvl_b(_t1, _t0);                                      \
        _t5 = __lsx_vilvh_b(_t1, _t0);                                      \
        _t6 = __lsx_vilvl_b(_t3, _t2);                                      \
        _t7 = __lsx_vilvh_b(_t3, _t2);                                      \
        _out0 = __lsx_vilvl_w(_t6, _t4);                                    \
        _out2 = __lsx_vilvh_w(_t6, _t4);                                    \
        _out4 = __lsx_vilvl_w(_t7, _t5);                                    \
        _out6 = __lsx_vilvh_w(_t7, _t5);                                    \
        /* Odd rows are derived from the even rows computed just above. */  \
        _out1 = __lsx_vshuf_b(_zero, _out0, _shuf8);                        \
        _out3 = __lsx_vshuf_b(_zero, _out2, _shuf8);                        \
        _out5 = __lsx_vshuf_b(_zero, _out4, _shuf8);                        \
        _out7 = __lsx_vshuf_b(_zero, _out6, _shuf8);                        \
    }
453 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with half-word elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Rows 4..7 and rows 0..3 are each reduced to four
 *               intermediate vectors by two rounds of halfword interleaving;
 *               the even and odd double-words of matching intermediates are
 *               then picked to form the output rows.
 * Example     :
 *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
 *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
 *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
 *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
 *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
 *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
 *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
 *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
 * =============================================================================
 */
#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7)                                           \
    {                                                                       \
        __m128i _s0, _s1;                                                   \
        __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                     \
                                                                            \
        /* Rows 4..7, low halves. */                                        \
        DUP2_ARG2(__lsx_vilvl_h, _in6, _in4, _in7, _in5, _s0, _s1);         \
        _t0 = __lsx_vilvl_h(_s1, _s0);                                      \
        _t1 = __lsx_vilvh_h(_s1, _s0);                                      \
        /* Rows 4..7, high halves. */                                       \
        DUP2_ARG2(__lsx_vilvh_h, _in6, _in4, _in7, _in5, _s0, _s1);         \
        _t2 = __lsx_vilvl_h(_s1, _s0);                                      \
        _t3 = __lsx_vilvh_h(_s1, _s0);                                      \
        /* Rows 0..3, low halves. */                                        \
        DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);         \
        _t4 = __lsx_vilvl_h(_s1, _s0);                                      \
        _t5 = __lsx_vilvh_h(_s1, _s0);                                      \
        /* Rows 0..3, high halves. */                                       \
        DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s0, _s1);         \
        _t6 = __lsx_vilvl_h(_s1, _s0);                                      \
        _t7 = __lsx_vilvh_h(_s1, _s0);                                      \
                                                                            \
        /* Even double-words give rows 0,2,4,6; odd ones rows 1,3,5,7. */   \
        DUP4_ARG2(__lsx_vpickev_d, _t0, _t4, _t1, _t5, _t2, _t6, _t3, _t7,  \
                  _out0, _out2, _out4, _out6);                              \
        DUP4_ARG2(__lsx_vpickod_d, _t0, _t4, _t1, _t5, _t2, _t6, _t3, _t7,  \
                  _out1, _out3, _out5, _out7);                              \
    }
503 
/*
 * =============================================================================
 * Description : Transpose input 8x4 byte block into 4x8
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x4 byte block, one row per vector)
 *               Outputs - _out0, _out1, _out2, _out3 (output 4x8 byte block)
 *               Return Type - as per RTYPE
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 * Example     : LSX_TRANSPOSE8x4_B
 *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
 *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
 *
 *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
 *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
 *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
 *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
 * =============================================================================
 */
#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                           _out0, _out1, _out2, _out3)                      \
    {                                                                       \
        __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                         \
                                                                            \
        /* Pair row n with row n+4, then byte-interleave rows 0/1/4/5. */   \
        DUP2_ARG2(__lsx_vpackev_w, _in4, _in0, _in5, _in1, _tmp0_m,         \
                  _tmp1_m);                                                 \
        _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                          \
        /* Same for rows 2/3/6/7. */                                        \
        DUP2_ARG2(__lsx_vpackev_w, _in6, _in2, _in7, _in3, _tmp0_m,         \
                  _tmp1_m);                                                 \
        _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                          \
                                                                            \
        _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                          \
        _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                          \
                                                                            \
        _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                            \
        _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                            \
        /* _out1/_out3 are taken from the high halves of _out0/_out2. */    \
        _out1 = __lsx_vilvh_d(_out2, _out0);                                \
        _out3 = __lsx_vilvh_d(_out0, _out2);                                \
    }
548 
/*
 * =============================================================================
 * Description : Transpose 16x8 block with byte elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14,
 *                         _in15
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : Sixteen rows of eight bytes become eight rows of sixteen
 *               bytes, built by byte, word and double-word interleaving.
 * Example     :
 *              000,001,002,003,004,005,006,007
 *              008,009,010,011,012,013,014,015
 *              016,017,018,019,020,021,022,023
 *              024,025,026,027,028,029,030,031
 *              032,033,034,035,036,037,038,039
 *              040,041,042,043,044,045,046,047        000,008,...,112,120
 *              048,049,050,051,052,053,054,055        001,009,...,113,121
 *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
 *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
 *              072,073,074,075,076,077,078,079        004,012,...,116,124
 *              080,081,082,083,084,085,086,087        005,013,...,117,125
 *              088,089,090,091,092,093,094,095        006,014,...,118,126
 *              096,097,098,099,100,101,102,103        007,015,...,119,127
 *              104,105,106,107,108,109,110,111
 *              112,113,114,115,116,117,118,119
 *              120,121,122,123,124,125,126,127
 * =============================================================================
 */
#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                            _out6, _out7)                                    \
    {                                                                        \
        __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;      \
        __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                      \
                                                                             \
        /* Stage 1: byte-interleave rows n and n+2. */                       \
        DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7,   \
                  _in5, _tmp0, _tmp1, _tmp2, _tmp3);                         \
        DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12,     \
                  _in15, _in13, _tmp4, _tmp5, _tmp6, _tmp7);                 \
        /* Stage 2: byte-interleave the stage-1 pairs (low, then high). */   \
        DUP4_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _tmp5, _tmp4,   \
                  _tmp7, _tmp6, _t0, _t2, _t4, _t6);                         \
        DUP4_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _tmp5, _tmp4,   \
                  _tmp7, _tmp6, _t1, _t3, _t5, _t7);                         \
        /* Stage 3: word-interleave (low, then high). */                     \
        DUP4_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _t6, _t4, _t7, _t5,     \
                  _tmp0, _tmp4, _tmp1, _tmp5);                               \
        DUP4_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _t6, _t4, _t7, _t5,     \
                  _tmp2, _tmp6, _tmp3, _tmp7);                               \
        /* Stage 4: double-word interleave into the output rows. */          \
        DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);  \
        DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);  \
        DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);  \
        DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);  \
    }
599 
600 /*
601  * =============================================================================
602  * Description : Butterfly of 4 input vectors
603  * Arguments : Inputs - in0, in1, in2, in3
604  * Outputs - out0, out1, out2, out3
605  * Details : Butterfly operation
606  * Example :
607  * out0 = in0 + in3;
608  * out1 = in1 + in2;
609  * out2 = in1 - in2;
610  * out3 = in0 - in3;
611  * =============================================================================
612  */
/* Byte-element variant: two sums (_out0, _out1), then two differences. */
#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                         \
        DUP2_ARG2(__lsx_vadd_b, _in0, _in3, _in1, _in2, _out0, _out1);        \
        DUP2_ARG2(__lsx_vsub_b, _in1, _in2, _in0, _in3, _out2, _out3);        \
    }
/* Halfword-element variant: two sums (_out0, _out1), then two differences. */
#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                         \
        DUP2_ARG2(__lsx_vadd_h, _in0, _in3, _in1, _in2, _out0, _out1);        \
        DUP2_ARG2(__lsx_vsub_h, _in1, _in2, _in0, _in3, _out2, _out3);        \
    }
/* Word-element variant: two sums (_out0, _out1), then two differences. */
#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                         \
        DUP2_ARG2(__lsx_vadd_w, _in0, _in3, _in1, _in2, _out0, _out1);        \
        DUP2_ARG2(__lsx_vsub_w, _in1, _in2, _in0, _in3, _out2, _out3);        \
    }
/* Double-word variant: two sums (_out0, _out1), then two differences. */
#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    {                                                                         \
        DUP2_ARG2(__lsx_vadd_d, _in0, _in3, _in1, _in2, _out0, _out1);        \
        DUP2_ARG2(__lsx_vsub_d, _in1, _in2, _in0, _in3, _out2, _out3);        \
    }
641 
642 /*
643  * =============================================================================
644  * Description : Butterfly of 8 input vectors
645  * Arguments : Inputs - _in0, _in1, _in2, _in3, ~
646  * Outputs - _out0, _out1, _out2, _out3, ~
647  * Details : Butterfly operation
648  * Example :
649  * _out0 = _in0 + _in7;
650  * _out1 = _in1 + _in6;
651  * _out2 = _in2 + _in5;
652  * _out3 = _in3 + _in4;
653  * _out4 = _in3 - _in4;
654  * _out5 = _in2 - _in5;
655  * _out6 = _in1 - _in6;
656  * _out7 = _in0 - _in7;
657  * =============================================================================
658  */
/* Byte-element variant: four sums (_out0.._out3), then four differences. */
#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
    {                                                                       \
        DUP4_ARG2(__lsx_vadd_b, _in0, _in7, _in1, _in6, _in2, _in5, _in3,   \
                  _in4, _out0, _out1, _out2, _out3);                        \
        DUP4_ARG2(__lsx_vsub_b, _in3, _in4, _in2, _in5, _in1, _in6, _in0,   \
                  _in7, _out4, _out5, _out6, _out7);                        \
    }
672 
/* Halfword-element variant: four sums (_out0.._out3), then four differences. */
#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
    {                                                                       \
        DUP4_ARG2(__lsx_vadd_h, _in0, _in7, _in1, _in6, _in2, _in5, _in3,   \
                  _in4, _out0, _out1, _out2, _out3);                        \
        DUP4_ARG2(__lsx_vsub_h, _in3, _in4, _in2, _in5, _in1, _in6, _in0,   \
                  _in7, _out4, _out5, _out6, _out7);                        \
    }
686 
/* Word-element variant: four sums (_out0.._out3), then four differences. */
#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
    {                                                                       \
        DUP4_ARG2(__lsx_vadd_w, _in0, _in7, _in1, _in6, _in2, _in5, _in3,   \
                  _in4, _out0, _out1, _out2, _out3);                        \
        DUP4_ARG2(__lsx_vsub_w, _in3, _in4, _in2, _in5, _in1, _in6, _in0,   \
                  _in7, _out4, _out5, _out6, _out7);                        \
    }
700 
/* Double-word variant: four sums (_out0.._out3), then four differences. */
#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,   \
                          _out0, _out1, _out2, _out3, _out4, _out5, _out6,  \
                          _out7)                                            \
    {                                                                       \
        DUP4_ARG2(__lsx_vadd_d, _in0, _in7, _in1, _in6, _in2, _in5, _in3,   \
                  _in4, _out0, _out1, _out2, _out3);                        \
        DUP4_ARG2(__lsx_vsub_d, _in3, _in4, _in2, _in5, _in1, _in6, _in0,   \
                  _in7, _out4, _out5, _out6, _out7);                        \
    }
714 
715 #endif // LSX
716 
717 #ifdef __loongarch_asx
718 #include <lasxintrin.h>
719 /*
720  * =============================================================================
721  * Description : Dot product of byte vector elements
722  * Arguments : Inputs - in_h, in_l
723  * Output - out
724  * Return Type - signed halfword
725  * Details : Unsigned byte elements from in_h are multiplied with
726  * unsigned byte elements from in_l producing a result
727  * twice the size of input i.e. signed halfword.
728  * Then this multiplied results of adjacent odd-even elements
729  * are added to the out vector
730  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
731  * =============================================================================
732  */
733 static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
734  __m256i out;
735 
736  out = __lasx_xvmulwev_h_bu(in_h, in_l);
737  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
738  return out;
739 }
740 
741 /*
742  * =============================================================================
743  * Description : Dot product of byte vector elements
744  * Arguments : Inputs - in_h, in_l
745  * Output - out
746  * Return Type - signed halfword
747  * Details : Signed byte elements from in_h are multiplied with
748  * signed byte elements from in_l producing a result
749  * twice the size of input i.e. signed halfword.
750  * Then this multiplication results of adjacent odd-even elements
751  * are added to the out vector
752  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
753  * =============================================================================
754  */
755 static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
756  __m256i out;
757 
758  out = __lasx_xvmulwev_h_b(in_h, in_l);
759  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
760  return out;
761 }
762 
763 /*
764  * =============================================================================
765  * Description : Dot product of halfword vector elements
766  * Arguments : Inputs - in_h, in_l
767  * Output - out
768  * Return Type - signed word
769  * Details : Signed halfword elements from in_h are multiplied with
770  * signed halfword elements from in_l producing a result
771  * twice the size of input i.e. signed word.
772  * Then this multiplied results of adjacent odd-even elements
773  * are added to the out vector.
774  * Example : out = __lasx_xvdp2_w_h(in_h, in_l)
775  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
776  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
777  * out : 22,38,38,22, 22,38,38,22
778  * =============================================================================
779  */
780 static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
781  __m256i out;
782 
783  out = __lasx_xvmulwev_w_h(in_h, in_l);
784  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
785  return out;
786 }
787 
788 /*
789  * =============================================================================
790  * Description : Dot product of word vector elements
791  * Arguments : Inputs - in_h, in_l
792  * Output - out
793  * Return Type - signed double
794  * Details : Signed word elements from in_h are multiplied with
795  * signed word elements from in_l producing a result
796  * twice the size of input i.e. signed double-word.
797  * Then this multiplied results of adjacent odd-even elements
798  * are added to the out vector.
799  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
800  * =============================================================================
801  */
802 static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
803  __m256i out;
804 
805  out = __lasx_xvmulwev_d_w(in_h, in_l);
806  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
807  return out;
808 }
809 
810 /*
811  * =============================================================================
812  * Description : Dot product of halfword vector elements
813  * Arguments : Inputs - in_h, in_l
814  * Output - out
815  * Return Type - signed word
816  * Details : Unsigned halfword elements from in_h are multiplied with
817  * signed halfword elements from in_l producing a result
818  * twice the size of input i.e. unsigned word.
819  * Multiplication result of adjacent odd-even elements
820  * are added to the out vector
821  * Example : See out = __lasx_xvdp2_w_h(in_h, in_l)
822  * =============================================================================
823  */
824 static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
825  __m256i out;
826 
827  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
828  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
829  return out;
830 }
831 
832 /*
833  * =============================================================================
834  * Description : Dot product & addition of byte vector elements
835  * Arguments : Inputs - in_h, in_l
836  * Output - out
837  * Return Type - halfword
838  * Details : Signed byte elements from in_h are multiplied with
839  * signed byte elements from in_l producing a result
840  * twice the size of input i.e. signed halfword.
841  * Then this multiplied results of adjacent odd-even elements
842  * are added to the in_c vector.
843  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
844  * =============================================================================
845  */
846 static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
847  __m256i in_l) {
848  __m256i out;
849 
850  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
851  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
852  return out;
853 }
854 
855 /*
856  * =============================================================================
857  * Description : Dot product & addition of byte vector elements
858  * Arguments : Inputs - in_h, in_l
859  * Output - out
860  * Return Type - halfword
861  * Details : Unsigned byte elements from in_h are multiplied with
862  * unsigned byte elements from in_l producing a result
863  * twice the size of input i.e. signed halfword.
864  * Then this multiplied results of adjacent odd-even elements
865  * are added to the in_c vector.
866  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
867  * =============================================================================
868  */
869 static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
870  __m256i in_l) {
871  __m256i out;
872 
873  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
874  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
875  return out;
876 }
877 
878 /*
879  * =============================================================================
880  * Description : Dot product & addition of byte vector elements
881  * Arguments : Inputs - in_h, in_l
882  * Output - out
883  * Return Type - halfword
884  * Details : Unsigned byte elements from in_h are multiplied with
885  * signed byte elements from in_l producing a result
886  * twice the size of input i.e. signed halfword.
887  * Then this multiplied results of adjacent odd-even elements
888  * are added to the in_c vector.
889  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
890  * =============================================================================
891  */
892 static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
893  __m256i in_l) {
894  __m256i out;
895 
896  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
897  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
898  return out;
899 }
900 
901 /*
902  * =============================================================================
903  * Description : Dot product of halfword vector elements
904  * Arguments : Inputs - in_c, in_h, in_l
905  * Output - out
906  * Return Type - per RTYPE
907  * Details : Signed halfword elements from in_h are multiplied with
908  * signed halfword elements from in_l producing a result
909  * twice the size of input i.e. signed word.
910  * Multiplication result of adjacent odd-even elements
911  * are added to the in_c vector.
912  * Example : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
913  * in_c : 1,2,3,4, 1,2,3,4
914  * in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
915  * in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
916  * out : 23,40,41,26, 23,40,41,26
917  * =============================================================================
918  */
919 static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
920  __m256i in_l) {
921  __m256i out;
922 
923  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
924  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
925  return out;
926 }
927 
928 /*
929  * =============================================================================
930  * Description : Dot product of halfword vector elements
931  * Arguments : Inputs - in_c, in_h, in_l
932  * Output - out
933  * Return Type - signed word
934  * Details : Unsigned halfword elements from in_h are multiplied with
935  * unsigned halfword elements from in_l producing a result
936  * twice the size of input i.e. signed word.
937  * Multiplication result of adjacent odd-even elements
938  * are added to the in_c vector.
939  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
940  * =============================================================================
941  */
942 static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
943  __m256i in_l) {
944  __m256i out;
945 
946  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
947  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
948  return out;
949 }
950 
951 /*
952  * =============================================================================
953  * Description : Dot product of halfword vector elements
954  * Arguments : Inputs - in_c, in_h, in_l
955  * Output - out
956  * Return Type - signed word
957  * Details : Unsigned halfword elements from in_h are multiplied with
958  * signed halfword elements from in_l producing a result
959  * twice the size of input i.e. signed word.
960  * Multiplication result of adjacent odd-even elements
961  * are added to the in_c vector
962  * Example : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
963  * =============================================================================
964  */
965 static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
966  __m256i in_l) {
967  __m256i out;
968 
969  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
970  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
971  return out;
972 }
973 
974 /*
975  * =============================================================================
976  * Description : Vector Unsigned Dot Product and Subtract
977  * Arguments : Inputs - in_c, in_h, in_l
978  * Output - out
979  * Return Type - signed halfword
980  * Details : Unsigned byte elements from in_h are multiplied with
981  * unsigned byte elements from in_l producing a result
982  * twice the size of input i.e. signed halfword.
983  * Multiplication result of adjacent odd-even elements
984  * are added together and subtracted from double width elements
985  * in_c vector.
986  * Example : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
987  * =============================================================================
988  */
989 static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
990  __m256i in_l) {
991  __m256i out;
992 
993  out = __lasx_xvmulwev_h_bu(in_h, in_l);
994  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
995  out = __lasx_xvsub_h(in_c, out);
996  return out;
997 }
998 
999 /*
1000  * =============================================================================
1001  * Description : Vector Signed Dot Product and Subtract
1002  * Arguments : Inputs - in_c, in_h, in_l
1003  * Output - out
1004  * Return Type - signed word
1005  * Details : Signed halfword elements from in_h are multiplied with
1006  * Signed halfword elements from in_l producing a result
1007  * twice the size of input i.e. signed word.
1008  * Multiplication result of adjacent odd-even elements
1009  * are added together and subtracted from double width elements
1010  * in_c vector.
1011  * Example : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
1012  * in_c : 0,0,0,0, 0,0,0,0
1013  * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
1014  * in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
1015  * out : -7,-3,0,0, 0,-1,0,-1
1016  * =============================================================================
1017  */
1018 static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
1019  __m256i in_l) {
1020  __m256i out;
1021 
1022  out = __lasx_xvmulwev_w_h(in_h, in_l);
1023  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1024  out = __lasx_xvsub_w(in_c, out);
1025  return out;
1026 }
1027 
1028 /*
1029  * =============================================================================
1030  * Description : Dot product of halfword vector elements
1031  * Arguments : Inputs - in_h, in_l
1032  * Output - out
1033  * Return Type - signed word
1034  * Details : Signed halfword elements from in_h are multiplied with
1035  * signed halfword elements from in_l producing a result
1036  * four times the size of input i.e. signed doubleword.
1037  * Then this multiplication results of four adjacent elements
1038  * are added together and stored to the out vector.
1039  * Example : out = __lasx_xvdp4_d_h(in_h, in_l)
1040  * in_h : 3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
1041  * in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
1042  * out : -2,0,1,1
1043  * =============================================================================
1044  */
1045 static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
1046  __m256i out;
1047 
1048  out = __lasx_xvmulwev_w_h(in_h, in_l);
1049  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
1050  out = __lasx_xvhaddw_d_w(out, out);
1051  return out;
1052 }
1053 
1054 /*
1055  * =============================================================================
1056  * Description : The high half of the vector elements are expanded and
1057  * added after being doubled.
1058  * Arguments : Inputs - in_h, in_l
1059  * Output - out
1060  * Details : The in_h vector and the in_l vector are added after the
1061  * higher half of the two-fold sign extension (signed byte
1062  * to signed halfword) and stored to the out vector.
1063  * Example : See out = __lasx_xvaddwh_w_h(in_h, in_l)
1064  * =============================================================================
1065  */
1066 static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
1067  __m256i out;
1068 
1069  out = __lasx_xvilvh_b(in_h, in_l);
1070  out = __lasx_xvhaddw_h_b(out, out);
1071  return out;
1072 }
1073 
1074 /*
1075  * =============================================================================
1076  * Description : The high half of the vector elements are expanded and
1077  * added after being doubled.
1078  * Arguments : Inputs - in_h, in_l
1079  * Output - out
1080  * Details : The in_h vector and the in_l vector are added after the
1081  * higher half of the two-fold sign extension (signed halfword
1082  * to signed word) and stored to the out vector.
1083  * Example : out = __lasx_xvaddwh_w_h(in_h, in_l)
1084  * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1085  * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1086  * out : 1,0,0,-1, 1,0,0, 2
1087  * =============================================================================
1088  */
1089 static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
1090  __m256i out;
1091 
1092  out = __lasx_xvilvh_h(in_h, in_l);
1093  out = __lasx_xvhaddw_w_h(out, out);
1094  return out;
1095 }
1096 
1097 /*
1098  * =============================================================================
1099  * Description : The low half of the vector elements are expanded and
1100  * added after being doubled.
1101  * Arguments : Inputs - in_h, in_l
1102  * Output - out
1103  * Details : The in_h vector and the in_l vector are added after the
1104  * lower half of the two-fold sign extension (signed byte
1105  * to signed halfword) and stored to the out vector.
1106  * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1107  * =============================================================================
1108  */
1109 static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
1110  __m256i out;
1111 
1112  out = __lasx_xvilvl_b(in_h, in_l);
1113  out = __lasx_xvhaddw_h_b(out, out);
1114  return out;
1115 }
1116 
1117 /*
1118  * =============================================================================
1119  * Description : The low half of the vector elements are expanded and
1120  * added after being doubled.
1121  * Arguments : Inputs - in_h, in_l
1122  * Output - out
1123  * Details : The in_h vector and the in_l vector are added after the
1124  * lower half of the two-fold sign extension (signed halfword
1125  * to signed word) and stored to the out vector.
1126  * Example : out = __lasx_xvaddwl_w_h(in_h, in_l)
1127  * in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1128  * in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
1129  * out : 5,-1,4,2, 1,0,2,-1
1130  * =============================================================================
1131  */
1132 static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
1133  __m256i out;
1134 
1135  out = __lasx_xvilvl_h(in_h, in_l);
1136  out = __lasx_xvhaddw_w_h(out, out);
1137  return out;
1138 }
1139 
1140 /*
1141  * =============================================================================
1142  * Description : The low half of the vector elements are expanded and
1143  * added after being doubled.
1144  * Arguments : Inputs - in_h, in_l
1145  * Output - out
1146  * Details : The out vector and the out vector are added after the
1147  * lower half of the two-fold zero extension (unsigned byte
1148  * to unsigned halfword) and stored to the out vector.
1149  * Example : See out = __lasx_xvaddwl_w_h(in_h, in_l)
1150  * =============================================================================
1151  */
1152 static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
1153  __m256i out;
1154 
1155  out = __lasx_xvilvl_b(in_h, in_l);
1156  out = __lasx_xvhaddw_hu_bu(out, out);
1157  return out;
1158 }
1159 
1160 /*
1161  * =============================================================================
1162  * Description : The low half of the vector elements are expanded and
1163  * added after being doubled.
1164  * Arguments : Inputs - in_h, in_l
1165  * Output - out
1166  * Details : The in_l vector after double zero extension (unsigned byte to
1167  * signed halfword),added to the in_h vector.
1168  * Example : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
1169  * =============================================================================
1170  */
1171 static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
1172  __m256i out;
1173 
1174  out = __lasx_xvsllwil_hu_bu(in_l, 0);
1175  out = __lasx_xvadd_h(in_h, out);
1176  return out;
1177 }
1178 
1179 /*
1180  * =============================================================================
1181  * Description : The low half of the vector elements are expanded and
1182  * added after being doubled.
1183  * Arguments : Inputs - in_h, in_l
1184  * Output - out
1185  * Details : The in_l vector after double sign extension (signed halfword to
1186  * signed word), added to the in_h vector.
1187  * Example : out = __lasx_xvaddw_w_w_h(in_h, in_l)
1188  * in_h : 0, 1,0,0, -1,0,0,1,
1189  * in_l : 2,-1,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1,
1190  * out : 2, 0,1,2, -1,0,1,1,
1191  * =============================================================================
1192  */
1193 static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
1194  __m256i out;
1195 
1196  out = __lasx_xvsllwil_w_h(in_l, 0);
1197  out = __lasx_xvadd_w(in_h, out);
1198  return out;
1199 }
1200 
1201 /*
1202  * =============================================================================
1203  * Description : Multiplication and addition calculation after expansion
1204  * of the lower half of the vector.
1205  * Arguments : Inputs - in_c, in_h, in_l
1206  * Output - out
1207  * Details : The in_h vector and the in_l vector are multiplied after
1208  * the lower half of the two-fold sign extension (signed halfword
1209  * to signed word), and the result is added to the vector in_c,
1210  * then stored to the out vector.
1211  * Example : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1212  * in_c : 1,2,3,4, 5,6,7,8
1213  * in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
1214  * in_l : 200, 300, 400, 500, 2000, 3000, 4000, 5000,
1215  * -200,-300,-400,-500, -2000,-3000,-4000,-5000
1216  * out : 201, 602,1203,2004, -995, -1794,-2793,-3992
1217  * =============================================================================
1218  */
1219 static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
1220  __m256i in_l) {
1221  __m256i tmp0, tmp1, out;
1222 
1223  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1224  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1225  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
1226  out = __lasx_xvadd_w(tmp0, in_c);
1227  return out;
1228 }
1229 
1230 /*
1231  * =============================================================================
1232  * Description : Multiplication and addition calculation after expansion
1233  * of the higher half of the vector.
1234  * Arguments : Inputs - in_c, in_h, in_l
1235  * Output - out
1236  * Details : The in_h vector and the in_l vector are multiplied after
1237  * the higher half of the two-fold sign extension (signed
1238  * halfword to signed word), and the result is added to
1239  * the vector in_c, then stored to the out vector.
1240  * Example : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
1241  * =============================================================================
1242  */
1243 static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
1244  __m256i in_l) {
1245  __m256i tmp0, tmp1, out;
1246 
1247  tmp0 = __lasx_xvilvh_h(in_h, in_h);
1248  tmp1 = __lasx_xvilvh_h(in_l, in_l);
1249  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
1250  out = __lasx_xvadd_w(tmp0, in_c);
1251  return out;
1252 }
1253 
1254 /*
1255  * =============================================================================
1256  * Description : Multiplication calculation after expansion of the lower
1257  * half of the vector.
1258  * Arguments : Inputs - in_h, in_l
1259  * Output - out
1260  * Details : The in_h vector and the in_l vector are multiplied after
1261  * the lower half of the two-fold sign extension (signed
1262  * halfword to signed word), then stored to the out vector.
1263  * Example : out = __lasx_xvmulwl_w_h(in_h, in_l)
1264  * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1265  * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1266  * out : 6,1,3,0, 0,0,1,0
1267  * =============================================================================
1268  */
1269 static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
1270  __m256i tmp0, tmp1, out;
1271 
1272  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
1273  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
1274  out = __lasx_xvmul_w(tmp0, tmp1);
1275  return out;
1276 }
1277 
1278 /*
1279  * =============================================================================
1280  * Description : Multiplication calculation after expansion of the lower
1281  * half of the vector.
1282  * Arguments : Inputs - in_h, in_l
1283  * Output - out
1284  * Details : The in_h vector and the in_l vector are multiplied after
1285  * the lower half of the two-fold sign extension (signed
1286  * halfword to signed word), then stored to the out vector.
1287  * Example : out = __lasx_xvmulwh_w_h(in_h, in_l)
1288  * in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
1289  * in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
1290  * out : 0,0,0,0, 0,0,0,1
1291  * =============================================================================
1292  */
1293 static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
1294  __m256i tmp0, tmp1, out;
1295 
1296  tmp0 = __lasx_xvilvh_h(in_h, in_h);
1297  tmp1 = __lasx_xvilvh_h(in_l, in_l);
1298  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
1299  return out;
1300 }
1301 
1302 /*
1303  * =============================================================================
1304  * Description : The low half of the vector elements are added to the high half
1305  * after being doubled, then saturated.
1306  * Arguments : Inputs - in_h, in_l
1307  * Output - out
1308  * Details : The in_h vector adds the in_l vector after the lower half of
1309  * the two-fold zero extension (unsigned byte to unsigned
1310  * halfword) and then saturated. The results are stored to the out
1311  * vector.
1312  * Example : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
1313  * in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
1314  * in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
1315  * 0,0,0,1
1316  * out : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
1317  * =============================================================================
1318  */
1319 static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
1320  __m256i tmp1, out;
1321  __m256i zero = { 0 };
1322 
1323  tmp1 = __lasx_xvilvl_b(zero, in_l);
1324  out = __lasx_xvsadd_hu(in_h, tmp1);
1325  return out;
1326 }
1327 
1328 /*
1329  * =============================================================================
1330  * Description : Clip all halfword elements of input vector between min & max
1331  * out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
1332  * Arguments : Inputs - in (input vector)
1333  * - min (min threshold)
1334  * - max (max threshold)
1335  * Outputs - in (output vector with clipped elements)
1336  * Return Type - signed halfword
1337  * Example : out = __lasx_xvclip_h(in, min, max)
1338  * in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
1339  * min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
1340  * max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
1341  * out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
1342  * =============================================================================
1343  */
1344 static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
1345  __m256i out;
1346 
1347  out = __lasx_xvmax_h(min, in);
1348  out = __lasx_xvmin_h(max, out);
1349  return out;
1350 }
1351 
1352 /*
1353  * =============================================================================
1354  * Description : Clip all signed halfword elements of input vector
1355  * between 0 & 255
1356  * Arguments : Inputs - in (input vector)
1357  * Outputs - out (output vector with clipped elements)
1358  * Return Type - signed halfword
1359  * Example : See out = __lasx_xvclip255_w(in)
1360  * =============================================================================
1361  */
1362 static inline __m256i __lasx_xvclip255_h(__m256i in) {
1363  __m256i out;
1364 
1365  out = __lasx_xvmaxi_h(in, 0);
1366  out = __lasx_xvsat_hu(out, 7);
1367  return out;
1368 }
1369 
1370 /*
1371  * =============================================================================
1372  * Description : Clip all signed word elements of input vector
1373  * between 0 & 255
1374  * Arguments : Inputs - in (input vector)
1375  * Output - out (output vector with clipped elements)
1376  * Return Type - signed word
1377  * Example : out = __lasx_xvclip255_w(in)
1378  * in : -8,255,280,249, -8,255,280,249
1379  * out : 0,255,255,249, 0,255,255,249
1380  * =============================================================================
1381  */
1382 static inline __m256i __lasx_xvclip255_w(__m256i in) {
1383  __m256i out;
1384 
1385  out = __lasx_xvmaxi_w(in, 0);
1386  out = __lasx_xvsat_wu(out, 7);
1387  return out;
1388 }
1389 
1390 /*
1391  * =============================================================================
1392  * Description : Indexed halfword element values are replicated to all
1393  * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1394  * if 'idx >= 8' use xvsplati_h_*.
1395  * Arguments : Inputs - in, idx
1396  * Output - out
1397  * Details : Idx element value from in vector is replicated to all
1398  * elements in out vector.
1399  * Valid index range for halfword operation is 0-7
1400  * Example : out = __lasx_xvsplati_l_h(in, idx)
1401  * in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
1402  * idx : 0x02
1403  * out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
1404  * =============================================================================
1405  */
1406 static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
1407  __m256i out;
1408 
1409  out = __lasx_xvpermi_q(in, in, 0x02);
1410  out = __lasx_xvreplve_h(out, idx);
1411  return out;
1412 }
1413 
1414 /*
1415  * =============================================================================
1416  * Description : Indexed halfword element values are replicated to all
1417  * elements in output vector. If 'idx < 8' use xvsplati_l_*,
1418  * if 'idx >= 8' use xvsplati_h_*.
1419  * Arguments : Inputs - in, idx
1420  * Output - out
1421  * Details : Idx element value from in vector is replicated to all
1422  * elements in out vector.
1423  * Valid index range for halfword operation is 0-7
1424  * Example : out = __lasx_xvsplati_h_h(in, idx)
1425  * in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
1426  * idx : 0x09
1427  * out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
1428  * =============================================================================
1429  */
1430 static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
1431  __m256i out;
1432 
1433  out = __lasx_xvpermi_q(in, in, 0x13);
1434  out = __lasx_xvreplve_h(out, idx);
1435  return out;
1436 }
1437 
/*
 * =============================================================================
 * Description : Transpose a 4x4 block of double-word elements held in four
 *               vectors (one row per vector)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : All inputs are read into temporaries before any output is
 *               written. Every argument is expanded more than once, so pass
 *               plain variables without side effects.
 * Example     : LASX_TRANSPOSE4x4_D
 *        _in0 : 1,2,3,4
 *        _in1 : 1,2,3,4
 *        _in2 : 1,2,3,4
 *        _in3 : 1,2,3,4
 *
 *       _out0 : 1,1,1,1
 *       _out1 : 2,2,2,2
 *       _out2 : 3,3,3,3
 *       _out3 : 4,4,4,4
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2,     \
                            _out3)                                           \
    {                                                                        \
        __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
                                                                             \
        /* Low double-words of each 128-bit half: rows 0/1, rows 2/3. */     \
        _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
        _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
        /* High double-words of each 128-bit half: rows 0/1, rows 2/3. */    \
        _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
        _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
                                                                             \
        /* Recombine 128-bit halves of the row pairs into whole columns. */  \
        _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
        _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
        _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
        _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
    }
1468 
/*
 * =============================================================================
 * Description : Transpose an 8x8 block of word elements held in eight vectors
 *               (one row per vector)
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7
 * Details     : All inputs are read into temporaries before any output is
 *               written. Every argument is expanded more than once, so pass
 *               plain variables without side effects.
 * Example     : LASX_TRANSPOSE8x8_W
 *        _in0 : 1,2,3,4,5,6,7,8
 *        _in1 : 2,2,3,4,5,6,7,8
 *        _in2 : 3,2,3,4,5,6,7,8
 *        _in3 : 4,2,3,4,5,6,7,8
 *        _in4 : 5,2,3,4,5,6,7,8
 *        _in5 : 6,2,3,4,5,6,7,8
 *        _in6 : 7,2,3,4,5,6,7,8
 *        _in7 : 8,2,3,4,5,6,7,8
 *
 *       _out0 : 1,2,3,4,5,6,7,8
 *       _out1 : 2,2,2,2,2,2,2,2
 *       _out2 : 3,3,3,3,3,3,3,3
 *       _out3 : 4,4,4,4,4,4,4,4
 *       _out4 : 5,5,5,5,5,5,5,5
 *       _out5 : 6,6,6,6,6,6,6,6
 *       _out6 : 7,7,7,7,7,7,7,7
 *       _out7 : 8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7)                                           \
    {                                                                        \
        __m256i _s0_m, _s1_m;                                                \
        __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                          \
        __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                          \
                                                                             \
        /* Rows 0-3, low-word interleave. */                                 \
        _s0_m   = __lasx_xvilvl_w(_in2, _in0);                               \
        _s1_m   = __lasx_xvilvl_w(_in3, _in1);                               \
        _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                             \
        _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                             \
        /* Rows 4-7, low-word interleave. */                                 \
        _s0_m   = __lasx_xvilvl_w(_in6, _in4);                               \
        _s1_m   = __lasx_xvilvl_w(_in7, _in5);                               \
        _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                             \
        _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                             \
        /* Rows 0-3, high-word interleave. */                                \
        _s0_m   = __lasx_xvilvh_w(_in2, _in0);                               \
        _s1_m   = __lasx_xvilvh_w(_in3, _in1);                               \
        _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                             \
        _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                             \
        /* Rows 4-7, high-word interleave. */                                \
        _s0_m   = __lasx_xvilvh_w(_in6, _in4);                               \
        _s1_m   = __lasx_xvilvh_w(_in7, _in5);                               \
        _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                             \
        _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                             \
                                                                             \
        /* Join the rows 0-3 and rows 4-7 partial columns: selector 0x20 */  \
        /* for _out0.._out3, selector 0x31 for _out4.._out7.             */  \
        _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                    \
        _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                    \
        _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                    \
        _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                    \
        _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                    \
        _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                    \
        _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                    \
        _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                    \
    }
1528 
/*
 * =============================================================================
 * Description : Transpose input 16x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows. The work is done in three interleave passes (byte, word,
 *               double-word); _out0.._out7 double as scratch storage between
 *               the passes. All inputs are read before any output is written.
 *               Every argument is expanded more than once, so pass plain
 *               variables without side effects.
 * Example     : See LASX_TRANSPOSE16x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,  \
                             _in15, _out0, _out1, _out2, _out3, _out4,       \
                             _out5, _out6, _out7)                            \
    {                                                                        \
        __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                          \
        __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                          \
                                                                             \
        /* Pass 1a: byte-interleave rows two apart (0/2, 1/3, 4/6, ...). */  \
        _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                               \
        _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                               \
        _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                               \
        _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                               \
        _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                              \
        _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                              \
        _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                             \
        _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                             \
                                                                             \
        /* Pass 1b: byte-interleave again so four rows share each vector. */ \
        _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                           \
        _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                           \
        _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                           \
        _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                           \
        _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                           \
        _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                           \
        _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                           \
        _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                           \
                                                                             \
        /* Pass 2: word-interleave the four-row groups into eight-row */     \
        /* groups, reading the scratch values held in _out0.._out7.   */     \
        _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                             \
        _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                             \
        _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                             \
        _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                             \
        _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                             \
        _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                             \
        _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                             \
        _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                             \
                                                                             \
        /* Pass 3: double-word interleave yields the final outputs. */       \
        _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                           \
        _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                           \
        _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                           \
        _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                           \
        _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                           \
        _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                           \
        _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                           \
        _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                           \
    }
1583 
/*
 * =============================================================================
 * Description : Transpose input 16x8 halfword block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
 *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
 *                         (input 16x8 halfword block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x16 halfword block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               Only the low 128 bits (8 halfwords) of each input contribute
 *               to the result; see the zero-filled upper halves in the example
 *               below. Each output holds one full column of 16 halfwords.
 *               All results are stored only after every input has been read,
 *               so the outputs may name the same variables as the inputs.
 * Example     : LASX_TRANSPOSE16x8_H
 *        _in0  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in1  : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in2  : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in3  : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in4  : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in5  : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in6  : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in7  : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in8  : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in9  : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *        _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
 *
 *        _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
 *        _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
 *        _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
 *        _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
 *        _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
 *        _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
 *        _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
 *        _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                             _in8, _in9, _in10, _in11, _in12, _in13, _in14, \
                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
                             _out6, _out7) \
    { \
        __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
        __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
        __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7; \
        __m256i _res0_m, _res1_m, _res2_m, _res3_m; \
        \
        /* Columns 0..3: start from the low four halfwords of each row. */ \
        _tmp0_m = __lasx_xvilvl_h(_in2, _in0); \
        _tmp1_m = __lasx_xvilvl_h(_in3, _in1); \
        _tmp2_m = __lasx_xvilvl_h(_in6, _in4); \
        _tmp3_m = __lasx_xvilvl_h(_in7, _in5); \
        _tmp4_m = __lasx_xvilvl_h(_in10, _in8); \
        _tmp5_m = __lasx_xvilvl_h(_in11, _in9); \
        _tmp6_m = __lasx_xvilvl_h(_in14, _in12); \
        _tmp7_m = __lasx_xvilvl_h(_in15, _in13); \
        _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
        _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
        _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
        _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
        _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
        _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
        _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
        _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
        _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
        _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
        _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
        _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
        _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
        _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
        _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
        _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
        /* Keep these in temporaries: the inputs are read again below, */ \
        /* and an output may be the same variable as an input.         */ \
        _res0_m = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
        _res1_m = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
        _res2_m = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
        _res3_m = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
        \
        /* Columns 4..7: same steps on the high four halfwords. */ \
        _tmp0_m = __lasx_xvilvh_h(_in2, _in0); \
        _tmp1_m = __lasx_xvilvh_h(_in3, _in1); \
        _tmp2_m = __lasx_xvilvh_h(_in6, _in4); \
        _tmp3_m = __lasx_xvilvh_h(_in7, _in5); \
        _tmp4_m = __lasx_xvilvh_h(_in10, _in8); \
        _tmp5_m = __lasx_xvilvh_h(_in11, _in9); \
        _tmp6_m = __lasx_xvilvh_h(_in14, _in12); \
        _tmp7_m = __lasx_xvilvh_h(_in15, _in13); \
        _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m); \
        _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m); \
        _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m); \
        _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m); \
        _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m); \
        _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m); \
        _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m); \
        _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m); \
        _tmp0_m = __lasx_xvilvl_d(_t2, _t0); \
        _tmp2_m = __lasx_xvilvh_d(_t2, _t0); \
        _tmp4_m = __lasx_xvilvl_d(_t3, _t1); \
        _tmp6_m = __lasx_xvilvh_d(_t3, _t1); \
        _tmp1_m = __lasx_xvilvl_d(_t6, _t4); \
        _tmp3_m = __lasx_xvilvh_d(_t6, _t4); \
        _tmp5_m = __lasx_xvilvl_d(_t7, _t5); \
        _tmp7_m = __lasx_xvilvh_d(_t7, _t5); \
        \
        /* Every input has been consumed; the outputs can be stored now. */ \
        _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20); \
        _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20); \
        _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20); \
        _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20); \
        _out0 = _res0_m; \
        _out1 = _res1_m; \
        _out2 = _res2_m; \
        _out3 = _res3_m; \
    }
1689 
/*
 * =============================================================================
 * Description : Transpose 4x4 block with halfword elements in vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 *               Return Type - signed halfword
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               Only the low four halfwords of each 128-bit half of the
 *               inputs are used, and the two halves are handled
 *               independently. The transposed row sits in the low 64 bits of
 *               each half of an output. The upper 64 bits are not cleared:
 *               _out0 and _out2 carry the following row (1 and 3) there,
 *               while _out1 and _out3 repeat their own row.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
                            _out3) \
    { \
        __m256i _s0_m, _s1_m; \
        \
        /* Pair rows 0/1 and rows 2/3 halfword by halfword. */ \
        _s0_m = __lasx_xvilvl_h(_in1, _in0); \
        _s1_m = __lasx_xvilvl_h(_in3, _in2); \
        /* Each 64-bit group now holds one transposed row. */ \
        _out0 = __lasx_xvilvl_w(_s1_m, _s0_m); \
        _out2 = __lasx_xvilvh_w(_s1_m, _s0_m); \
        /* Rows 1 and 3 are the upper 64 bits of _out0 and _out2. */ \
        _out1 = __lasx_xvilvh_d(_out0, _out0); \
        _out3 = __lasx_xvilvh_d(_out2, _out2); \
    }
1713 
/*
 * =============================================================================
 * Description : Transpose input 8x8 byte block
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
 *                         (input 8x8 byte block)
 *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
 *                         _out7 (output 8x8 byte block)
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               Only the low 8 bytes of each 128-bit half of the inputs are
 *               used, and the two halves are handled independently. The
 *               transposed row sits in the low 8 bytes of each half of an
 *               output. In the upper 8 bytes the even outputs (_out0, _out2,
 *               _out4, _out6) still carry the following row, while the odd
 *               outputs are zero because they are produced by a byte shift.
 *               Every input is read before the first output is written.
 * Example     : See LASX_TRANSPOSE8x8_H
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
    { \
        __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
        __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
        \
        /* Stage 1: byte-interleave row r with row r + 2. */ \
        _tmp0_m = __lasx_xvilvl_b(_in2, _in0); \
        _tmp1_m = __lasx_xvilvl_b(_in3, _in1); \
        _tmp2_m = __lasx_xvilvl_b(_in6, _in4); \
        _tmp3_m = __lasx_xvilvl_b(_in7, _in5); \
        /* Stage 2: each 32-bit word holds one column of four rows. */ \
        _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m); \
        _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m); \
        _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m); \
        _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m); \
        /* Stage 3: two complete columns per 128-bit half. */ \
        _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m); \
        _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m); \
        _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m); \
        _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m); \
        /* Odd rows: move the second column down by 8 bytes. */ \
        _out1 = __lasx_xvbsrl_v(_out0, 8); \
        _out3 = __lasx_xvbsrl_v(_out2, 8); \
        _out5 = __lasx_xvbsrl_v(_out4, 8); \
        _out7 = __lasx_xvbsrl_v(_out6, 8); \
    }
1747 
/*
 * =============================================================================
 * Description : Transpose 8x8 block with halfword elements in vectors.
 * Arguments   : Inputs  - _in0, _in1, ~
 *               Outputs - _out0, _out1, ~
 * Details     : The rows of the matrix become columns, and the columns become
 *               rows.
 *               Each 128-bit half of the registers holds its own 8x8 block
 *               and is transposed independently, as the example shows.
 *               Every input is read before the first output is written, so
 *               the outputs may name the same variables as the inputs.
 * Example     : LASX_TRANSPOSE8x8_H
 *        _in0  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in1  : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in2  : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
 *        _in3  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in4  : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *        _in5  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in6  : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
 *        _in7  : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
 *
 *        _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
 *        _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
 *        _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
 *        _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
 *        _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
 *        _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
 *        _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
 *        _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
 * =============================================================================
 */
#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                            _out7) \
    { \
        __m256i _s0_m, _s1_m; \
        __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m; \
        __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m; \
        \
        /* Rows 4..7: _tmp0.._tmp3 each hold two columns of four rows. */ \
        _s0_m = __lasx_xvilvl_h(_in6, _in4); \
        _s1_m = __lasx_xvilvl_h(_in7, _in5); \
        _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
        _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
        _s0_m = __lasx_xvilvh_h(_in6, _in4); \
        _s1_m = __lasx_xvilvh_h(_in7, _in5); \
        _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
        _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
        \
        /* Rows 0..3: same layout in _tmp4.._tmp7. */ \
        _s0_m = __lasx_xvilvl_h(_in2, _in0); \
        _s1_m = __lasx_xvilvl_h(_in3, _in1); \
        _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
        _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
        _s0_m = __lasx_xvilvh_h(_in2, _in0); \
        _s1_m = __lasx_xvilvh_h(_in3, _in1); \
        _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m); \
        _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m); \
        \
        /* Join rows 0..3 (low 64 bits) with rows 4..7 (high 64 bits). */ \
        _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m); \
        _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m); \
        _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m); \
        _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m); \
        _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m); \
        _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m); \
        _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m); \
        _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m); \
    }
1810 
/*
 * =============================================================================
 * Description : Butterfly of 4 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3
 *               Outputs - _out0, _out1, _out2, _out3
 * Details     : Butterfly operation. All four results are computed before
 *               any output is stored, so the outputs may name the same
 *               variables as the inputs (in-place use).
 * Example     : LASX_BUTTERFLY_4
 *               _out0 = _in0 + _in3;
 *               _out1 = _in1 + _in2;
 *               _out2 = _in1 - _in2;
 *               _out3 = _in0 - _in3;
 * =============================================================================
 */
#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
        __m256i _sum03_m = __lasx_xvadd_b(_in0, _in3); \
        __m256i _sum12_m = __lasx_xvadd_b(_in1, _in2); \
        __m256i _dif12_m = __lasx_xvsub_b(_in1, _in2); \
        __m256i _dif03_m = __lasx_xvsub_b(_in0, _in3); \
        \
        _out0 = _sum03_m; \
        _out1 = _sum12_m; \
        _out2 = _dif12_m; \
        _out3 = _dif03_m; \
    }
/* Halfword variant of the 4-input butterfly; outputs may alias inputs. */
#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
        __m256i _sum03_m = __lasx_xvadd_h(_in0, _in3); \
        __m256i _sum12_m = __lasx_xvadd_h(_in1, _in2); \
        __m256i _dif12_m = __lasx_xvsub_h(_in1, _in2); \
        __m256i _dif03_m = __lasx_xvsub_h(_in0, _in3); \
        \
        _out0 = _sum03_m; \
        _out1 = _sum12_m; \
        _out2 = _dif12_m; \
        _out3 = _dif03_m; \
    }
/* Word variant of the 4-input butterfly; outputs may alias inputs. */
#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
        __m256i _sum03_m = __lasx_xvadd_w(_in0, _in3); \
        __m256i _sum12_m = __lasx_xvadd_w(_in1, _in2); \
        __m256i _dif12_m = __lasx_xvsub_w(_in1, _in2); \
        __m256i _dif03_m = __lasx_xvsub_w(_in0, _in3); \
        \
        _out0 = _sum03_m; \
        _out1 = _sum12_m; \
        _out2 = _dif12_m; \
        _out3 = _dif03_m; \
    }
/* Doubleword variant of the 4-input butterfly; outputs may alias inputs. */
#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
    { \
        __m256i _sum03_m = __lasx_xvadd_d(_in0, _in3); \
        __m256i _sum12_m = __lasx_xvadd_d(_in1, _in2); \
        __m256i _dif12_m = __lasx_xvsub_d(_in1, _in2); \
        __m256i _dif03_m = __lasx_xvsub_d(_in0, _in3); \
        \
        _out0 = _sum03_m; \
        _out1 = _sum12_m; \
        _out2 = _dif12_m; \
        _out3 = _dif03_m; \
    }
1852 
/*
 * =============================================================================
 * Description : Butterfly of 8 input vectors
 * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
 *               Outputs - _out0, _out1, _out2, _out3, ~
 * Details     : Butterfly operation. All eight results are computed before
 *               any output is stored, so the outputs may name the same
 *               variables as the inputs (in-place use).
 * Example     : LASX_BUTTERFLY_8
 *               _out0 = _in0 + _in7;
 *               _out1 = _in1 + _in6;
 *               _out2 = _in2 + _in5;
 *               _out3 = _in3 + _in4;
 *               _out4 = _in3 - _in4;
 *               _out5 = _in2 - _in5;
 *               _out6 = _in1 - _in6;
 *               _out7 = _in0 - _in7;
 * =============================================================================
 */
#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
    { \
        __m256i _sum0_m = __lasx_xvadd_b(_in0, _in7); \
        __m256i _sum1_m = __lasx_xvadd_b(_in1, _in6); \
        __m256i _sum2_m = __lasx_xvadd_b(_in2, _in5); \
        __m256i _sum3_m = __lasx_xvadd_b(_in3, _in4); \
        __m256i _dif3_m = __lasx_xvsub_b(_in3, _in4); \
        __m256i _dif2_m = __lasx_xvsub_b(_in2, _in5); \
        __m256i _dif1_m = __lasx_xvsub_b(_in1, _in6); \
        __m256i _dif0_m = __lasx_xvsub_b(_in0, _in7); \
        \
        _out0 = _sum0_m; \
        _out1 = _sum1_m; \
        _out2 = _sum2_m; \
        _out3 = _sum3_m; \
        _out4 = _dif3_m; \
        _out5 = _dif2_m; \
        _out6 = _dif1_m; \
        _out7 = _dif0_m; \
    }
1883 
/* Halfword variant of the 8-input butterfly; outputs may alias inputs. */
#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
    { \
        __m256i _sum0_m = __lasx_xvadd_h(_in0, _in7); \
        __m256i _sum1_m = __lasx_xvadd_h(_in1, _in6); \
        __m256i _sum2_m = __lasx_xvadd_h(_in2, _in5); \
        __m256i _sum3_m = __lasx_xvadd_h(_in3, _in4); \
        __m256i _dif3_m = __lasx_xvsub_h(_in3, _in4); \
        __m256i _dif2_m = __lasx_xvsub_h(_in2, _in5); \
        __m256i _dif1_m = __lasx_xvsub_h(_in1, _in6); \
        __m256i _dif0_m = __lasx_xvsub_h(_in0, _in7); \
        \
        _out0 = _sum0_m; \
        _out1 = _sum1_m; \
        _out2 = _sum2_m; \
        _out3 = _sum3_m; \
        _out4 = _dif3_m; \
        _out5 = _dif2_m; \
        _out6 = _dif1_m; \
        _out7 = _dif0_m; \
    }
1897 
/* Word variant of the 8-input butterfly; outputs may alias inputs. */
#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
    { \
        __m256i _sum0_m = __lasx_xvadd_w(_in0, _in7); \
        __m256i _sum1_m = __lasx_xvadd_w(_in1, _in6); \
        __m256i _sum2_m = __lasx_xvadd_w(_in2, _in5); \
        __m256i _sum3_m = __lasx_xvadd_w(_in3, _in4); \
        __m256i _dif3_m = __lasx_xvsub_w(_in3, _in4); \
        __m256i _dif2_m = __lasx_xvsub_w(_in2, _in5); \
        __m256i _dif1_m = __lasx_xvsub_w(_in1, _in6); \
        __m256i _dif0_m = __lasx_xvsub_w(_in0, _in7); \
        \
        _out0 = _sum0_m; \
        _out1 = _sum1_m; \
        _out2 = _sum2_m; \
        _out3 = _sum3_m; \
        _out4 = _dif3_m; \
        _out5 = _dif2_m; \
        _out6 = _dif1_m; \
        _out7 = _dif0_m; \
    }
1911 
/* Doubleword variant of the 8-input butterfly; outputs may alias inputs. */
#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
                           _out7) \
    { \
        __m256i _sum0_m = __lasx_xvadd_d(_in0, _in7); \
        __m256i _sum1_m = __lasx_xvadd_d(_in1, _in6); \
        __m256i _sum2_m = __lasx_xvadd_d(_in2, _in5); \
        __m256i _sum3_m = __lasx_xvadd_d(_in3, _in4); \
        __m256i _dif3_m = __lasx_xvsub_d(_in3, _in4); \
        __m256i _dif2_m = __lasx_xvsub_d(_in2, _in5); \
        __m256i _dif1_m = __lasx_xvsub_d(_in1, _in6); \
        __m256i _dif0_m = __lasx_xvsub_d(_in0, _in7); \
        \
        _out0 = _sum0_m; \
        _out1 = _sum1_m; \
        _out2 = _sum2_m; \
        _out3 = _sum3_m; \
        _out4 = _dif3_m; \
        _out5 = _dif2_m; \
        _out6 = _dif1_m; \
        _out7 = _dif0_m; \
    }
1925 
1926 #endif // LASX
1927 
/*
 * =============================================================================
 * Description : Print out elements in vector.
 * Arguments   : Inputs  - RTYPE, element_num, in0, enter
 *               Outputs -
 * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0', if
 *               'enter' is TRUE, prefix "\nVP:" will be added first.
 *               'in0' is cast to RTYPE, which must support subscripting.
 *               Every element is printed with "%d", so the element type
 *               should be one that is passed to printf as an int.
 *               'element_num' is evaluated once per loop iteration.
 *               The macro calls printf, so <stdio.h> must be included by the
 *               file that uses it.
 * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
 *               VP:1,2,3,4,
 * =============================================================================
 */
#define VECT_PRINT(RTYPE, element_num, in0, enter) \
    { \
        /* Arguments are parenthesised so that expressions such as */ \
        /* "a + b" or "n > 4 ? 4 : n" expand as the caller intended. */ \
        RTYPE _tmp0 = (RTYPE)(in0); \
        int _i; \
        if (enter) { \
            printf("\nVP:"); \
        } \
        for (_i = 0; _i < (element_num); _i++) { \
            printf("%d,", _tmp0[_i]); \
        } \
    }
1946 
1947 #endif /* LOONGSON_INTRINSICS_H */
1948 #endif /* AVUTIL_LOONGARCH_LOONGSON_INTRINSICS_H */
out
FILE * out
Definition: movenc.c:54
max
#define max(a, b)
Definition: cuda_runtime.h:33
zero
#define zero
Definition: regdef.h:64
min
float min
Definition: vorbis_enc_data.h:429