simple_idct_lasx.c
/*
 * Copyright (c) 2021 Loongson Technology Corporation Limited
 * Contributed by Hao Chen <chenhao@loongson.cn>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/loongarch/loongson_intrinsics.h"
#include "idctdsp_loongarch.h"

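/* Transpose a tile of four 256-bit vectors of 16-bit coefficients
 * (4 x 16 halfwords) using 128-bit lane permutes followed by halfword
 * and word interleaves. */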
#define LASX_TRANSPOSE4x16(in_0, in_1, in_2, in_3, out_0, out_1, out_2, out_3) \
{ \
    __m256i temp_0, temp_1, temp_2, temp_3; \
    __m256i temp_4, temp_5, temp_6, temp_7; \
    DUP4_ARG3(__lasx_xvpermi_q, in_2, in_0, 0x20, in_2, in_0, 0x31, in_3, in_1, \
              0x20, in_3, in_1, 0x31, temp_0, temp_1, temp_2, temp_3); \
    DUP2_ARG2(__lasx_xvilvl_h, temp_1, temp_0, temp_3, temp_2, temp_4, temp_6); \
    DUP2_ARG2(__lasx_xvilvh_h, temp_1, temp_0, temp_3, temp_2, temp_5, temp_7); \
    DUP2_ARG2(__lasx_xvilvl_w, temp_6, temp_4, temp_7, temp_5, out_0, out_2); \
    DUP2_ARG2(__lasx_xvilvh_w, temp_6, temp_4, temp_7, temp_5, out_1, out_3); \
}

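/* Row pass of the 8x8 IDCT (cf. idctRowCondDC in simple_idct_template.c).
 * select_vec flags rows whose coefficients are all zero so that they can
 * bypass the full butterfly via __lasx_xvbitsel_v; the results are shifted
 * right by 11 and repacked to 16-bit values. */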
#define LASX_IDCTROWCONDDC \
    const_val  = 16383 * ((1 << 19) / 16383); \
    const_val1 = __lasx_xvreplgr2vr_w(const_val); \
    DUP4_ARG2(__lasx_xvld, block, 0, block, 32, block, 64, block, 96, \
              in0, in1, in2, in3); \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \
    a0 = __lasx_xvpermi_d(in0, 0xD8); \
    a0 = __lasx_vext2xv_w_h(a0); \
    temp = __lasx_xvslli_w(a0, 3); \
    a1 = __lasx_xvpermi_d(in0, 0x8D); \
    a1 = __lasx_vext2xv_w_h(a1); \
    a2 = __lasx_xvpermi_d(in1, 0xD8); \
    a2 = __lasx_vext2xv_w_h(a2); \
    a3 = __lasx_xvpermi_d(in1, 0x8D); \
    a3 = __lasx_vext2xv_w_h(a3); \
    b0 = __lasx_xvpermi_d(in2, 0xD8); \
    b0 = __lasx_vext2xv_w_h(b0); \
    b1 = __lasx_xvpermi_d(in2, 0x8D); \
    b1 = __lasx_vext2xv_w_h(b1); \
    b2 = __lasx_xvpermi_d(in3, 0xD8); \
    b2 = __lasx_vext2xv_w_h(b2); \
    b3 = __lasx_xvpermi_d(in3, 0x8D); \
    b3 = __lasx_vext2xv_w_h(b3); \
    select_vec = a0 | a1 | a2 | a3 | b0 | b1 | b2 | b3; \
    select_vec = __lasx_xvslti_wu(select_vec, 1); \
    \
    DUP4_ARG2(__lasx_xvrepl128vei_h, w1, 2, w1, 3, w1, 4, w1, 5, \
              w2, w3, w4, w5); \
    DUP2_ARG2(__lasx_xvrepl128vei_h, w1, 6, w1, 7, w6, w7); \
    w1 = __lasx_xvrepl128vei_h(w1, 1); \
    \
    /* part of FUNC6(idctRowCondDC) */ \
    temp0 = __lasx_xvmaddwl_w_h(const_val0, in0, w4); \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \
    a0 = __lasx_xvadd_w(temp0, temp1); \
    a1 = __lasx_xvadd_w(temp0, temp2); \
    a2 = __lasx_xvsub_w(temp0, temp2); \
    a3 = __lasx_xvsub_w(temp0, temp1); \
    \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \
    b0 = __lasx_xvdp2_w_h(temp0, temp1); \
    temp1 = __lasx_xvneg_h(w7); \
    temp2 = __lasx_xvilvl_h(temp1, w3); \
    b1 = __lasx_xvdp2_w_h(temp0, temp2); \
    temp1 = __lasx_xvneg_h(w1); \
    temp2 = __lasx_xvilvl_h(temp1, w5); \
    b2 = __lasx_xvdp2_w_h(temp0, temp2); \
    temp1 = __lasx_xvneg_h(w5); \
    temp2 = __lasx_xvilvl_h(temp1, w7); \
    b3 = __lasx_xvdp2_w_h(temp0, temp2); \
    \
    /* if (AV_RN64A(row + 4)) */ \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \
    temp1 = __lasx_xvilvl_h(w2, w4); \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \
    temp1 = __lasx_xvneg_h(w4); \
    temp2 = __lasx_xvilvl_h(w2, temp1); \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \
    temp1 = __lasx_xvneg_h(w6); \
    temp2 = __lasx_xvilvl_h(temp1, w4); \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \
    \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \
    temp1 = __lasx_xvneg_h(w1); \
    temp2 = __lasx_xvilvl_h(temp1, w3); \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \
    \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \
              temp0, temp1, temp2, temp3); \
    DUP4_ARG2(__lasx_xvsub_w, a0, b0, a1, b1, a2, b2, a3, b3, \
              a0, a1, a2, a3); \
    DUP4_ARG2(__lasx_xvsrai_w, temp0, 11, temp1, 11, temp2, 11, temp3, 11, \
              temp0, temp1, temp2, temp3); \
    DUP4_ARG2(__lasx_xvsrai_w, a0, 11, a1, 11, a2, 11, a3, 11, a0, a1, a2, a3); \
    DUP4_ARG3(__lasx_xvbitsel_v, temp0, temp, select_vec, temp1, temp, \
              select_vec, temp2, temp, select_vec, temp3, temp, select_vec, \
              in0, in1, in2, in3); \
    DUP4_ARG3(__lasx_xvbitsel_v, a0, temp, select_vec, a1, temp, \
              select_vec, a2, temp, select_vec, a3, temp, select_vec, \
              a0, a1, a2, a3); \
    DUP4_ARG2(__lasx_xvpickev_h, in1, in0, in3, in2, a2, a3, a0, a1, \
              in0, in1, in2, in3); \
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8, \
              in0, in1, in2, in3); \

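/* Column pass (cf. idctSparseCol in simple_idct_template.c): the same
 * butterfly as the row pass, seeded with the const_val1 rounding bias and
 * finished with __lasx_xvsrani_h_w, which shifts right by 20 and narrows
 * the results back to 16-bit samples. */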
#define LASX_IDCTCOLS \
    /* part of FUNC6(idctSparseCol) */ \
    LASX_TRANSPOSE4x16(in0, in1, in2, in3, in0, in1, in2, in3); \
    temp0 = __lasx_xvmaddwl_w_h(const_val1, in0, w4); \
    DUP2_ARG2(__lasx_xvmulwl_w_h, in1, w2, in1, w6, temp1, temp2); \
    a0 = __lasx_xvadd_w(temp0, temp1); \
    a1 = __lasx_xvadd_w(temp0, temp2); \
    a2 = __lasx_xvsub_w(temp0, temp2); \
    a3 = __lasx_xvsub_w(temp0, temp1); \
    \
    DUP2_ARG2(__lasx_xvilvh_h, in1, in0, w3, w1, temp0, temp1); \
    b0 = __lasx_xvdp2_w_h(temp0, temp1); \
    temp1 = __lasx_xvneg_h(w7); \
    temp2 = __lasx_xvilvl_h(temp1, w3); \
    b1 = __lasx_xvdp2_w_h(temp0, temp2); \
    temp1 = __lasx_xvneg_h(w1); \
    temp2 = __lasx_xvilvl_h(temp1, w5); \
    b2 = __lasx_xvdp2_w_h(temp0, temp2); \
    temp1 = __lasx_xvneg_h(w5); \
    temp2 = __lasx_xvilvl_h(temp1, w7); \
    b3 = __lasx_xvdp2_w_h(temp0, temp2); \
    \
    /* if (AV_RN64A(row + 4)) */ \
    DUP2_ARG2(__lasx_xvilvl_h, in3, in2, w6, w4, temp0, temp1); \
    a0 = __lasx_xvdp2add_w_h(a0, temp0, temp1); \
    temp1 = __lasx_xvilvl_h(w2, w4); \
    a1 = __lasx_xvdp2sub_w_h(a1, temp0, temp1); \
    temp1 = __lasx_xvneg_h(w4); \
    temp2 = __lasx_xvilvl_h(w2, temp1); \
    a2 = __lasx_xvdp2add_w_h(a2, temp0, temp2); \
    temp1 = __lasx_xvneg_h(w6); \
    temp2 = __lasx_xvilvl_h(temp1, w4); \
    a3 = __lasx_xvdp2add_w_h(a3, temp0, temp2); \
    \
    DUP2_ARG2(__lasx_xvilvh_h, in3, in2, w7, w5, temp0, temp1); \
    b0 = __lasx_xvdp2add_w_h(b0, temp0, temp1); \
    DUP2_ARG2(__lasx_xvilvl_h, w5, w1, w3, w7, temp1, temp2); \
    b1 = __lasx_xvdp2sub_w_h(b1, temp0, temp1); \
    b2 = __lasx_xvdp2add_w_h(b2, temp0, temp2); \
    temp1 = __lasx_xvneg_h(w1); \
    temp2 = __lasx_xvilvl_h(temp1, w3); \
    b3 = __lasx_xvdp2add_w_h(b3, temp0, temp2); \
    \
    DUP4_ARG2(__lasx_xvadd_w, a0, b0, a1, b1, a2, b2, a3, b3, \
              temp0, temp1, temp2, temp3); \
    DUP4_ARG2(__lasx_xvsub_w, a3, b3, a2, b2, a1, b1, a0, b0, \
              a3, a2, a1, a0); \
    DUP4_ARG3(__lasx_xvsrani_h_w, temp1, temp0, 20, temp3, temp2, 20, a2, a3, \
              20, a0, a1, 20, in0, in1, in2, in3); \

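/* In-place 8x8 inverse DCT: block is both the coefficient input and the
 * destination of the reconstructed 16-bit samples. */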
void ff_simple_idct_lasx(int16_t *block)
{
    int32_t const_val = 1 << 10;
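    /* Each 128-bit lane of w1 packs the simple-IDCT weights as halfwords:
     * elements 1..7 hold 22725, 21407, 19265, 16383, 12873, 8867 and 4520
     * (the W1..W7 constants of FFmpeg's simple IDCT); element 0 is unused.
     * The row/column macros broadcast them with __lasx_xvrepl128vei_h. */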
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    __lasx_xvst(in0, block, 0);
    __lasx_xvst(in1, block, 32);
    __lasx_xvst(in2, block, 64);
    __lasx_xvst(in3, block, 96);
}

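/* IDCT followed by a put: the reconstructed samples overwrite eight
 * 8-pixel rows starting at dst. */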
void ff_simple_idct_put_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;
    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
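    /* Clamp to the 0..255 pixel range, pack to bytes and store the eight
     * output rows; each __lasx_xvstelm_d writes one 64-bit row. */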
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}

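/* IDCT followed by an add: the reconstructed residual is added to the
 * pixels already at dst, clamped to 0..255 and stored back. */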
void ff_simple_idct_add_lasx(uint8_t *dst, ptrdiff_t dst_stride,
                             int16_t *block)
{
    int32_t const_val = 1 << 10;
    uint8_t *dst1 = dst;
    ptrdiff_t dst_stride_2x = dst_stride << 1;
    ptrdiff_t dst_stride_4x = dst_stride << 2;
    ptrdiff_t dst_stride_3x = dst_stride_2x + dst_stride;

    __m256i w1 = {0x4B42539F58C50000, 0x11A822A332493FFF,
                  0x4B42539F58C50000, 0x11A822A332493FFF};
    __m256i sh = {0x0003000200010000, 0x000B000A00090008,
                  0x0007000600050004, 0x000F000E000D000C};
    __m256i in0, in1, in2, in3;
    __m256i w2, w3, w4, w5, w6, w7;
    __m256i a0, a1, a2, a3;
    __m256i b0, b1, b2, b3;
    __m256i temp0, temp1, temp2, temp3;
    __m256i const_val0 = __lasx_xvreplgr2vr_w(const_val);
    __m256i const_val1, select_vec, temp;

    LASX_IDCTROWCONDDC
    LASX_IDCTCOLS
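    /* Load the eight destination rows and widen them from bytes to
     * unsigned halfwords so they can be added to the IDCT output. */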
    a0 = __lasx_xvldrepl_d(dst1, 0);
    a0 = __lasx_vext2xv_hu_bu(a0);
    dst1 += dst_stride;
    a1 = __lasx_xvldrepl_d(dst1, 0);
    a1 = __lasx_vext2xv_hu_bu(a1);
    dst1 += dst_stride;
    a2 = __lasx_xvldrepl_d(dst1, 0);
    a2 = __lasx_vext2xv_hu_bu(a2);
    dst1 += dst_stride;
    a3 = __lasx_xvldrepl_d(dst1, 0);
    a3 = __lasx_vext2xv_hu_bu(a3);
    dst1 += dst_stride;
    b0 = __lasx_xvldrepl_d(dst1, 0);
    b0 = __lasx_vext2xv_hu_bu(b0);
    dst1 += dst_stride;
    b1 = __lasx_xvldrepl_d(dst1, 0);
    b1 = __lasx_vext2xv_hu_bu(b1);
    dst1 += dst_stride;
    b2 = __lasx_xvldrepl_d(dst1, 0);
    b2 = __lasx_vext2xv_hu_bu(b2);
    dst1 += dst_stride;
    b3 = __lasx_xvldrepl_d(dst1, 0);
    b3 = __lasx_vext2xv_hu_bu(b3);
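    /* Regroup the widened destination rows to match in0..in3, add the
     * residual, then clamp, pack and store the eight rows back to dst. */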
    DUP4_ARG3(__lasx_xvshuf_h, sh, a1, a0, sh, a3, a2, sh, b1, b0, sh, b3, b2,
              temp0, temp1, temp2, temp3);
    DUP4_ARG2(__lasx_xvadd_h, temp0, in0, temp1, in1, temp2, in2, temp3, in3,
              in0, in1, in2, in3);
    DUP4_ARG2(__lasx_xvpermi_d, in0, 0xD8, in1, 0xD8, in2, 0xD8, in3, 0xD8,
              in0, in1, in2, in3);
    DUP4_ARG1(__lasx_xvclip255_h, in0, in1, in2, in3, in0, in1, in2, in3);
    DUP2_ARG2(__lasx_xvpickev_b, in1, in0, in3, in2, in0, in1);
    __lasx_xvstelm_d(in0, dst, 0, 0);
    __lasx_xvstelm_d(in0, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in0, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in0, dst + dst_stride_3x, 0, 3);
    dst += dst_stride_4x;
    __lasx_xvstelm_d(in1, dst, 0, 0);
    __lasx_xvstelm_d(in1, dst + dst_stride, 0, 2);
    __lasx_xvstelm_d(in1, dst + dst_stride_2x, 0, 1);
    __lasx_xvstelm_d(in1, dst + dst_stride_3x, 0, 3);
}