FFmpeg
hevc_idct_lsx.c
1 /*
2  * Copyright (c) 2022 Loongson Technology Corporation Limited
3  * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
4  * Hao Chen <chenhao@loongson.cn>
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22 
23 #include "libavutil/loongarch/loongson_intrinsics.h"
24 #include "hevcdsp_lsx.h"
25 
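/* The gt*_cnst tables below hold coefficients of the HEVC inverse transform
 * matrices, pre-arranged as halfword pairs so that one __lsx_vldrepl_w load
 * broadcasts a coefficient pair and __lsx_vdp2_w_h / __lsx_vdp2add_w_h apply
 * both coefficients per 32-bit lane in a single pairwise dot product. */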
26 static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
27  64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
28 };
29 
30 static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
31  64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
32  64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
33  64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
34  64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
35 };
36 
37 static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
38  90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
39  90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
40  88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
41  85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
42  82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
43  78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
44  73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
45  67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
46  61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
47  54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
48  46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
49  38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
50  31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
51  22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
52  13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
53  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
54 };
55 
56 static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
57  90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
58  80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
59  57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
60  25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
61 };
62 
63 static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
64  89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
65 };
66 
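/* One pass of the 4-point inverse DCT over four columns: the even part is
 * formed from rows 0/2 with weight 64, the odd part from rows 1/3 with
 * weights 83/36, and the four butterfly results are rounded by 'shift' and
 * saturated to the signed 16-bit range. */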
67 #define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
68  sum0, sum1, sum2, sum3, shift) \
69 { \
70  __m128i vec0, vec1, vec2, vec3, vec4, vec5; \
71  __m128i cnst64 = __lsx_vldi(0x0840); \
72  __m128i cnst83 = __lsx_vldi(0x0853); \
73  __m128i cnst36 = __lsx_vldi(0x0824); \
74  \
75  vec0 = __lsx_vdp2_w_h(in_r0, cnst64); \
76  vec1 = __lsx_vdp2_w_h(in_l0, cnst83); \
77  vec2 = __lsx_vdp2_w_h(in_r1, cnst64); \
78  vec3 = __lsx_vdp2_w_h(in_l1, cnst36); \
79  vec4 = __lsx_vdp2_w_h(in_l0, cnst36); \
80  vec5 = __lsx_vdp2_w_h(in_l1, cnst83); \
81  \
82  sum0 = __lsx_vadd_w(vec0, vec2); \
83  sum1 = __lsx_vsub_w(vec0, vec2); \
84  vec1 = __lsx_vadd_w(vec1, vec3); \
85  vec4 = __lsx_vsub_w(vec4, vec5); \
86  sum2 = __lsx_vsub_w(sum1, vec4); \
87  sum3 = __lsx_vsub_w(sum0, vec1); \
88  sum0 = __lsx_vadd_w(sum0, vec1); \
89  sum1 = __lsx_vadd_w(sum1, vec4); \
90  \
91  sum0 = __lsx_vsrari_w(sum0, shift); \
92  sum1 = __lsx_vsrari_w(sum1, shift); \
93  sum2 = __lsx_vsrari_w(sum2, shift); \
94  sum3 = __lsx_vsrari_w(sum3, shift); \
95  sum0 = __lsx_vsat_w(sum0, 15); \
96  sum1 = __lsx_vsat_w(sum1, 15); \
97  sum2 = __lsx_vsat_w(sum2, 15); \
98  sum3 = __lsx_vsat_w(sum3, 15); \
99 }
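#if 0
/*
 * Illustrative scalar sketch, not part of the original file and not compiled:
 * the per-column even/odd butterfly that HEVC_IDCT4x4_COL evaluates with the
 * 64/83/36 weights, a rounding right shift and clipping to the int16 range.
 * The function name is purely illustrative.
 */
static void hevc_idct4_col_scalar(const int16_t src[4], int32_t dst[4], int shift)
{
    const int32_t rnd = 1 << (shift - 1);
    int32_t e0 = 64 * (src[0] + src[2]);            /* even part */
    int32_t e1 = 64 * (src[0] - src[2]);
    int32_t o0 = 83 * src[1] + 36 * src[3];         /* odd part  */
    int32_t o1 = 36 * src[1] - 83 * src[3];
    int32_t out[4] = { e0 + o0, e1 + o1, e1 - o1, e0 - o0 };

    for (int i = 0; i < 4; i++) {                   /* round, shift, clip */
        int32_t v = (out[i] + rnd) >> shift;
        dst[i] = v < -32768 ? -32768 : (v > 32767 ? 32767 : v);
    }
}
#endif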
100 
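/* One pass of the 8-point inverse transform on eight rows of 16-bit
 * coefficients. The macro expects a local 'const int16_t *filter' pointing at
 * gt8x8_cnst in the enclosing scope: the even half (rows 0/4 and 2/6) and the
 * odd half (rows 1/5 and 3/7) are combined by butterflies, and the eight
 * results are narrowed back to int16 with a rounding shift of 'shift'. */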
101 #define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
102 { \
103  __m128i src0_r, src1_r, src2_r, src3_r; \
104  __m128i src0_l, src1_l, src2_l, src3_l; \
105  __m128i filter0, filter1, filter2, filter3; \
106  __m128i temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
107  __m128i temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
108  __m128i sum0_r, sum1_r, sum2_r, sum3_r; \
109  __m128i sum0_l, sum1_l, sum2_l, sum3_l; \
110  \
111  DUP4_ARG2(__lsx_vilvl_h, in4, in0, in6, in2, in5, in1, in3, in7, \
112  src0_r, src1_r, src2_r, src3_r); \
113  DUP4_ARG2(__lsx_vilvh_h, in4, in0, in6, in2, in5, in1, in3, in7, \
114  src0_l, src1_l, src2_l, src3_l); \
115  \
116  DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 8, \
117  filter, 12, filter0, filter1, filter2, filter3); \
118  DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
119  src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
120  temp1_r, temp1_l); \
121  \
122  LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
123  sum1_l, sum1_r); \
124  sum2_r = sum1_r; \
125  sum2_l = sum1_l; \
126  sum3_r = sum0_r; \
127  sum3_l = sum0_l; \
128  \
129  DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
130  src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
131  temp3_r, temp3_l); \
132  temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
133  temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
134  sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
135  sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
136  sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
137  sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
138  \
139  in0 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
140  in7 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
141  \
142  DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
143  src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
144  temp5_r, temp5_l); \
145  temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
146  temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
147  sum1_r = __lsx_vadd_w(sum1_r, temp4_r); \
148  sum1_l = __lsx_vadd_w(sum1_l, temp4_l); \
149  sum2_r = __lsx_vsub_w(sum2_r, temp4_r); \
150  sum2_l = __lsx_vsub_w(sum2_l, temp4_l); \
151  \
152  in3 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
153  in4 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
154  \
155  DUP4_ARG2(__lsx_vldrepl_w, filter, 16, filter, 20, filter, 24, \
156  filter, 28, filter0, filter1, filter2, filter3); \
157  DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
158  src1_r, filter1, src1_l, filter1, temp0_r, temp0_l, \
159  temp1_r, temp1_l); \
160  \
161  LSX_BUTTERFLY_4_W(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,\
162  sum1_l, sum1_r); \
163  sum2_r = sum1_r; \
164  sum2_l = sum1_l; \
165  sum3_r = sum0_r; \
166  sum3_l = sum0_l; \
167  \
168  DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter2, src2_l, filter2, \
169  src3_r, filter3, src3_l, filter3, temp2_r, temp2_l, \
170  temp3_r, temp3_l); \
171  temp2_r = __lsx_vadd_w(temp2_r, temp3_r); \
172  temp2_l = __lsx_vadd_w(temp2_l, temp3_l); \
173  sum0_r = __lsx_vadd_w(sum0_r, temp2_r); \
174  sum0_l = __lsx_vadd_w(sum0_l, temp2_l); \
175  sum3_r = __lsx_vsub_w(sum3_r, temp2_r); \
176  sum3_l = __lsx_vsub_w(sum3_l, temp2_l); \
177  \
178  in1 = __lsx_vssrarni_h_w(sum0_l, sum0_r, shift); \
179  in6 = __lsx_vssrarni_h_w(sum3_l, sum3_r, shift); \
180  \
181  DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter3, src2_l, filter3, \
182  src3_r, filter2, src3_l, filter2, temp4_r, temp4_l, \
183  temp5_r, temp5_l); \
184  temp4_r = __lsx_vsub_w(temp4_r, temp5_r); \
185  temp4_l = __lsx_vsub_w(temp4_l, temp5_l); \
186  sum1_r = __lsx_vsub_w(sum1_r, temp4_r); \
187  sum1_l = __lsx_vsub_w(sum1_l, temp4_l); \
188  sum2_r = __lsx_vadd_w(sum2_r, temp4_r); \
189  sum2_l = __lsx_vadd_w(sum2_l, temp4_l); \
190  \
191  in2 = __lsx_vssrarni_h_w(sum1_l, sum1_r, shift); \
192  in5 = __lsx_vssrarni_h_w(sum2_l, sum2_r, shift); \
193 }
194 
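/* One pass of the 16-point inverse transform on an 8-column slice. The macro
 * relies on 'buf_ptr', 'filter', 'j' and 'k' declared by the caller; each of
 * the four loop iterations consumes one column of gt16x16_cnst constants and
 * stores four rows of rounded results at symmetric positions in the 16x16
 * buffer (directly at buf_ptr and through ptr0/ptr1). */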
195 #define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
196  src4_r, src5_r, src6_r, src7_r, \
197  src0_l, src1_l, src2_l, src3_l, \
198  src4_l, src5_l, src6_l, src7_l, shift) \
199 { \
200  int16_t *ptr0, *ptr1; \
201  __m128i dst0, dst1; \
202  __m128i filter0, filter1, filter2, filter3; \
203  __m128i temp0_r, temp1_r, temp0_l, temp1_l; \
204  __m128i sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
205  __m128i sum3_l, res0_r, res1_r, res0_l, res1_l; \
206  \
207  ptr0 = (buf_ptr + 112); \
208  ptr1 = (buf_ptr + 128); \
209  k = -1; \
210  \
211  for (j = 0; j < 4; j++) \
212  { \
213  DUP4_ARG2(__lsx_vldrepl_w, filter, 0, filter, 4, filter, 16, \
214  filter, 20, filter0, filter1, filter2, filter3); \
215  DUP4_ARG2(__lsx_vdp2_w_h, src0_r, filter0, src0_l, filter0, \
216  src4_r, filter2, src4_l, filter2, sum0_r, sum0_l, \
217  sum2_r, sum2_l); \
218  DUP2_ARG2(__lsx_vdp2_w_h, src7_r, filter2, src7_l, filter2, \
219  sum3_r, sum3_l); \
220  DUP4_ARG3(__lsx_vdp2add_w_h, sum0_r, src1_r, filter1, sum0_l, \
221  src1_l, filter1, sum2_r, src5_r, filter3, sum2_l, \
222  src5_l, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
223  DUP2_ARG3(__lsx_vdp2add_w_h, sum3_r, src6_r, filter3, sum3_l, \
224  src6_l, filter3, sum3_r, sum3_l); \
225  \
226  sum1_r = sum0_r; \
227  sum1_l = sum0_l; \
228  \
229  DUP4_ARG2(__lsx_vldrepl_w, filter, 8, filter, 12, filter, 24, \
230  filter, 28, filter0, filter1, filter2, filter3); \
231  filter += 16; \
232  DUP2_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0, \
233  temp0_r, temp0_l); \
234  DUP2_ARG3(__lsx_vdp2add_w_h, sum2_r, src6_r, filter2, sum2_l, \
235  src6_l, filter2, sum2_r, sum2_l); \
236  DUP2_ARG2(__lsx_vdp2_w_h, src5_r, filter2, src5_l, filter2, \
237  temp1_r, temp1_l); \
238  \
239  sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
240  sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
241  sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
242  sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
243  sum3_r = __lsx_vsub_w(temp1_r, sum3_r); \
244  sum3_l = __lsx_vsub_w(temp1_l, sum3_l); \
245  \
246  DUP2_ARG2(__lsx_vdp2_w_h, src3_r, filter1, src3_l, filter1, \
247  temp0_r, temp0_l); \
248  DUP4_ARG3(__lsx_vdp2add_w_h, sum2_r, src7_r, filter3, sum2_l, \
249  src7_l, filter3, sum3_r, src4_r, filter3, sum3_l, \
250  src4_l, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
251  \
252  sum0_r = __lsx_vadd_w(sum0_r, temp0_r); \
253  sum0_l = __lsx_vadd_w(sum0_l, temp0_l); \
254  sum1_r = __lsx_vsub_w(sum1_r, temp0_r); \
255  sum1_l = __lsx_vsub_w(sum1_l, temp0_l); \
256  \
257  LSX_BUTTERFLY_4_W(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
258  res1_l, res1_r); \
259  dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
260  dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
261  __lsx_vst(dst0, buf_ptr, 0); \
262  __lsx_vst(dst1, (buf_ptr + ((15 - (j * 2)) << 4)), 0); \
263  \
264  LSX_BUTTERFLY_4_W(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
265  res1_l, res1_r); \
266  \
267  dst0 = __lsx_vssrarni_h_w(res0_l, res0_r, shift); \
268  dst1 = __lsx_vssrarni_h_w(res1_l, res1_r, shift); \
269  __lsx_vst(dst0, (ptr0 + ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
270  __lsx_vst(dst1, (ptr1 - ((((j + 1) >> 1) * 2 * k) << 4)), 0); \
271  \
272  k *= -1; \
273  buf_ptr += 16; \
274  } \
275 }
276 
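/* Butterfly update of the 32-bit staging buffer used by the 32-point
 * transform: the running sums are added to the row at 'load_idx' (stored back
 * in place) and the difference is written to the mirrored row at 'store_idx'.
 * tmp0_r/tmp0_l/tmp1_r/tmp1_l come from the enclosing scope. */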
277 #define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
278 { \
279  tmp0_r = __lsx_vld(input + load_idx * 8, 0); \
280  tmp0_l = __lsx_vld(input + load_idx * 8, 16); \
281  tmp1_r = sum0_r; \
282  tmp1_l = sum0_l; \
283  sum0_r = __lsx_vadd_w(sum0_r, tmp0_r); \
284  sum0_l = __lsx_vadd_w(sum0_l, tmp0_l); \
285  __lsx_vst(sum0_r, (input + load_idx * 8), 0); \
286  __lsx_vst(sum0_l, (input + load_idx * 8), 16); \
287  tmp1_r = __lsx_vsub_w(tmp1_r, tmp0_r); \
288  tmp1_l = __lsx_vsub_w(tmp1_l, tmp0_l); \
289  __lsx_vst(tmp1_r, (input + store_idx * 8), 0); \
290  __lsx_vst(tmp1_l, (input + store_idx * 8), 16); \
291 }
292 
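/* One pass of the 4-point inverse DST-VII (the alternate transform HEVC uses
 * for 4x4 intra luma blocks) on 32-bit columns, using the 29/55/74 weights,
 * with rounding by 'shift' and saturation to the signed 16-bit range. */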
293 #define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
294  res0, res1, res2, res3, shift) \
295 { \
296  __m128i vec0, vec1, vec2, vec3; \
297  __m128i cnst74 = __lsx_vldi(0x84a); \
298  __m128i cnst55 = __lsx_vldi(0x837); \
299  __m128i cnst29 = __lsx_vldi(0x81d); \
300  \
301  vec0 = __lsx_vadd_w(in_r0, in_r1); \
302  vec2 = __lsx_vsub_w(in_r0, in_l1); \
303  res0 = __lsx_vmul_w(vec0, cnst29); \
304  res1 = __lsx_vmul_w(vec2, cnst55); \
305  res2 = __lsx_vsub_w(in_r0, in_r1); \
306  vec1 = __lsx_vadd_w(in_r1, in_l1); \
307  res2 = __lsx_vadd_w(res2, in_l1); \
308  vec3 = __lsx_vmul_w(in_l0, cnst74); \
309  res3 = __lsx_vmul_w(vec0, cnst55); \
310  \
311  res0 = __lsx_vadd_w(res0, __lsx_vmul_w(vec1, cnst55)); \
312  res1 = __lsx_vsub_w(res1, __lsx_vmul_w(vec1, cnst29)); \
313  res2 = __lsx_vmul_w(res2, cnst74); \
314  res3 = __lsx_vadd_w(res3, __lsx_vmul_w(vec2, cnst29)); \
315  \
316  res0 = __lsx_vadd_w(res0, vec3); \
317  res1 = __lsx_vadd_w(res1, vec3); \
318  res3 = __lsx_vsub_w(res3, vec3); \
319  \
320  res0 = __lsx_vsrari_w(res0, shift); \
321  res1 = __lsx_vsrari_w(res1, shift); \
322  res2 = __lsx_vsrari_w(res2, shift); \
323  res3 = __lsx_vsrari_w(res3, shift); \
324  res0 = __lsx_vsat_w(res0, 15); \
325  res1 = __lsx_vsat_w(res1, 15); \
326  res2 = __lsx_vsat_w(res2, 15); \
327  res3 = __lsx_vsat_w(res3, 15); \
328 }
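#if 0
/*
 * Illustrative scalar sketch, not part of the original file and not compiled:
 * the per-column inverse DST-VII that HEVC_IDCT_LUMA4x4_COL evaluates with
 * the 29/55/74 weights. The function name is purely illustrative.
 */
static void hevc_idst4_col_scalar(const int32_t src[4], int32_t dst[4], int shift)
{
    const int32_t rnd = 1 << (shift - 1);
    int32_t c0 = src[0] + src[2];
    int32_t c1 = src[2] + src[3];
    int32_t c2 = src[0] - src[3];
    int32_t c3 = 74 * src[1];
    int32_t out[4] = {
        29 * c0 + 55 * c1 + c3,
        55 * c2 - 29 * c1 + c3,
        74 * (src[0] - src[2] + src[3]),
        55 * c0 + 29 * c2 - c3,
    };

    for (int i = 0; i < 4; i++) {                   /* round, shift, clip */
        int32_t v = (out[i] + rnd) >> shift;
        dst[i] = v < -32768 ? -32768 : (v > 32767 ? 32767 : v);
    }
}
#endif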
329 
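/* 4x4 inverse transform: vertical pass with shift 7, a 4x4 word transpose,
 * horizontal pass with shift 12 (the HEVC first/second-stage shifts for 8-bit
 * content), then the results are packed back to int16 and transposed into row
 * order before being stored. */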
330 void ff_hevc_idct_4x4_lsx(int16_t *coeffs, int col_limit)
331 {
332  __m128i in0, in1;
333  __m128i in_r0, in_l0, in_r1, in_l1;
334  __m128i sum0, sum1, sum2, sum3;
335  __m128i zero = __lsx_vldi(0x00);
336 
337  in0 = __lsx_vld(coeffs, 0);
338  in1 = __lsx_vld(coeffs, 16);
339  in_r0 = __lsx_vilvl_h(zero, in0);
340  in_l0 = __lsx_vilvh_h(zero, in0);
341  in_r1 = __lsx_vilvl_h(zero, in1);
342  in_l1 = __lsx_vilvh_h(zero, in1);
343 
344  HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
345  LSX_TRANSPOSE4x4_W(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
346  HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
347 
348  /* Pack and transpose */
349  in0 = __lsx_vpickev_h(sum2, sum0);
350  in1 = __lsx_vpickev_h(sum3, sum1);
351  sum0 = __lsx_vilvl_h(in1, in0);
352  sum1 = __lsx_vilvh_h(in1, in0);
353  in0 = __lsx_vilvl_w(sum1, sum0);
354  in1 = __lsx_vilvh_w(sum1, sum0);
355 
356  __lsx_vst(in0, coeffs, 0);
357  __lsx_vst(in1, coeffs, 16);
358 }
359 
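/* 8x8 inverse transform: load the eight rows, column pass (shift 7),
 * transpose, row pass (shift 12), transpose back and store. */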
360 void ff_hevc_idct_8x8_lsx(int16_t *coeffs, int col_limit)
361 {
362  const int16_t *filter = &gt8x8_cnst[0];
363  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
364 
365  DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 16, coeffs, 32,
366  coeffs, 48, in0, in1, in2, in3);
367  DUP4_ARG2(__lsx_vld, coeffs, 64, coeffs, 80, coeffs, 96,
368  coeffs, 112, in4, in5, in6, in7);
369  HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
370  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
371  in0, in1, in2, in3, in4, in5, in6, in7);
372  HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
373  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
374  in0, in1, in2, in3, in4, in5, in6, in7);
375 
376  __lsx_vst(in0, coeffs, 0);
377  __lsx_vst(in1, coeffs, 16);
378  __lsx_vst(in2, coeffs, 32);
379  __lsx_vst(in3, coeffs, 48);
380  __lsx_vst(in4, coeffs, 64);
381  __lsx_vst(in5, coeffs, 80);
382  __lsx_vst(in6, coeffs, 96);
383  __lsx_vst(in7, coeffs, 112);
384 }
385 
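/* 16x16 inverse transform. Each pass handles the block as two 8-column slices
 * through HEVC_IDCT16x16_COL: the first pass (shift 7) writes its result to
 * the on-stack 'buf', the second pass (shift 12) transposes that intermediate
 * and writes back to 'coeffs'. The trailing loads/stores finish with a
 * quadrant-wise 16x16 transpose: each 8x8 quadrant is transposed and the two
 * off-diagonal quadrants are swapped. */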
386 void ff_hevc_idct_16x16_lsx(int16_t *coeffs, int col_limit)
387 {
388  int16_t i, j, k;
389  int16_t buf[256];
390  int16_t *buf_ptr = &buf[0];
391  int16_t *src = coeffs;
392  const int16_t *filter = &gt16x16_cnst[0];
393  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
394  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
395  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
396  __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
397  __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
398 
399  for (i = 2; i--;) {
400  DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
401  in0, in1, in2, in3);
402  DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
403  in4, in5, in6, in7);
404  DUP4_ARG2(__lsx_vld, src, 256, src, 288, src, 320, src, 352,
405  in8, in9, in10, in11);
406  DUP4_ARG2(__lsx_vld, src, 384, src, 416, src, 448, src, 480,
407  in12, in13, in14, in15);
408 
409  DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
410  src0_r, src1_r, src2_r, src3_r);
411  DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
412  src4_r, src5_r, src6_r, src7_r);
413  DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
414  src0_l, src1_l, src2_l, src3_l);
415  DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
416  src4_l, src5_l, src6_l, src7_l);
417 
418  HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
419  src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
420  src4_l, src5_l, src6_l, src7_l, 7);
421 
422  src += 8;
423  buf_ptr = (&buf[0] + 8);
424  filter = &gt16x16_cnst[0];
425  }
426 
427  src = &buf[0];
428  buf_ptr = coeffs;
429  filter = &gt16x16_cnst[0];
430 
431  for (i = 2; i--;) {
432  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48,
433  in0, in8, in1, in9);
434  DUP4_ARG2(__lsx_vld, src, 64, src, 80, src, 96, src, 112,
435  in2, in10, in3, in11);
436  DUP4_ARG2(__lsx_vld, src, 128, src, 144, src, 160, src, 176,
437  in4, in12, in5, in13);
438  DUP4_ARG2(__lsx_vld, src, 192, src, 208, src, 224, src, 240,
439  in6, in14, in7, in15);
440  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
441  in0, in1, in2, in3, in4, in5, in6, in7);
442  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
443  in8, in9, in10, in11, in12, in13, in14, in15);
444  DUP4_ARG2(__lsx_vilvl_h, in4, in0, in12, in8, in6, in2, in14, in10,
445  src0_r, src1_r, src2_r, src3_r);
446  DUP4_ARG2(__lsx_vilvl_h, in5, in1, in13, in9, in3, in7, in11, in15,
447  src4_r, src5_r, src6_r, src7_r);
448  DUP4_ARG2(__lsx_vilvh_h, in4, in0, in12, in8, in6, in2, in14, in10,
449  src0_l, src1_l, src2_l, src3_l);
450  DUP4_ARG2(__lsx_vilvh_h, in5, in1, in13, in9, in3, in7, in11, in15,
451  src4_l, src5_l, src6_l, src7_l);
452  HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
453  src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
454  src4_l, src5_l, src6_l, src7_l, 12);
455 
456  src += 128;
457  buf_ptr = coeffs + 8;
458  filter = &gt16x16_cnst[0];
459  }
460 
461  DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 32, coeffs, 64, coeffs, 96,
462  in0, in1, in2, in3);
463  DUP4_ARG2(__lsx_vld, coeffs, 128, coeffs, 160, coeffs, 192, coeffs, 224,
464  in4, in5, in6, in7);
465  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
466  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
467  __lsx_vst(vec0, coeffs, 0);
468  __lsx_vst(vec1, coeffs, 32);
469  __lsx_vst(vec2, coeffs, 64);
470  __lsx_vst(vec3, coeffs, 96);
471  __lsx_vst(vec4, coeffs, 128);
472  __lsx_vst(vec5, coeffs, 160);
473  __lsx_vst(vec6, coeffs, 192);
474  __lsx_vst(vec7, coeffs, 224);
475 
476  src = coeffs + 8;
477  DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96, in0, in1, in2, in3);
478  DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
479  in4, in5, in6, in7);
480  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
481  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
482  src = coeffs + 128;
483  DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
484  in8, in9, in10, in11);
485  DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
486  in12, in13, in14, in15);
487 
488  __lsx_vst(vec0, src, 0);
489  __lsx_vst(vec1, src, 32);
490  __lsx_vst(vec2, src, 64);
491  __lsx_vst(vec3, src, 96);
492  __lsx_vst(vec4, src, 128);
493  __lsx_vst(vec5, src, 160);
494  __lsx_vst(vec6, src, 192);
495  __lsx_vst(vec7, src, 224);
496  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15,
497  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
498  src = coeffs + 8;
499  __lsx_vst(vec0, src, 0);
500  __lsx_vst(vec1, src, 32);
501  __lsx_vst(vec2, src, 64);
502  __lsx_vst(vec3, src, 96);
503  __lsx_vst(vec4, src, 128);
504  __lsx_vst(vec5, src, 160);
505  __lsx_vst(vec6, src, 192);
506  __lsx_vst(vec7, src, 224);
507 
508  src = coeffs + 136;
509  DUP4_ARG2(__lsx_vld, src, 0, src, 32, src, 64, src, 96,
510  in0, in1, in2, in3);
511  DUP4_ARG2(__lsx_vld, src, 128, src, 160, src, 192, src, 224,
512  in4, in5, in6, in7);
513  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
514  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
515  __lsx_vst(vec0, src, 0);
516  __lsx_vst(vec1, src, 32);
517  __lsx_vst(vec2, src, 64);
518  __lsx_vst(vec3, src, 96);
519  __lsx_vst(vec4, src, 128);
520  __lsx_vst(vec5, src, 160);
521  __lsx_vst(vec6, src, 192);
522  __lsx_vst(vec7, src, 224);
523 }
524 
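/* Inverse transform of one 8-column slice of a 32x32 block, with the result
 * rounded by 'round' and written back to 'coeffs' at a row spacing of
 * 'buf_pitch'. The even half of the 32-point transform is accumulated in a
 * 64-byte-aligned 32-bit staging buffer: coefficients 4/12/20/28
 * (gt32x32_cnst2) and 0/8/16/24 (gt8x8_cnst) build the eight even-even rows,
 * then coefficients 2/6/.../30 (gt32x32_cnst1) extend them to the sixteen
 * even rows. Finally the odd coefficients 1/3/.../31 (gt32x32_cnst0) are
 * combined with the staged values, producing the 32 output rows in symmetric
 * pairs (i and 31 - i). */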
525 static void hevc_idct_8x32_column_lsx(int16_t *coeffs, int32_t buf_pitch,
526  uint8_t round)
527 {
528  uint8_t i;
529  int32_t buf_pitch_2 = buf_pitch << 1;
530  int32_t buf_pitch_4 = buf_pitch << 2;
531  int32_t buf_pitch_8 = buf_pitch << 3;
532  int32_t buf_pitch_16 = buf_pitch << 4;
533 
534  const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
535  const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
536  const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
537  const int16_t *filter_ptr3 = &gt8x8_cnst[0];
538  int16_t *src0 = (coeffs + buf_pitch);
539  int16_t *src1 = (coeffs + buf_pitch_2);
540  int16_t *src2 = (coeffs + buf_pitch_4);
541  int16_t *src3 = (coeffs);
542  int32_t tmp_buf[8 * 32 + 15];
543  int32_t *tmp_buf_ptr = tmp_buf + 15;
544  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
545  __m128i src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
546  __m128i src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
547  __m128i filter0, filter1, filter2, filter3;
548  __m128i sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
549 
550  /* Align pointer to 64 byte boundary */
551  tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
552 
553  /* process coeff 4, 12, 20, 28 */
554  in0 = __lsx_vld(src2, 0);
555  in1 = __lsx_vld(src2 + buf_pitch_8, 0);
556  in2 = __lsx_vld(src2 + buf_pitch_16, 0);
557  in3 = __lsx_vld(src2 + buf_pitch_16 + buf_pitch_8, 0);
558  in4 = __lsx_vld(src3, 0);
559  in5 = __lsx_vld(src3 + buf_pitch_8, 0);
560  in6 = __lsx_vld(src3 + buf_pitch_16, 0);
561  in7 = __lsx_vld(src3 + buf_pitch_16 + buf_pitch_8, 0);
562  DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in6, in4, in7, in5,
563  src0_r, src1_r, src2_r, src3_r);
564  DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in6, in4, in7, in5,
565  src0_l, src1_l, src2_l, src3_l);
566 
567  filter0 = __lsx_vldrepl_w(filter_ptr2, 0);
568  filter1 = __lsx_vldrepl_w(filter_ptr2, 4);
569  sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
570  sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
571  sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
572  sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
573  __lsx_vst(sum0_r, tmp_buf_ptr, 0);
574  __lsx_vst(sum0_l, tmp_buf_ptr, 16);
575 
576  filter0 = __lsx_vldrepl_w(filter_ptr2, 8);
577  filter1 = __lsx_vldrepl_w(filter_ptr2, 12);
578  sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
579  sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
580  sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
581  sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
582  __lsx_vst(sum0_r, tmp_buf_ptr, 32);
583  __lsx_vst(sum0_l, tmp_buf_ptr, 48);
584 
585  filter0 = __lsx_vldrepl_w(filter_ptr2, 16);
586  filter1 = __lsx_vldrepl_w(filter_ptr2, 20);
587  sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
588  sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
589  sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
590  sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
591  __lsx_vst(sum0_r, tmp_buf_ptr, 64);
592  __lsx_vst(sum0_l, tmp_buf_ptr, 80);
593 
594  filter0 = __lsx_vldrepl_w(filter_ptr2, 24);
595  filter1 = __lsx_vldrepl_w(filter_ptr2, 28);
596  sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
597  sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
598  sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
599  sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
600  __lsx_vst(sum0_r, tmp_buf_ptr, 96);
601  __lsx_vst(sum0_l, tmp_buf_ptr, 112);
602 
603  /* process coeff 0, 8, 16, 24 */
604  filter0 = __lsx_vldrepl_w(filter_ptr3, 0);
605  filter1 = __lsx_vldrepl_w(filter_ptr3, 4);
606 
607  DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
608  src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
609  sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
610  sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
611  sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
612  sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
613 
614  HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 0, 7);
615  HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 3, 4);
616 
617  filter0 = __lsx_vldrepl_w(filter_ptr3, 16);
618  filter1 = __lsx_vldrepl_w(filter_ptr3, 20);
619 
620  DUP4_ARG2(__lsx_vdp2_w_h, src2_r, filter0, src2_l, filter0,
621  src3_r, filter1, src3_l, filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
622  sum1_r = __lsx_vsub_w(sum0_r, tmp1_r);
623  sum1_l = __lsx_vsub_w(sum0_l, tmp1_l);
624  sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
625  sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
626 
627  HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, 1, 6);
628  HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, 2, 5);
629 
630  /* process coeff 2 6 10 14 18 22 26 30 */
631  in0 = __lsx_vld(src1, 0);
632  in1 = __lsx_vld(src1 + buf_pitch_4, 0);
633  in2 = __lsx_vld(src1 + buf_pitch_8, 0);
634  in3 = __lsx_vld(src1 + buf_pitch_8 + buf_pitch_4, 0);
635  in4 = __lsx_vld(src1 + buf_pitch_16, 0);
636  in5 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_4, 0);
637  in6 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8, 0);
638  in7 = __lsx_vld(src1 + buf_pitch_16 + buf_pitch_8 + buf_pitch_4, 0);
639 
640  DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
641  src0_r, src1_r, src2_r, src3_r);
642  DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
643  src0_l, src1_l, src2_l, src3_l);
644 
645  /* loop for all columns of constants */
646  for (i = 0; i < 8; i++) {
647  /* processing single column of constants */
648  filter0 = __lsx_vldrepl_w(filter_ptr1, 0);
649  filter1 = __lsx_vldrepl_w(filter_ptr1, 4);
650  filter2 = __lsx_vldrepl_w(filter_ptr1, 8);
651  filter3 = __lsx_vldrepl_w(filter_ptr1, 12);
652  sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
653  sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
654  sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
655  sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
656  sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
657  sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
658  sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
659  sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
660 
661  tmp0_r = __lsx_vld(tmp_buf_ptr + (i << 3), 0);
662  tmp0_l = __lsx_vld(tmp_buf_ptr + (i << 3), 16);
663  tmp1_r = tmp0_r;
664  tmp1_l = tmp0_l;
665  tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
666  tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
667  tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
668  tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
669  __lsx_vst(tmp0_r, tmp_buf_ptr + (i << 3), 0);
670  __lsx_vst(tmp0_l, tmp_buf_ptr + (i << 3), 16);
671  __lsx_vst(tmp1_r, tmp_buf_ptr + ((15 - i) * 8), 0);
672  __lsx_vst(tmp1_l, tmp_buf_ptr + ((15 - i) * 8), 16);
673 
674  filter_ptr1 += 8;
675  }
676 
677  /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
678  in0 = __lsx_vld(src0, 0);
679  in1 = __lsx_vld(src0 + buf_pitch_2, 0);
680  in2 = __lsx_vld(src0 + buf_pitch_4, 0);
681  in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
682  in4 = __lsx_vld(src0 + buf_pitch_8, 0);
683  in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
684  in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
685  in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
686 
687  src0 += 16 * buf_pitch;
688  DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
689  src0_r, src1_r, src2_r, src3_r);
690  DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
691  src0_l, src1_l, src2_l, src3_l);
692  in0 = __lsx_vld(src0, 0);
693  in1 = __lsx_vld(src0 + buf_pitch_2, 0);
694  in2 = __lsx_vld(src0 + buf_pitch_4, 0);
695  in3 = __lsx_vld(src0 + buf_pitch_4 + buf_pitch_2, 0);
696  in4 = __lsx_vld(src0 + buf_pitch_8, 0);
697  in5 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_2, 0);
698  in6 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4, 0);
699  in7 = __lsx_vld(src0 + buf_pitch_8 + buf_pitch_4 + buf_pitch_2, 0);
700 
701  DUP4_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, in5, in4, in7, in6,
702  src4_r, src5_r, src6_r, src7_r);
703  DUP4_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, in5, in4, in7, in6,
704  src4_l, src5_l, src6_l, src7_l);
705 
706  /* loop for all columns of filter constants */
707  for (i = 0; i < 16; i++) {
708  /* processing single column of constants */
709  filter0 = __lsx_vldrepl_w(filter_ptr0, 0);
710  filter1 = __lsx_vldrepl_w(filter_ptr0, 4);
711  filter2 = __lsx_vldrepl_w(filter_ptr0, 8);
712  filter3 = __lsx_vldrepl_w(filter_ptr0, 12);
713  sum0_r = __lsx_vdp2_w_h(src0_r, filter0);
714  sum0_l = __lsx_vdp2_w_h(src0_l, filter0);
715  sum0_r = __lsx_vdp2add_w_h(sum0_r, src1_r, filter1);
716  sum0_l = __lsx_vdp2add_w_h(sum0_l, src1_l, filter1);
717  sum0_r = __lsx_vdp2add_w_h(sum0_r, src2_r, filter2);
718  sum0_l = __lsx_vdp2add_w_h(sum0_l, src2_l, filter2);
719  sum0_r = __lsx_vdp2add_w_h(sum0_r, src3_r, filter3);
720  sum0_l = __lsx_vdp2add_w_h(sum0_l, src3_l, filter3);
721  tmp1_r = sum0_r;
722  tmp1_l = sum0_l;
723 
724  filter0 = __lsx_vldrepl_w(filter_ptr0, 16);
725  filter1 = __lsx_vldrepl_w(filter_ptr0, 20);
726  filter2 = __lsx_vldrepl_w(filter_ptr0, 24);
727  filter3 = __lsx_vldrepl_w(filter_ptr0, 28);
728  sum0_r = __lsx_vdp2_w_h(src4_r, filter0);
729  sum0_l = __lsx_vdp2_w_h(src4_l, filter0);
730  sum0_r = __lsx_vdp2add_w_h(sum0_r, src5_r, filter1);
731  sum0_l = __lsx_vdp2add_w_h(sum0_l, src5_l, filter1);
732  sum0_r = __lsx_vdp2add_w_h(sum0_r, src6_r, filter2);
733  sum0_l = __lsx_vdp2add_w_h(sum0_l, src6_l, filter2);
734  sum0_r = __lsx_vdp2add_w_h(sum0_r, src7_r, filter3);
735  sum0_l = __lsx_vdp2add_w_h(sum0_l, src7_l, filter3);
736  sum0_r = __lsx_vadd_w(sum0_r, tmp1_r);
737  sum0_l = __lsx_vadd_w(sum0_l, tmp1_l);
738 
739  tmp0_r = __lsx_vld(tmp_buf_ptr + i * 8, 0);
740  tmp0_l = __lsx_vld(tmp_buf_ptr + i * 8, 16);
741  tmp1_r = tmp0_r;
742  tmp1_l = tmp0_l;
743  tmp0_r = __lsx_vadd_w(tmp0_r, sum0_r);
744  tmp0_l = __lsx_vadd_w(tmp0_l, sum0_l);
745  sum1_r = __lsx_vreplgr2vr_w(round);
746  tmp0_r = __lsx_vssrarn_h_w(tmp0_r, sum1_r);
747  tmp0_l = __lsx_vssrarn_h_w(tmp0_l, sum1_r);
748  in0 = __lsx_vpackev_d(tmp0_l, tmp0_r);
749  __lsx_vst(in0, (coeffs + i * buf_pitch), 0);
750  tmp1_r = __lsx_vsub_w(tmp1_r, sum0_r);
751  tmp1_l = __lsx_vsub_w(tmp1_l, sum0_l);
752  tmp1_r = __lsx_vssrarn_h_w(tmp1_r, sum1_r);
753  tmp1_l = __lsx_vssrarn_h_w(tmp1_l, sum1_r);
754  in0 = __lsx_vpackev_d(tmp1_l, tmp1_r);
755  __lsx_vst(in0, (coeffs + (31 - i) * buf_pitch), 0);
756 
757  filter_ptr0 += 16;
758  }
759 }
760 
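/* Transpose helpers for the 32x32 row pass: the first routine copies a 32x8
 * band of coefficients into a contiguous 8x32 temporary layout using 8x8
 * sub-block transposes, and the second reverses the operation. */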
761 static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
762 {
763  uint8_t i;
764  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
765 
766  for (i = 0; i < 4; i++) {
767  DUP4_ARG2(__lsx_vld, coeffs, 0, coeffs, 64, coeffs, 128,
768  coeffs, 192, in0, in1, in2, in3);
769  DUP4_ARG2(__lsx_vld, coeffs, 256, coeffs, 320, coeffs, 384,
770  coeffs, 448, in4, in5, in6, in7);
771  coeffs += 8;
772  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
773  in0, in1, in2, in3, in4, in5, in6, in7);
774  __lsx_vst(in0, tmp_buf, 0);
775  __lsx_vst(in1, tmp_buf, 16);
776  __lsx_vst(in2, tmp_buf, 32);
777  __lsx_vst(in3, tmp_buf, 48);
778  __lsx_vst(in4, tmp_buf, 64);
779  __lsx_vst(in5, tmp_buf, 80);
780  __lsx_vst(in6, tmp_buf, 96);
781  __lsx_vst(in7, tmp_buf, 112);
782  tmp_buf += 64;
783  }
784 }
785 
786 static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
787 {
788  uint8_t i;
789  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
790 
791  for (i = 0; i < 4; i++) {
792  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 16, tmp_buf, 32,
793  tmp_buf, 48, in0, in1, in2, in3);
794  DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 80, tmp_buf, 96,
795  tmp_buf, 112, in4, in5, in6, in7);
796  tmp_buf += 64;
797  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7,
798  in0, in1, in2, in3, in4, in5, in6, in7);
799  __lsx_vst(in0, coeffs, 0);
800  __lsx_vst(in1, coeffs, 64);
801  __lsx_vst(in2, coeffs, 128);
802  __lsx_vst(in3, coeffs, 192);
803  __lsx_vst(in4, coeffs, 256);
804  __lsx_vst(in5, coeffs, 320);
805  __lsx_vst(in6, coeffs, 384);
806  __lsx_vst(in7, coeffs, 448);
807  coeffs += 8;
808  }
809 }
810 
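/* 32x32 inverse transform: the column pass runs hevc_idct_8x32_column_lsx on
 * the four 8-column slices in place (shift 7); the row pass transposes each
 * band of eight rows into the aligned temporary buffer, reuses the same
 * column routine (shift 12) and transposes the result back. */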
811 void ff_hevc_idct_32x32_lsx(int16_t *coeffs, int col_limit)
812 {
813  uint8_t row_cnt, col_cnt;
814  int16_t *src = coeffs;
815  int16_t tmp_buf[8 * 32 + 31];
816  int16_t *tmp_buf_ptr = tmp_buf + 31;
817  uint8_t round;
818  int32_t buf_pitch;
819 
820  /* Align pointer to 64 byte boundary */
821  tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
822 
823  /* column transform */
824  round = 7;
825  buf_pitch = 32;
826  for (col_cnt = 0; col_cnt < 4; col_cnt++) {
827  /* process 8x32 blocks */
828  hevc_idct_8x32_column_lsx((coeffs + col_cnt * 8), buf_pitch, round);
829  }
830 
831  /* row transform */
832  round = 12;
833  buf_pitch = 8;
834  for (row_cnt = 0; row_cnt < 4; row_cnt++) {
835  /* process 32x8 blocks */
836  src = (coeffs + 32 * 8 * row_cnt);
837 
838  hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
839  hevc_idct_8x32_column_lsx(tmp_buf_ptr, buf_pitch, round);
840  hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
841  }
842 }