FFmpeg
hevc_idct_msa.c
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
3  *
4  * This file is part of FFmpeg.
5  *
6  * FFmpeg is free software; you can redistribute it and/or
7  * modify it under the terms of the GNU Lesser General Public
8  * License as published by the Free Software Foundation; either
9  * version 2.1 of the License, or (at your option) any later version.
10  *
11  * FFmpeg is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14  * Lesser General Public License for more details.
15  *
16  * You should have received a copy of the GNU Lesser General Public
17  * License along with FFmpeg; if not, write to the Free Software
18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19  */
20 
23 
/*
 * 16 transform constants, 64-byte aligned.
 *
 * Read through the local `filter` pointer of hevc_idct_8x8_msa() (the
 * HEVC_IDCT8x8_COL macro loads elements 0..7 and 8..15 as two vectors) and
 * through `filter_ptr3` in hevc_idct_8x32_column_msa().
 */
24 static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
25  64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
26 };
27 
/*
 * 64 transform constants, 64-byte aligned.
 *
 * Read through the local `filter` pointer of hevc_idct_16x16_msa(); the
 * HEVC_IDCT16x16_COL macro consumes 16 values in each of its 4 iterations.
 */
28 static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
29  64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
30  64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
31  64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
32  64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
33 };
34 
/*
 * 256 transform constants, 64-byte aligned.
 *
 * Read through `filter_ptr0` in hevc_idct_8x32_column_msa(): the last stage
 * of that function (input rows 1, 3, ..., 31) loads 16 values per iteration
 * for 16 iterations.
 */
35 static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
36  90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
37  90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
38  88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
39  85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
40  82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
41  78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
42  73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
43  67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
44  61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
45  54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
46  46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
47  38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
48  31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
49  22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
50  13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
51  4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
52 };
53 
/*
 * 64 transform constants, 64-byte aligned.
 *
 * Read through `filter_ptr1` in hevc_idct_8x32_column_msa(): the stage that
 * handles input rows 2, 6, ..., 30 loads 8 values per iteration for
 * 8 iterations.
 */
54 static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
55  90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
56  80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
57  57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
58  25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
59 };
60 
/*
 * 16 transform constants, 64-byte aligned.
 *
 * Read through `filter_ptr2` in hevc_idct_8x32_column_msa(): the stage that
 * handles input rows 4, 12, 20, 28 reads them with LW() at element offsets
 * 0, 2, 4 and 6, advancing by 8 elements on each of its 2 iterations.
 */
61 static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
62  89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
63 };
64 
/*
 * One 1-D pass of the 4-point inverse transform (constants 64, 83, 36).
 *
 * in_r0, in_l0, in_r1, in_l1 are the inputs handed to the DOTP_* dot-product
 * macros; sum0..sum3 receive the outputs. Writing D(x, c) for the DOTP
 * result of input x against the splatted constant c:
 *
 *   even0 = D(in_r0, 64) + D(in_r1, 64)
 *   even1 = D(in_r0, 64) - D(in_r1, 64)
 *   odd0  = D(in_l0, 83) + D(in_l1, 36)
 *   odd1  = D(in_l0, 36) - D(in_l1, 83)
 *
 *   sum0 = even0 + odd0      sum1 = even1 + odd1
 *   sum2 = even1 - odd1      sum3 = even0 - odd0
 *
 * All four sums are finally passed through SRARI_W4_SW(shift) and
 * SAT_SW4_SW(15). The macro declares its own temporaries (vec0..vec5,
 * cnst64, cnst83, cnst36) inside its braces, so callers must not pass
 * variables with those names.
 */
65 #define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, \
66  sum0, sum1, sum2, sum3, shift) \
67 { \
68  v4i32 vec0, vec1, vec2, vec3, vec4, vec5; \
69  v4i32 cnst64 = __msa_ldi_w(64); \
70  v4i32 cnst83 = __msa_ldi_w(83); \
71  v4i32 cnst36 = __msa_ldi_w(36); \
72  \
73  DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64, \
74  cnst83, cnst36, vec0, vec2, vec1, vec3); \
75  DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5); \
76  \
77  sum0 = vec0 + vec2; \
78  sum1 = vec0 - vec2; \
79  sum3 = sum0; \
80  sum2 = sum1; \
81  \
82  vec1 += vec3; \
83  vec4 -= vec5; \
84  \
85  sum0 += vec1; \
86  sum1 += vec4; \
87  sum2 -= vec4; \
88  sum3 -= vec1; \
89  \
90  SRARI_W4_SW(sum0, sum1, sum2, sum3, shift); \
91  SAT_SW4_SW(sum0, sum1, sum2, sum3, 15); \
92 }
93 
/*
 * In-place 1-D pass of the 8-point inverse transform over eight v8i16
 * vectors in0..in7 (every argument is both read and overwritten).
 *
 * Requirement on the expansion site: a pointer named `filter` must be in
 * scope. Eight int16_t constants are loaded from `filter` and eight more
 * from `filter + 8`.
 *
 * The inputs are interleaved pairwise as (in4, in0), (in6, in2), (in5, in1)
 * and (in3, in7). The first constant vector produces in0/in7 and in3/in4,
 * the second produces in1/in6 and in2/in5. Every 32-bit result goes through
 * SRARI_W4_SW(shift) and SAT_SW4_SW(15) before PCKEV_H2_SH packs it back to
 * 16 bit.
 *
 * The macro declares its own temporaries (src*_r/l, filt0, filter0..3,
 * temp*_r/l, sum*_r/l) inside its braces.
 */
94 #define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift) \
95 { \
96  v8i16 src0_r, src1_r, src2_r, src3_r; \
97  v8i16 src0_l, src1_l, src2_l, src3_l; \
98  v8i16 filt0, filter0, filter1, filter2, filter3; \
99  v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r; \
100  v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l; \
101  v4i32 sum0_r, sum1_r, sum2_r, sum3_r; \
102  v4i32 sum0_l, sum1_l, sum2_l, sum3_l; \
103  \
104  ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \
105  src0_r, src1_r, src2_r, src3_r); \
106  ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7, \
107  src0_l, src1_l, src2_l, src3_l); \
108  \
109  filt0 = LD_SH(filter); \
110  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \
111  DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \
112  filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \
113  \
114  BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
115  sum1_l, sum1_r); \
116  sum2_r = sum1_r; \
117  sum2_l = sum1_l; \
118  sum3_r = sum0_r; \
119  sum3_l = sum0_l; \
120  \
121  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \
122  filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \
123  \
124  temp2_r += temp3_r; \
125  temp2_l += temp3_l; \
126  sum0_r += temp2_r; \
127  sum0_l += temp2_l; \
128  sum3_r -= temp2_r; \
129  sum3_l -= temp2_l; \
130  \
131  SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \
132  SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \
133  PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7); \
134  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \
135  filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \
136  \
137  temp4_r -= temp5_r; \
138  temp4_l -= temp5_l; \
139  sum1_r += temp4_r; \
140  sum1_l += temp4_l; \
141  sum2_r -= temp4_r; \
142  sum2_l -= temp4_l; \
143  \
144  SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \
145  SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \
146  PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4); \
147  \
148  filt0 = LD_SH(filter + 8); \
149  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3); \
150  DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0, \
151  filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l); \
152  \
153  BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l, \
154  sum1_l, sum1_r); \
155  sum2_r = sum1_r; \
156  sum2_l = sum1_l; \
157  sum3_r = sum0_r; \
158  sum3_l = sum0_l; \
159  \
160  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2, \
161  filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l); \
162  \
163  temp2_r += temp3_r; \
164  temp2_l += temp3_l; \
165  sum0_r += temp2_r; \
166  sum0_l += temp2_l; \
167  sum3_r -= temp2_r; \
168  sum3_l -= temp2_l; \
169  \
170  SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift); \
171  SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15); \
172  PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6); \
173  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3, \
174  filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l); \
175  \
176  temp4_r -= temp5_r; \
177  temp4_l -= temp5_l; \
178  sum1_r -= temp4_r; \
179  sum1_l -= temp4_l; \
180  sum2_r += temp4_r; \
181  sum2_l += temp4_l; \
182  \
183  SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift); \
184  SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15); \
185  PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5); \
186 }
187 
/*
 * 1-D pass of the 16-point inverse transform over eight columns; results are
 * stored to memory rather than returned in registers.
 *
 * src0_r..src7_r / src0_l..src7_l are the interleaved input vectors prepared
 * by the caller. `shift` is the rounding shift handed to SRARI_W4_SW, which
 * is followed by SAT_SW4_SW(15) and a pack to 16 bit.
 *
 * Requirements on the expansion site (these names are used directly, they
 * are not macro parameters):
 *   buf_ptr - int16_t output pointer; advanced by 16 elements per iteration,
 *             64 in total
 *   filter  - pointer to int16_t constants; 16 values are read per iteration
 *             and the pointer is advanced by 64 in total
 *   j, k    - integer variables, clobbered (loop counter and sign toggle)
 *
 * Each of the four iterations stores four 8-element vectors. Relative to the
 * value of buf_ptr on entry, and counting in units of 16 elements, the slots
 * written are (in store order):
 *   j = 0:  0, 15,  7,  8
 *   j = 1:  1, 14,  9,  6
 *   j = 2:  2, 13,  5, 10
 *   j = 3:  3, 12, 11,  4
 */
188 #define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, \
189  src4_r, src5_r, src6_r, src7_r, \
190  src0_l, src1_l, src2_l, src3_l, \
191  src4_l, src5_l, src6_l, src7_l, shift) \
192 { \
193  int16_t *ptr0, *ptr1; \
194  v8i16 filt0, filt1, dst0, dst1; \
195  v8i16 filter0, filter1, filter2, filter3; \
196  v4i32 temp0_r, temp1_r, temp0_l, temp1_l; \
197  v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l; \
198  v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l; \
199  \
200  ptr0 = (buf_ptr + 112); \
201  ptr1 = (buf_ptr + 128); \
202  k = -1; \
203  \
204  for (j = 0; j < 4; j++) \
205  { \
206  LD_SH2(filter, 8, filt0, filt1) \
207  filter += 16; \
208  SPLATI_W2_SH(filt0, 0, filter0, filter1); \
209  SPLATI_W2_SH(filt1, 0, filter2, filter3); \
210  DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l, filter0, filter0, \
211  filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l); \
212  DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l); \
213  DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l, filter1, filter1, \
214  filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l); \
215  DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l); \
216  \
217  sum1_r = sum0_r; \
218  sum1_l = sum0_l; \
219  \
220  SPLATI_W2_SH(filt0, 2, filter0, filter1); \
221  SPLATI_W2_SH(filt1, 2, filter2, filter3); \
222  DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l); \
223  DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l); \
224  DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l); \
225  \
226  sum0_r += temp0_r; \
227  sum0_l += temp0_l; \
228  sum1_r -= temp0_r; \
229  sum1_l -= temp0_l; \
230  \
231  sum3_r = temp1_r - sum3_r; \
232  sum3_l = temp1_l - sum3_l; \
233  \
234  DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l); \
235  DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3, \
236  filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l); \
237  \
238  sum0_r += temp0_r; \
239  sum0_l += temp0_l; \
240  sum1_r -= temp0_r; \
241  sum1_l -= temp0_l; \
242  \
243  BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l, \
244  res1_l, res1_r); \
245  SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \
246  SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \
247  PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \
248  ST_SH(dst0, buf_ptr); \
249  ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16))); \
250  \
251  BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l, \
252  res1_l, res1_r); \
253  SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift); \
254  SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15); \
255  PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1); \
256  ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16))); \
257  ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16))); \
258  \
259  k *= -1; \
260  buf_ptr += 16; \
261  } \
262 }
263 
/*
 * Combine a freshly computed pair of accumulators with a partial result that
 * is already stored in a 32-bit scratch buffer.
 *
 *   input     - base pointer of the scratch buffer
 *   sum0_r/_l - accumulators; on exit they hold (accumulator + stored value)
 *   load_idx  - slot (8 words each) to load from and to write the sum to
 *   store_idx - slot that receives (accumulator - stored value)
 *
 * Requirement on the expansion site: variables tmp0_r, tmp0_l, tmp1_r and
 * tmp1_l must be in scope; they are used as scratch and clobbered.
 */
264 #define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx) \
265 { \
266  LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l); \
267  tmp1_r = sum0_r; \
268  tmp1_l = sum0_l; \
269  sum0_r += tmp0_r; \
270  sum0_l += tmp0_l; \
271  ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4); \
272  tmp1_r -= tmp0_r; \
273  tmp1_l -= tmp0_l; \
274  ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4); \
275 }
276 
/*
 * One 1-D pass of the 4-point luma transform (constants 74, 55, 29).
 *
 * Inputs in_r0, in_l0, in_r1, in_l1 and outputs res0..res3 are combined with
 * plain vector arithmetic:
 *
 *   res0 = 29 * (in_r0 + in_r1) + 55 * (in_r1 + in_l1) + 74 * in_l0
 *   res1 = 55 * (in_r0 - in_l1) - 29 * (in_r1 + in_l1) + 74 * in_l0
 *   res2 = 74 * (in_r0 - in_r1 + in_l1)
 *   res3 = 55 * (in_r0 + in_r1) + 29 * (in_r0 - in_l1) - 74 * in_l0
 *
 * followed by SRARI_W4_SW(shift) and SAT_SW4_SW(15) on all four results.
 * The macro declares its own temporaries (vec0..vec3, cnst74, cnst55,
 * cnst29) inside its braces.
 */
277 #define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, \
278  res0, res1, res2, res3, shift) \
279 { \
280  v4i32 vec0, vec1, vec2, vec3; \
281  v4i32 cnst74 = __msa_ldi_w(74); \
282  v4i32 cnst55 = __msa_ldi_w(55); \
283  v4i32 cnst29 = __msa_ldi_w(29); \
284  \
285  vec0 = in_r0 + in_r1; \
286  vec2 = in_r0 - in_l1; \
287  res0 = vec0 * cnst29; \
288  res1 = vec2 * cnst55; \
289  res2 = in_r0 - in_r1; \
290  vec1 = in_r1 + in_l1; \
291  res2 += in_l1; \
292  vec3 = in_l0 * cnst74; \
293  res3 = vec0 * cnst55; \
294  \
295  res0 += vec1 * cnst55; \
296  res1 -= vec1 * cnst29; \
297  res2 *= cnst74; \
298  res3 += vec2 * cnst29; \
299  \
300  res0 += vec3; \
301  res1 += vec3; \
302  res3 -= vec3; \
303  \
304  SRARI_W4_SW(res0, res1, res2, res3, shift); \
305  SAT_SW4_SW(res0, res1, res2, res3, 15); \
306 }
307 
308 static void hevc_idct_4x4_msa(int16_t *coeffs)
309 {
310  v8i16 in0, in1;
311  v4i32 in_r0, in_l0, in_r1, in_l1;
312  v4i32 sum0, sum1, sum2, sum3;
313  v8i16 zeros = { 0 };
314 
315  LD_SH2(coeffs, 8, in0, in1);
316  ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
317  ILVRL_H2_SW(zeros, in1, in_r1, in_l1);
318 
319  HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
320  TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
321  HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);
322 
323  /* Pack and transpose */
324  PCKEV_H2_SH(sum2, sum0, sum3, sum1, in0, in1);
325  ILVRL_H2_SW(in1, in0, sum0, sum1);
326  ILVRL_W2_SH(sum1, sum0, in0, in1);
327 
328  ST_SH2(in0, in1, coeffs, 8);
329 }
330 
331 static void hevc_idct_8x8_msa(int16_t *coeffs)
332 {
333  const int16_t *filter = &gt8x8_cnst[0];
334  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
335 
336  LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
337  HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
338  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
339  in0, in1, in2, in3, in4, in5, in6, in7);
340  HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
341  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
342  in0, in1, in2, in3, in4, in5, in6, in7);
343  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
344 }
345 
/*
 * In-place inverse transform of a 16x16 block (256 contiguous int16_t values,
 * row stride 16).
 *
 * Pass 1 (shift 7) reads coeffs in two 8-column halves and writes the
 * intermediate into the local `buf`. Pass 2 (shift 12) reads `buf` in two
 * 128-element halves, transposes each pair of 8x8 tiles and writes back into
 * coeffs. The trailing section transposes the four 8x8 tiles of coeffs,
 * exchanging the two off-diagonal tiles (at coeffs + 8 and coeffs + 128).
 *
 * HEVC_IDCT16x16_COL uses `buf_ptr`, `filter`, `j` and `k` by name and
 * modifies all four, which is why `buf_ptr` and `filter` are re-seated at the
 * end of every loop iteration.
 */
346 static void hevc_idct_16x16_msa(int16_t *coeffs)
347 {
348  int16_t i, j, k;
349  int16_t buf[256];
350  int16_t *buf_ptr = &buf[0];
351  int16_t *src = coeffs;
352  const int16_t *filter = &gt16x16_cnst[0];
353  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
354  v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
355  v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
356  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
357  v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
358 
     /* Pass 1: two iterations, one per 8-column half of coeffs. */
359  for (i = 2; i--;) {
360  LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
361  in8, in9, in10, in11, in12, in13, in14, in15);
362 
363  ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
364  src0_r, src1_r, src2_r, src3_r);
365  ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
366  src4_r, src5_r, src6_r, src7_r);
367  ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
368  src0_l, src1_l, src2_l, src3_l);
369  ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
370  src4_l, src5_l, src6_l, src7_l);
371  HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
372  src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
373  src4_l, src5_l, src6_l, src7_l, 7);
374 
     /* Next half: columns 8..15 of the input, columns 8..15 of buf. */
375  src += 8;
376  buf_ptr = (&buf[0] + 8);
377  filter = &gt16x16_cnst[0];
378  }
379 
380  src = &buf[0];
381  buf_ptr = coeffs;
382  filter = &gt16x16_cnst[0];
383 
     /* Pass 2: two iterations, one per 128-element half of buf. */
384  for (i = 2; i--;) {
385  LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
386  in4, in12, in5, in13, in6, in14, in7, in15);
387  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
388  in0, in1, in2, in3, in4, in5, in6, in7);
389  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
390  in8, in9, in10, in11, in12, in13, in14, in15);
391  ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
392  src0_r, src1_r, src2_r, src3_r);
393  ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
394  src4_r, src5_r, src6_r, src7_r);
395  ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
396  src0_l, src1_l, src2_l, src3_l);
397  ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
398  src4_l, src5_l, src6_l, src7_l);
399  HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
400  src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
401  src4_l, src5_l, src6_l, src7_l, 12);
402 
403  src += 128;
404  buf_ptr = coeffs + 8;
405  filter = &gt16x16_cnst[0];
406  }
407 
     /* Tile at coeffs + 0: transposed in place. */
408  LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
409  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
410  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
411  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);
412 
     /*
      * Tiles at coeffs + 8 and coeffs + 128 swap places while being
      * transposed; the second tile is loaded before the first is stored
      * over it.
      */
413  LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
414  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
415  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
416  LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
417  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
418  TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
419  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
420  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);
421 
     /* Tile at coeffs + 136: transposed in place. */
422  LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
423  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
424  vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
425  ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
426 }
427 
/*
 * 32-point 1-D inverse-transform pass over an 8-column strip, in place.
 *
 *   coeffs    - first element of the strip; row r is read from and written
 *               to coeffs + r * buf_pitch (8 int16_t values per row)
 *   buf_pitch - distance between consecutive rows, in int16_t elements
 *   round     - shift count; it is splatted into a vector and applied with
 *               SRAR_W2_SW before SAT_SW2_SW(15)
 *
 * The result is accumulated in a 32-bit scratch buffer organised as slots of
 * 8 words each:
 *   1. rows 4, 12, 20, 28   (gt32x32_cnst2) fill slots 0..3
 *   2. rows 0, 8, 16, 24    (gt8x8_cnst) are combined with them into
 *                           slots 0..7 via HEVC_EVEN16_CALC
 *   3. rows 2, 6, ..., 30   (gt32x32_cnst1) extend this to slots 0..15
 *   4. rows 1, 3, ..., 31   (gt32x32_cnst0) are added to / subtracted from
 *                           slot i, giving output rows i and 31 - i
 *
 * HEVC_EVEN16_CALC uses tmp0_r, tmp0_l, tmp1_r and tmp1_l by name.
 */
428 static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
429  uint8_t round)
430 {
431  uint8_t i;
432  const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
433  const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
434  const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
435  const int16_t *filter_ptr3 = &gt8x8_cnst[0];
436  int16_t *src0 = (coeffs + buf_pitch);
437  int16_t *src1 = (coeffs + 2 * buf_pitch);
438  int16_t *src2 = (coeffs + 4 * buf_pitch);
439  int16_t *src3 = (coeffs);
440  int32_t cnst0, cnst1;
     /* 15 extra words leave room to round the start down to 64 bytes. */
441  int32_t tmp_buf[8 * 32 + 15];
442  int32_t *tmp_buf_ptr = tmp_buf + 15;
443  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
444  v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
445  v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
446  v8i16 filt0, filter0, filter1, filter2, filter3;
447  v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;
448 
449  /* Align pointer to 64 byte boundary */
450  tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);
451 
452  /* process coeff 4, 12, 20, 28 */
453  LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
454  ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
455  ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);
456 
     /* Rows 0 and 16 go to in4/in6, rows 8 and 24 to in5/in7. */
457  LD_SH2(src3, 16 * buf_pitch, in4, in6);
458  LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7);
459  ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r);
460  ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l);
461 
462  /* loop for all columns of constants */
463  for (i = 0; i < 2; i++) {
464  /* processing single column of constants */
465  cnst0 = LW(filter_ptr2);
466  cnst1 = LW(filter_ptr2 + 2);
467 
468  filter0 = (v8i16) __msa_fill_w(cnst0);
469  filter1 = (v8i16) __msa_fill_w(cnst1);
470 
471  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
472  DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
473  ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4);
474 
475  /* processing single column of constants */
476  cnst0 = LW(filter_ptr2 + 4);
477  cnst1 = LW(filter_ptr2 + 6);
478 
479  filter0 = (v8i16) __msa_fill_w(cnst0);
480  filter1 = (v8i16) __msa_fill_w(cnst1);
481 
482  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
483  DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
484  ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4);
485 
486  filter_ptr2 += 8;
487  }
488 
489  /* process coeff 0, 8, 16, 24 */
490  /* loop for all columns of constants */
491  for (i = 0; i < 2; i++) {
492  /* processing first column of filter constants */
493  cnst0 = LW(filter_ptr3);
494  cnst1 = LW(filter_ptr3 + 2);
495 
496  filter0 = (v8i16) __msa_fill_w(cnst0);
497  filter1 = (v8i16) __msa_fill_w(cnst1);
498 
499  DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1,
500  filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);
501 
502  sum1_r = sum0_r - tmp1_r;
503  sum1_l = sum0_l - tmp1_l;
504  sum0_r = sum0_r + tmp1_r;
505  sum0_l = sum0_l + tmp1_l;
506 
     /* Slots i / 7 - i and 3 - i / 4 + i receive sum and difference. */
507  HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
508  HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));
509 
510  filter_ptr3 += 8;
511  }
512 
513  /* process coeff 2 6 10 14 18 22 26 30 */
514  LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
515  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
516  src0_r, src1_r, src2_r, src3_r);
517  ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
518  src0_l, src1_l, src2_l, src3_l);
519 
520  /* loop for all columns of constants */
521  for (i = 0; i < 8; i++) {
522  /* processing single column of constants */
523  filt0 = LD_SH(filter_ptr1);
524  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
525  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
526  DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
527  filter2, sum0_r, sum0_l, sum0_r, sum0_l);
528  DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
529 
     /* Slot i becomes (old + sum), slot 15 - i becomes (old - sum). */
530  LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
531  tmp1_r = tmp0_r;
532  tmp1_l = tmp0_l;
533  tmp0_r += sum0_r;
534  tmp0_l += sum0_l;
535  ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
536  tmp1_r -= sum0_r;
537  tmp1_l -= sum0_l;
538  ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);
539 
540  filter_ptr1 += 8;
541  }
542 
543  /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
544  LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
545  src0 += 16 * buf_pitch;
546  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
547  src0_r, src1_r, src2_r, src3_r);
548  ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
549  src0_l, src1_l, src2_l, src3_l);
550 
551  LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
552  ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
553  src4_r, src5_r, src6_r, src7_r);
554  ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
555  src4_l, src5_l, src6_l, src7_l);
556 
557  /* loop for all columns of filter constants */
558  for (i = 0; i < 16; i++) {
559  /* processing single column of constants */
560  filt0 = LD_SH(filter_ptr0);
561  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
562  DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
563  DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
564  filter2, sum0_r, sum0_l, sum0_r, sum0_l);
565  DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);
566 
567  tmp1_r = sum0_r;
568  tmp1_l = sum0_l;
569 
570  filt0 = LD_SH(filter_ptr0 + 8);
571  SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
572  DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
573  DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
574  filter2, sum0_r, sum0_l, sum0_r, sum0_l);
575  DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);
576 
577  sum0_r += tmp1_r;
578  sum0_l += tmp1_l;
579 
580  LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
581  tmp1_r = tmp0_r;
582  tmp1_l = tmp0_l;
583  tmp0_r += sum0_r;
584  tmp0_l += sum0_l;
     /* sum1_r is reused here to hold the splatted shift count. */
585  sum1_r = __msa_fill_w(round);
586  SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
587  SAT_SW2_SW(tmp0_r, tmp0_l, 15);
588  in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
589  ST_SH(in0, (coeffs + i * buf_pitch));
590  tmp1_r -= sum0_r;
591  tmp1_l -= sum0_l;
592  SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
593  SAT_SW2_SW(tmp1_r, tmp1_l, 15);
594  in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
595  ST_SH(in0, (coeffs + (31 - i) * buf_pitch));
596 
597  filter_ptr0 += 16;
598  }
599 }
600 
601 static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
602 {
603  uint8_t i;
604  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
605 
606  for (i = 0; i < 4; i++) {
607  LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
608  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
609  in0, in1, in2, in3, in4, in5, in6, in7);
610  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8);
611  }
612 }
613 
614 static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
615 {
616  uint8_t i;
617  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
618 
619  for (i = 0; i < 4; i++) {
620  LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
621  TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
622  in0, in1, in2, in3, in4, in5, in6, in7);
623  ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
624  }
625 }
626 
/*
 * In-place inverse transform of a 32x32 block (1024 contiguous int16_t
 * values, row stride 32).
 *
 * The column pass calls hevc_idct_8x32_column_msa() on each of the four
 * 8-column strips with pitch 32 and shift 7. The row pass takes each band of
 * 8 rows, transposes it into a scratch buffer, runs the same routine with
 * pitch 8 and shift 12, and transposes the result back.
 */
static void hevc_idct_32x32_msa(int16_t *coeffs)
{
    /* 31 spare elements allow the start to be rounded down to 64 bytes. */
    int16_t scratch[8 * 32 + 31];
    int16_t *aligned_buf;
    int strip;

    aligned_buf = (int16_t *)(((uintptr_t) (scratch + 31)) & ~(uintptr_t) 63);

    /* column transform: four 8x32 strips */
    for (strip = 0; strip < 4; strip++)
        hevc_idct_8x32_column_msa(coeffs + strip * 8, 32, 7);

    /* row transform: four 32x8 bands */
    for (strip = 0; strip < 4; strip++) {
        int16_t *band = coeffs + 32 * 8 * strip;

        hevc_idct_transpose_32x8_to_8x32(band, aligned_buf);
        hevc_idct_8x32_column_msa(aligned_buf, 8, 12);
        hevc_idct_transpose_8x32_to_32x8(aligned_buf, band);
    }
}
659 
660 static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
661 {
662  int32_t val;
663  v8i16 dst;
664 
665  val = (coeffs[0] + 1) >> 1;
666  val = (val + 32) >> 6;
667  dst = __msa_fill_h(val);
668 
669  ST_SH2(dst, dst, coeffs, 8);
670 }
671 
672 static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
673 {
674  int32_t val;
675  v8i16 dst;
676 
677  val = (coeffs[0] + 1) >> 1;
678  val = (val + 32) >> 6;
679  dst = __msa_fill_h(val);
680 
681  ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
682 }
683 
684 static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
685 {
686  uint8_t loop;
687  int32_t val;
688  v8i16 dst;
689 
690  val = (coeffs[0] + 1) >> 1;
691  val = (val + 32) >> 6;
692  dst = __msa_fill_h(val);
693 
694  for (loop = 4; loop--;) {
695  ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
696  coeffs += 8 * 8;
697  }
698 }
699 
700 static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
701 {
702  uint8_t loop;
703  int32_t val;
704  v8i16 dst;
705 
706  val = (coeffs[0] + 1) >> 1;
707  val = (val + 32) >> 6;
708  dst = __msa_fill_h(val);
709 
710  for (loop = 16; loop--;) {
711  ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
712  coeffs += 8 * 8;
713  }
714 }
715 
716 static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
717 {
718  uint32_t dst0, dst1, dst2, dst3;
719  v8i16 dst_r0, dst_l0, in0, in1;
720  v4i32 dst_vec = { 0 };
721  v16u8 zeros = { 0 };
722 
723  LD_SH2(coeffs, 8, in0, in1);
724  LW4(dst, stride, dst0, dst1, dst2, dst3);
725  INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
726  ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
727  ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
728  CLIP_SH2_0_255(dst_r0, dst_l0);
729  dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
730  ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
731 }
732 
733 static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
734 {
735  uint8_t *temp_dst = dst;
736  uint64_t dst0, dst1, dst2, dst3;
737  v2i64 dst_vec0 = { 0 };
738  v2i64 dst_vec1 = { 0 };
739  v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
740  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
741  v16u8 zeros = { 0 };
742 
743  LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
744  LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
745  temp_dst += (4 * stride);
746 
747  INSERT_D2_SD(dst0, dst1, dst_vec0);
748  INSERT_D2_SD(dst2, dst3, dst_vec1);
749  ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
750  ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
751  ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
752  dst_r0, dst_l0, dst_r1, dst_l1);
753  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
754  PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
755  ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);
756 
757  LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
758  INSERT_D2_SD(dst0, dst1, dst_vec0);
759  INSERT_D2_SD(dst2, dst3, dst_vec1);
760  UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
761  UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
762  ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
763  dst_r0, dst_l0, dst_r1, dst_l1);
764  CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
765  PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
766  ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
767 }
768 
/*
 * Add a 16x16 block of 16-bit residuals to 8-bit samples.
 *
 *   coeffs - 256 contiguous int16_t residuals (16 per row)
 *   dst    - top-left sample; 16 bytes are read and rewritten on each of
 *            16 rows
 *   stride - distance between rows of dst, in bytes
 *
 * Four rows are handled per step: one step's worth of samples and residuals
 * is loaded up front, the loop runs three steps (each loading the data for
 * the following step before storing its own result), and the fourth step is
 * unrolled after the loop without a further load. `temp_dst` is the read
 * cursor, `dst` the write cursor. Sums are clipped with CLIP_SH8_0_255.
 */
769 static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
770 {
771  uint8_t loop_cnt;
772  uint8_t *temp_dst = dst;
773  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
774  v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
775  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
776 
777  /* Pre-load for next iteration */
778  LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
779  temp_dst += (4 * stride);
     /* Even-numbered inN take columns 0..7, odd-numbered take 8..15. */
780  LD_SH4(coeffs, 16, in0, in2, in4, in6);
781  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
782  coeffs += 64;
783 
784  for (loop_cnt = 3; loop_cnt--;) {
785  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
786  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
787  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
788  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
789 
790  dst_r0 += in0;
791  dst_l0 += in1;
792  dst_r1 += in2;
793  dst_l1 += in3;
794  dst_r2 += in4;
795  dst_l2 += in5;
796  dst_r3 += in6;
797  dst_l3 += in7;
798 
799  /* Pre-load for next iteration */
800  LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
801  temp_dst += (4 * stride);
802  LD_SH4(coeffs, 16, in0, in2, in4, in6);
803  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
804  coeffs += 64;
805 
806  CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
807  dst_r2, dst_l2, dst_r3, dst_l3);
808 
809  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
810  dst_r3, dst0, dst1, dst2, dst3);
811  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
812  dst += (4 * stride);
813  }
814 
     /* Final four rows, using the data loaded by the last loop iteration. */
815  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
816  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
817  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
818  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
819 
820  dst_r0 += in0;
821  dst_l0 += in1;
822  dst_r1 += in2;
823  dst_l1 += in3;
824  dst_r2 += in4;
825  dst_l2 += in5;
826  dst_r3 += in6;
827  dst_l3 += in7;
828 
829  CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
830  dst_r2, dst_l2, dst_r3, dst_l3);
831  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
832  dst_r3, dst0, dst1, dst2, dst3);
833  ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
834 }
835 
/*
 * Add a 32x32 block of 16-bit residuals to 8-bit samples.
 *
 *   coeffs - 1024 contiguous int16_t residuals (32 per row)
 *   dst    - top-left sample; 32 bytes are read and rewritten on each of
 *            32 rows
 *   stride - distance between rows of dst, in bytes
 *
 * Two rows are handled per step (64 residuals, two 16-byte loads per row).
 * One step's worth of data is loaded up front, the loop runs 14 steps (each
 * loading the data for the following step before storing its own result),
 * and the last two steps are unrolled after the loop. `temp_dst` is the read
 * cursor, `dst` the write cursor. Sums are clipped with CLIP_SH8_0_255.
 */
836 static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
837 {
838  uint8_t loop_cnt;
839  uint8_t *temp_dst = dst;
840  v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
841  v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
842  v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
843 
844  /* Pre-load for next iteration */
845  LD_UB2(temp_dst, 16, dst4, dst5);
846  temp_dst += stride;
847  LD_UB2(temp_dst, 16, dst6, dst7);
848  temp_dst += stride;
849  LD_SH4(coeffs, 16, in0, in2, in4, in6);
850  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
851  coeffs += 64;
852 
853  for (loop_cnt = 14; loop_cnt--;) {
854  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
855  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
856  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
857  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
858 
859  dst_r0 += in0;
860  dst_l0 += in1;
861  dst_r1 += in2;
862  dst_l1 += in3;
863  dst_r2 += in4;
864  dst_l2 += in5;
865  dst_r3 += in6;
866  dst_l3 += in7;
867 
868  /* Pre-load for next iteration */
869  LD_UB2(temp_dst, 16, dst4, dst5);
870  temp_dst += stride;
871  LD_UB2(temp_dst, 16, dst6, dst7);
872  temp_dst += stride;
873  LD_SH4(coeffs, 16, in0, in2, in4, in6);
874  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
875  coeffs += 64;
876 
877  CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
878  dst_r2, dst_l2, dst_r3, dst_l3);
879  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
880  dst_r3, dst0, dst1, dst2, dst3);
881  ST_UB2(dst0, dst1, dst, 16);
882  dst += stride;
883  ST_UB2(dst2, dst3, dst, 16);
884  dst += stride;
885  }
886 
     /* Second-to-last step: data was loaded by the final loop iteration. */
887  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
888  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
889  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
890  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
891 
892  dst_r0 += in0;
893  dst_l0 += in1;
894  dst_r1 += in2;
895  dst_l1 += in3;
896  dst_r2 += in4;
897  dst_l2 += in5;
898  dst_r3 += in6;
899  dst_l3 += in7;
900 
901  /* Pre-load for next iteration */
     /* This is the last load; coeffs is not advanced afterwards. */
902  LD_UB2(temp_dst, 16, dst4, dst5);
903  temp_dst += stride;
904  LD_UB2(temp_dst, 16, dst6, dst7);
905  temp_dst += stride;
906  LD_SH4(coeffs, 16, in0, in2, in4, in6);
907  LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
908 
909  CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
910  dst_r2, dst_l2, dst_r3, dst_l3);
911  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
912  dst_r3, dst0, dst1, dst2, dst3);
913  ST_UB2(dst0, dst1, dst, 16);
914  dst += stride;
915  ST_UB2(dst2, dst3, dst, 16);
916  dst += stride;
917 
     /* Last step. */
918  UNPCK_UB_SH(dst4, dst_r0, dst_l0);
919  UNPCK_UB_SH(dst5, dst_r1, dst_l1);
920  UNPCK_UB_SH(dst6, dst_r2, dst_l2);
921  UNPCK_UB_SH(dst7, dst_r3, dst_l3);
922 
923  dst_r0 += in0;
924  dst_l0 += in1;
925  dst_r1 += in2;
926  dst_l1 += in3;
927  dst_r2 += in4;
928  dst_l2 += in5;
929  dst_r3 += in6;
930  dst_l3 += in7;
931 
932  CLIP_SH8_0_255(dst_r0, dst_l0, dst_r1, dst_l1,
933  dst_r2, dst_l2, dst_r3, dst_l3);
934  PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
935  dst_r3, dst0, dst1, dst2, dst3);
936  ST_UB2(dst0, dst1, dst, 16);
937  dst += stride;
938  ST_UB2(dst2, dst3, dst, 16);
939 }
940 
941 static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
942 {
943  v8i16 in0, in1, dst0, dst1;
944  v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;
945 
946  LD_SH2(coeffs, 8, in0, in1);
947  UNPCK_SH_SW(in0, in_r0, in_l0);
948  UNPCK_SH_SW(in1, in_r1, in_l1);
949  HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
950  7);
951  TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
952  HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
953  12);
954 
955  /* Pack and transpose */
956  PCKEV_H2_SH(res2, res0, res3, res1, dst0, dst1);
957  ILVRL_H2_SW(dst1, dst0, res0, res1);
958  ILVRL_W2_SH(res1, res0, dst0, dst1);
959 
960  ST_SH2(dst0, dst1, coeffs, 8);
961 }
962 
/*
 * Public entry point for the 4x4 inverse transform.
 * col_limit is accepted but not used by this implementation.
 */
void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
{
    (void) col_limit;

    hevc_idct_4x4_msa(coeffs);
}
967 
/*
 * Public entry point for the 8x8 inverse transform.
 * col_limit is accepted but not used by this implementation.
 */
void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
{
    (void) col_limit;

    hevc_idct_8x8_msa(coeffs);
}
972 
/*
 * Public entry point for the 16x16 inverse transform.
 * col_limit is accepted but not used by this implementation.
 */
void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
{
    (void) col_limit;

    hevc_idct_16x16_msa(coeffs);
}
977 
/*
 * Public entry point for the 32x32 inverse transform.
 * col_limit is accepted but not used by this implementation.
 */
void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
{
    (void) col_limit;

    hevc_idct_32x32_msa(coeffs);
}
982 
/*
 * Public entry point: add a 4x4 residual block to dst.
 * Note the argument order differs from the static helper, and stride is
 * converted from ptrdiff_t to the helper's int32_t parameter.
 */
void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_4x4_msa(coeffs, dst, (int32_t) stride);
}
987 
/*
 * Public entry point: add an 8x8 residual block to dst.
 * Note the argument order differs from the static helper, and stride is
 * converted from ptrdiff_t to the helper's int32_t parameter.
 */
void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_8x8_msa(coeffs, dst, (int32_t) stride);
}
992 
/*
 * Public entry point: add a 16x16 residual block to dst.
 * Note the argument order differs from the static helper, and stride is
 * converted from ptrdiff_t to the helper's int32_t parameter.
 */
void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_16x16_msa(coeffs, dst, (int32_t) stride);
}
997 
/*
 * Public entry point: add a 32x32 residual block to dst.
 * Note the argument order differs from the static helper, and stride is
 * converted from ptrdiff_t to the helper's int32_t parameter.
 */
void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_32x32_msa(coeffs, dst, (int32_t) stride);
}
1002 
/* Public entry point for the DC-only 4x4 case; forwards to the static helper. */
void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
{
    hevc_idct_dc_4x4_msa(coeffs);
}
1007 
/* Public entry point for the DC-only 8x8 case; forwards to the static helper. */
void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
{
    hevc_idct_dc_8x8_msa(coeffs);
}
1012 
/* Public entry point for the DC-only 16x16 case; forwards to the static helper. */
void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
{
    hevc_idct_dc_16x16_msa(coeffs);
}
1017 
/* Public entry point for the DC-only 32x32 case; forwards to the static helper. */
void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
{
    hevc_idct_dc_32x32_msa(coeffs);
}
1022 
/* Public entry point for the 4x4 luma transform; forwards to the static helper. */
void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    hevc_idct_luma_4x4_msa(coeffs);
}
static const int16_t gt32x32_cnst1[64]
Definition: hevc_idct_msa.c:54
const char const char void * val
Definition: avisynth_c.h:863
static void hevc_idct_8x8_msa(int16_t *coeffs)
void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,res0, res1, res2, res3, shift)
#define ILVRL_B2_SH(...)
#define LD_SH16(...)
#define ILVR_H4_SH(...)
void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
#define PCKEV_B2_SH(...)
#define LW(psrc)
static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
#define ILVRL_H2_SW(...)
#define LD_UB4(...)
#define src
Definition: vp8dsp.c:254
#define ILVL_H2_SH(...)
#define ILVL_H4_SH(...)
#define DOTP_SH4_SW(...)
#define TRANSPOSE4x4_SW_SW(in0, in1, in2, in3, out0, out1, out2, out3)
static void filter0(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:350
uint8_t
#define LD4(psrc, stride, out0, out1, out2, out3)
#define ST_SH2(...)
#define UNPCK_UB_SH(in, out0, out1)
#define LD_UB2(...)
#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride)
#define DOTP_SH2_SW(...)
#define LD_SH(...)
void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
#define CLIP_SH8_0_255(in0, in1, in2, in3,in4, in5, in6, in7)
static void filter1(SUINT32 *dst, const int32_t *src, int32_t coeff, ptrdiff_t len)
Definition: dcadsp.c:358
#define CLIP_SH2_0_255(in0, in1)
void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
static const int16_t gt32x32_cnst0[256]
Definition: hevc_idct_msa.c:35
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3)
static int aligned(int val)
Definition: dashdec.c:178
static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
#define i(width, name, range_min, range_max)
Definition: cbs_h2645.c:259
void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
filter_frame: For filters that do not use the activate() callback, this method is called when a frame is pushed to the filter's input. It can be called at any time except in a reentrant way. If the input frame is enough to produce output, then the filter should push the output frames on the output link immediately. As an exception to the previous rule, if the input frame is enough to produce several output frames, then the filter only needs to output at least one frame per link. The additional frames can be left buffered in the filter.
#define SAT_SW2_SW(...)
#define DPADD_SH4_SW(...)
static void hevc_idct_16x16_msa(int16_t *coeffs)
static const int16_t gt8x8_cnst[16]
Definition: hevc_idct_msa.c:24
void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
static av_always_inline av_const double round(double x)
Definition: libm.h:444
#define TRANSPOSE8x8_SH_SH(...)
#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,sum0, sum1, sum2, sum3, shift)
Definition: hevc_idct_msa.c:65
static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
#define SPLATI_W4_SH(...)
#define CLIP_SH4_0_255(in0, in1, in2, in3)
#define LD_SH8(...)
static const int16_t gt32x32_cnst2[16]
Definition: hevc_idct_msa.c:61
static void hevc_idct_32x32_msa(int16_t *coeffs)
void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride)
static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
#define LD_SH2(...)
int32_t
#define PCKEV_H2_SH(...)
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch, uint8_t round)
#define PCKEV_B4_UB(...)
#define ST_UB2(...)
#define INSERT_D2_SD(...)
#define ST_UB4(...)
#define src1
Definition: h264pred.c:139
#define UNPCK_SH_SW(in, out0, out1)
void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
static int loop
Definition: ffplay.c:340
#define INSERT_W4_SW(...)
void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
#define SRAR_W2_SW(...)
void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
void * buf
Definition: avisynth_c.h:766
static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
#define ST_SH(...)
#define src0
Definition: h264pred.c:138
#define ADD2(in0, in1, in2, in3, out0, out1)
#define DPADD_SH2_SW(...)
#define LD_SW2(...)
#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,src4_r, src5_r, src6_r, src7_r,src0_l, src1_l, src2_l, src3_l,src4_l, src5_l, src6_l, src7_l, shift)
GLint GLenum GLboolean GLsizei stride
Definition: opengl_enc.c:104
#define LW4(psrc, stride, out0, out1, out2, out3)
#define ST_SH8(...)
#define ILVRL_W2_SH(...)
#define ILVR_H2_SH(...)
#define LD_SH4(...)
static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
#define ST_SW2(...)
#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)
Definition: hevc_idct_msa.c:94
static void hevc_idct_4x4_msa(int16_t *coeffs)
static const int16_t gt16x16_cnst[64]
Definition: hevc_idct_msa.c:28
static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
#define stride
static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)