/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"

static const int16_t gt8x8_cnst[16] __attribute__ ((aligned (64))) = {
    64, 64, 83, 36, 89, 50, 18, 75, 64, -64, 36, -83, 75, -89, -50, -18
};

static const int16_t gt16x16_cnst[64] __attribute__ ((aligned (64))) = {
    64, 83, 64, 36, 89, 75, 50, 18, 90, 80, 57, 25, 70, 87, 9, 43,
    64, 36, -64, -83, 75, -18, -89, -50, 87, 9, -80, -70, -43, 57, -25, -90,
    64, -36, -64, 83, 50, -89, 18, 75, 80, -70, -25, 90, -87, 9, 43, 57,
    64, -83, 64, -36, 18, -50, 75, -89, 70, -87, 90, -80, 9, -43, -57, 25
};

static const int16_t gt32x32_cnst0[256] __attribute__ ((aligned (64))) = {
    90, 90, 88, 85, 82, 78, 73, 67, 61, 54, 46, 38, 31, 22, 13, 4,
    90, 82, 67, 46, 22, -4, -31, -54, -73, -85, -90, -88, -78, -61, -38, -13,
    88, 67, 31, -13, -54, -82, -90, -78, -46, -4, 38, 73, 90, 85, 61, 22,
    85, 46, -13, -67, -90, -73, -22, 38, 82, 88, 54, -4, -61, -90, -78, -31,
    82, 22, -54, -90, -61, 13, 78, 85, 31, -46, -90, -67, 4, 73, 88, 38,
    78, -4, -82, -73, 13, 85, 67, -22, -88, -61, 31, 90, 54, -38, -90, -46,
    73, -31, -90, -22, 78, 67, -38, -90, -13, 82, 61, -46, -88, -4, 85, 54,
    67, -54, -78, 38, 85, -22, -90, 4, 90, 13, -88, -31, 82, 46, -73, -61,
    61, -73, -46, 82, 31, -88, -13, 90, -4, -90, 22, 85, -38, -78, 54, 67,
    54, -85, -4, 88, -46, -61, 82, 13, -90, 38, 67, -78, -22, 90, -31, -73,
    46, -90, 38, 54, -90, 31, 61, -88, 22, 67, -85, 13, 73, -82, 4, 78,
    38, -88, 73, -4, -67, 90, -46, -31, 85, -78, 13, 61, -90, 54, 22, -82,
    31, -78, 90, -61, 4, 54, -88, 82, -38, -22, 73, -90, 67, -13, -46, 85,
    22, -61, 85, -90, 73, -38, -4, 46, -78, 90, -82, 54, -13, -31, 67, -88,
    13, -38, 61, -78, 88, -90, 85, -73, 54, -31, 4, 22, -46, 67, -82, 90,
    4, -13, 22, -31, 38, -46, 54, -61, 67, -73, 78, -82, 85, -88, 90, -90
};

static const int16_t gt32x32_cnst1[64] __attribute__ ((aligned (64))) = {
    90, 87, 80, 70, 57, 43, 25, 9, 87, 57, 9, -43, -80, -90, -70, -25,
    80, 9, -70, -87, -25, 57, 90, 43, 70, -43, -87, 9, 90, 25, -80, -57,
    57, -80, -25, 90, -9, -87, 43, 70, 43, -90, 57, 25, -87, 70, 9, -80,
    25, -70, 90, -80, 43, 9, -57, 87, 9, -25, 43, -57, 70, -80, 87, -90
};

static const int16_t gt32x32_cnst2[16] __attribute__ ((aligned (64))) = {
    89, 75, 50, 18, 75, -18, -89, -50, 50, -89, 18, 75, 18, -50, 75, -89
};
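
/* Note: the tables above hold the even/odd factors of the HEVC inverse
 * transform matrices, pre-arranged for the SPLATI/dot-product access
 * pattern of the macros below rather than stored as plain matrix rows. */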

#define HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1,         \
                         sum0, sum1, sum2, sum3, shift)      \
{                                                            \
    v4i32 vec0, vec1, vec2, vec3, vec4, vec5;                \
    v4i32 cnst64 = __msa_ldi_w(64);                          \
    v4i32 cnst83 = __msa_ldi_w(83);                          \
    v4i32 cnst36 = __msa_ldi_w(36);                          \
                                                             \
    DOTP_SH4_SW(in_r0, in_r1, in_l0, in_l1, cnst64, cnst64,  \
                cnst83, cnst36, vec0, vec2, vec1, vec3);     \
    DOTP_SH2_SW(in_l0, in_l1, cnst36, cnst83, vec4, vec5);   \
                                                             \
    sum0 = vec0 + vec2;                                      \
    sum1 = vec0 - vec2;                                      \
    sum3 = sum0;                                             \
    sum2 = sum1;                                             \
                                                             \
    vec1 += vec3;                                            \
    vec4 -= vec5;                                            \
                                                             \
    sum0 += vec1;                                            \
    sum1 += vec4;                                            \
    sum2 -= vec4;                                            \
    sum3 -= vec1;                                            \
                                                             \
    SRARI_W4_SW(sum0, sum1, sum2, sum3, shift);              \
    SAT_SW4_SW(sum0, sum1, sum2, sum3, 15);                  \
}
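
#if 0
/* Illustrative scalar model of one column of HEVC_IDCT4x4_COL above (a
 * sketch, not part of the original file): the vector macro computes this
 * even/odd butterfly for four columns at once. */
static void hevc_idct4_col_ref(const int16_t s[4], int32_t d[4], int shift)
{
    int32_t e0  = 64 * s[0] + 64 * s[2];            /* even part */
    int32_t e1  = 64 * s[0] - 64 * s[2];
    int32_t o0  = 83 * s[1] + 36 * s[3];            /* odd part */
    int32_t o1  = 36 * s[1] - 83 * s[3];
    int32_t rnd = 1 << (shift - 1);

    d[0] = av_clip_int16((e0 + o0 + rnd) >> shift); /* SRARI + SAT */
    d[1] = av_clip_int16((e1 + o1 + rnd) >> shift);
    d[2] = av_clip_int16((e1 - o1 + rnd) >> shift);
    d[3] = av_clip_int16((e0 - o0 + rnd) >> shift);
}
#endif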

#define HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, shift)  \
{                                                                        \
    v8i16 src0_r, src1_r, src2_r, src3_r;                                \
    v8i16 src0_l, src1_l, src2_l, src3_l;                                \
    v8i16 filt0, filter0, filter1, filter2, filter3;                     \
    v4i32 temp0_r, temp1_r, temp2_r, temp3_r, temp4_r, temp5_r;          \
    v4i32 temp0_l, temp1_l, temp2_l, temp3_l, temp4_l, temp5_l;          \
    v4i32 sum0_r, sum1_r, sum2_r, sum3_r;                                \
    v4i32 sum0_l, sum1_l, sum2_l, sum3_l;                                \
                                                                         \
    ILVR_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
               src0_r, src1_r, src2_r, src3_r);                          \
    ILVL_H4_SH(in4, in0, in6, in2, in5, in1, in3, in7,                   \
               src0_l, src1_l, src2_l, src3_l);                          \
                                                                         \
    filt0 = LD_SH(filter);                                               \
    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0,        \
                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
                                                                         \
    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
                sum1_l, sum1_r);                                         \
    sum2_r = sum1_r;                                                     \
    sum2_l = sum1_l;                                                     \
    sum3_r = sum0_r;                                                     \
    sum3_l = sum0_l;                                                     \
                                                                         \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2,        \
                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
                                                                         \
    temp2_r += temp3_r;                                                  \
    temp2_l += temp3_l;                                                  \
    sum0_r += temp2_r;                                                   \
    sum0_l += temp2_l;                                                   \
    sum3_r -= temp2_r;                                                   \
    sum3_l -= temp2_l;                                                   \
                                                                         \
    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in0, in7);               \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3,        \
                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
                                                                         \
    temp4_r -= temp5_r;                                                  \
    temp4_l -= temp5_l;                                                  \
    sum1_r += temp4_r;                                                   \
    sum1_l += temp4_l;                                                   \
    sum2_r -= temp4_r;                                                   \
    sum2_l -= temp4_l;                                                   \
                                                                         \
    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in3, in4);               \
                                                                         \
    filt0 = LD_SH(filter + 8);                                           \
    SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);             \
    DOTP_SH4_SW(src0_r, src0_l, src1_r, src1_l, filter0, filter0,        \
                filter1, filter1, temp0_r, temp0_l, temp1_r, temp1_l);   \
                                                                         \
    BUTTERFLY_4(temp0_r, temp0_l, temp1_l, temp1_r, sum0_r, sum0_l,      \
                sum1_l, sum1_r);                                         \
    sum2_r = sum1_r;                                                     \
    sum2_l = sum1_l;                                                     \
    sum3_r = sum0_r;                                                     \
    sum3_l = sum0_l;                                                     \
                                                                         \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter2, filter2,        \
                filter3, filter3, temp2_r, temp2_l, temp3_r, temp3_l);   \
                                                                         \
    temp2_r += temp3_r;                                                  \
    temp2_l += temp3_l;                                                  \
    sum0_r += temp2_r;                                                   \
    sum0_l += temp2_l;                                                   \
    sum3_r -= temp2_r;                                                   \
    sum3_l -= temp2_l;                                                   \
                                                                         \
    SRARI_W4_SW(sum0_r, sum0_l, sum3_r, sum3_l, shift);                  \
    SAT_SW4_SW(sum0_r, sum0_l, sum3_r, sum3_l, 15);                      \
    PCKEV_H2_SH(sum0_l, sum0_r, sum3_l, sum3_r, in1, in6);               \
    DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter3, filter3,        \
                filter2, filter2, temp4_r, temp4_l, temp5_r, temp5_l);   \
                                                                         \
    temp4_r -= temp5_r;                                                  \
    temp4_l -= temp5_l;                                                  \
    sum1_r -= temp4_r;                                                   \
    sum1_l -= temp4_l;                                                   \
    sum2_r += temp4_r;                                                   \
    sum2_l += temp4_l;                                                   \
                                                                         \
    SRARI_W4_SW(sum1_r, sum1_l, sum2_r, sum2_l, shift);                  \
    SAT_SW4_SW(sum1_r, sum1_l, sum2_r, sum2_l, 15);                      \
    PCKEV_H2_SH(sum1_l, sum1_r, sum2_l, sum2_r, in2, in5);               \
}
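
/* Note: HEVC_IDCT8x8_COL evaluates the 8-point transform as two 4-output
 * halves.  The first eight filter constants produce rows 0/7 and 3/4, the
 * second eight rows 1/6 and 2/5, each as an even-part butterfly over
 * in0/in2/in4/in6 plus an odd-part correction from in1/in3/in5/in7. */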

#define HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r,                \
                           src4_r, src5_r, src6_r, src7_r,                \
                           src0_l, src1_l, src2_l, src3_l,                \
                           src4_l, src5_l, src6_l, src7_l, shift)         \
{                                                                         \
    int16_t *ptr0, *ptr1;                                                 \
    v8i16 filt0, filt1, dst0, dst1;                                       \
    v8i16 filter0, filter1, filter2, filter3;                             \
    v4i32 temp0_r, temp1_r, temp0_l, temp1_l;                             \
    v4i32 sum0_r, sum1_r, sum2_r, sum3_r, sum0_l, sum1_l, sum2_l;         \
    v4i32 sum3_l, res0_r, res1_r, res0_l, res1_l;                         \
                                                                          \
    ptr0 = (buf_ptr + 112);                                               \
    ptr1 = (buf_ptr + 128);                                               \
    k = -1;                                                               \
                                                                          \
    for (j = 0; j < 4; j++)                                               \
    {                                                                     \
        LD_SH2(filter, 8, filt0, filt1)                                   \
        filter += 16;                                                     \
        SPLATI_W2_SH(filt0, 0, filter0, filter1);                         \
        SPLATI_W2_SH(filt1, 0, filter2, filter3);                         \
        DOTP_SH4_SW(src0_r, src0_l, src4_r, src4_l, filter0, filter0,     \
                    filter2, filter2, sum0_r, sum0_l, sum2_r, sum2_l);    \
        DOTP_SH2_SW(src7_r, src7_l, filter2, filter2, sum3_r, sum3_l);    \
        DPADD_SH4_SW(src1_r, src1_l, src5_r, src5_l, filter1, filter1,    \
                     filter3, filter3, sum0_r, sum0_l, sum2_r, sum2_l);   \
        DPADD_SH2_SW(src6_r, src6_l, filter3, filter3, sum3_r, sum3_l);   \
                                                                          \
        sum1_r = sum0_r;                                                  \
        sum1_l = sum0_l;                                                  \
                                                                          \
        SPLATI_W2_SH(filt0, 2, filter0, filter1);                         \
        SPLATI_W2_SH(filt1, 2, filter2, filter3);                         \
        DOTP_SH2_SW(src2_r, src2_l, filter0, filter0, temp0_r, temp0_l);  \
        DPADD_SH2_SW(src6_r, src6_l, filter2, filter2, sum2_r, sum2_l);   \
        DOTP_SH2_SW(src5_r, src5_l, filter2, filter2, temp1_r, temp1_l);  \
                                                                          \
        sum0_r += temp0_r;                                                \
        sum0_l += temp0_l;                                                \
        sum1_r -= temp0_r;                                                \
        sum1_l -= temp0_l;                                                \
                                                                          \
        sum3_r = temp1_r - sum3_r;                                        \
        sum3_l = temp1_l - sum3_l;                                        \
                                                                          \
        DOTP_SH2_SW(src3_r, src3_l, filter1, filter1, temp0_r, temp0_l);  \
        DPADD_SH4_SW(src7_r, src7_l, src4_r, src4_l, filter3, filter3,    \
                     filter3, filter3, sum2_r, sum2_l, sum3_r, sum3_l);   \
                                                                          \
        sum0_r += temp0_r;                                                \
        sum0_l += temp0_l;                                                \
        sum1_r -= temp0_r;                                                \
        sum1_l -= temp0_l;                                                \
                                                                          \
        BUTTERFLY_4(sum0_r, sum0_l, sum2_l, sum2_r, res0_r, res0_l,       \
                    res1_l, res1_r);                                      \
        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
        ST_SH(dst0, buf_ptr);                                             \
        ST_SH(dst1, (buf_ptr + ((15 - (j * 2)) * 16)));                   \
                                                                          \
        BUTTERFLY_4(sum1_r, sum1_l, sum3_l, sum3_r, res0_r, res0_l,       \
                    res1_l, res1_r);                                      \
        SRARI_W4_SW(res0_r, res0_l, res1_r, res1_l, shift);               \
        SAT_SW4_SW(res0_r, res0_l, res1_r, res1_l, 15);                   \
        PCKEV_H2_SH(res0_l, res0_r, res1_l, res1_r, dst0, dst1);          \
        ST_SH(dst0, (ptr0 + (((j / 2 + j % 2) * 2 * k) * 16)));           \
        ST_SH(dst1, (ptr1 - (((j / 2 + j % 2) * 2 * k) * 16)));           \
                                                                          \
        k *= -1;                                                          \
        buf_ptr += 16;                                                    \
    }                                                                     \
}
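
/* Each of the four j iterations above emits four output rows: dst0/dst1 go
 * to the mirrored outer pair (rows j and 15 - j of the destination), while
 * the ptr0/ptr1 stores fan outward from the centre rows 7 and 8, with k
 * flipping sign so successive iterations alternate sides. */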

#define HEVC_EVEN16_CALC(input, sum0_r, sum0_l, load_idx, store_idx)  \
{                                                                     \
    LD_SW2(input + load_idx * 8, 4, tmp0_r, tmp0_l);                  \
    tmp1_r = sum0_r;                                                  \
    tmp1_l = sum0_l;                                                  \
    sum0_r += tmp0_r;                                                 \
    sum0_l += tmp0_l;                                                 \
    ST_SW2(sum0_r, sum0_l, (input + load_idx * 8), 4);                \
    tmp1_r -= tmp0_r;                                                 \
    tmp1_l -= tmp0_l;                                                 \
    ST_SW2(tmp1_r, tmp1_l, (input + store_idx * 8), 4);               \
}
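
/* HEVC_EVEN16_CALC performs one in-place butterfly stage on the 32-bit
 * even-part accumulators: the sum is written back to row load_idx and the
 * mirrored difference to row store_idx of the temporary buffer. */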

#define HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1,      \
                              res0, res1, res2, res3, shift)   \
{                                                              \
    v4i32 vec0, vec1, vec2, vec3;                              \
    v4i32 cnst74 = __msa_ldi_w(74);                            \
    v4i32 cnst55 = __msa_ldi_w(55);                            \
    v4i32 cnst29 = __msa_ldi_w(29);                            \
                                                               \
    vec0 = in_r0 + in_r1;                                      \
    vec2 = in_r0 - in_l1;                                      \
    res0 = vec0 * cnst29;                                      \
    res1 = vec2 * cnst55;                                      \
    res2 = in_r0 - in_r1;                                      \
    vec1 = in_r1 + in_l1;                                      \
    res2 += in_l1;                                             \
    vec3 = in_l0 * cnst74;                                     \
    res3 = vec0 * cnst55;                                      \
                                                               \
    res0 += vec1 * cnst55;                                     \
    res1 -= vec1 * cnst29;                                     \
    res2 *= cnst74;                                            \
    res3 += vec2 * cnst29;                                     \
                                                               \
    res0 += vec3;                                              \
    res1 += vec3;                                              \
    res3 -= vec3;                                              \
                                                               \
    SRARI_W4_SW(res0, res1, res2, res3, shift);                \
    SAT_SW4_SW(res0, res1, res2, res3, 15);                    \
}
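
#if 0
/* Illustrative scalar model of one column of HEVC_IDCT_LUMA4x4_COL above
 * (a sketch, not part of the original file): the inverse DST used for 4x4
 * intra luma residuals, with factors 29/55/74. */
static void hevc_idst4_col_ref(const int16_t s[4], int32_t d[4], int shift)
{
    int32_t c0  = s[0] + s[2];
    int32_t c1  = s[2] + s[3];
    int32_t c2  = s[0] - s[3];
    int32_t c3  = 74 * s[1];
    int32_t rnd = 1 << (shift - 1);

    d[0] = av_clip_int16((29 * c0 + 55 * c1 + c3 + rnd) >> shift);
    d[1] = av_clip_int16((55 * c2 - 29 * c1 + c3 + rnd) >> shift);
    d[2] = av_clip_int16((74 * (s[0] - s[2] + s[3]) + rnd) >> shift);
    d[3] = av_clip_int16((55 * c0 + 29 * c2 - c3 + rnd) >> shift);
}
#endif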

static void hevc_idct_4x4_msa(int16_t *coeffs)
{
    v8i16 in0, in1;
    v4i32 in_r0, in_l0, in_r1, in_l1;
    v4i32 sum0, sum1, sum2, sum3;
    v8i16 zeros = { 0 };

    LD_SH2(coeffs, 8, in0, in1);
    ILVRL_H2_SW(zeros, in0, in_r0, in_l0);
    ILVRL_H2_SW(zeros, in1, in_r1, in_l1);

    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 7);
    TRANSPOSE4x4_SW_SW(sum0, sum1, sum2, sum3, in_r0, in_l0, in_r1, in_l1);
    HEVC_IDCT4x4_COL(in_r0, in_l0, in_r1, in_l1, sum0, sum1, sum2, sum3, 12);

    /* Pack and transpose */
    PCKEV_H2_SH(sum2, sum0, sum3, sum1, in0, in1);
    ILVRL_H2_SW(in1, in0, sum0, sum1);
    ILVRL_W2_SH(sum1, sum0, in0, in1);

    ST_SH2(in0, in1, coeffs, 8);
}

static void hevc_idct_8x8_msa(int16_t *coeffs)
{
    const int16_t *filter = &gt8x8_cnst[0];
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    HEVC_IDCT8x8_COL(in0, in1, in2, in3, in4, in5, in6, in7, 12);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       in0, in1, in2, in3, in4, in5, in6, in7);
    ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs, 8);
}
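
/* All transform sizes here follow HEVC's fixed-point scheme for the 8-bit
 * decoder these kernels serve: a rounded shift of 7 after the column pass
 * and of 20 - bitdepth = 12 after the row pass, each followed by saturation
 * to 16 bits (the SAT ... 15 calls). */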

static void hevc_idct_16x16_msa(int16_t *coeffs)
{
    int16_t i, j, k;
    int16_t buf[256];
    int16_t *buf_ptr = &buf[0];
    int16_t *src = coeffs;
    const int16_t *filter = &gt16x16_cnst[0];
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 in8, in9, in10, in11, in12, in13, in14, in15;
    v8i16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;

    for (i = 2; i--;) {
        LD_SH16(src, 16, in0, in1, in2, in3, in4, in5, in6, in7,
                in8, in9, in10, in11, in12, in13, in14, in15);

        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_r, src1_r, src2_r, src3_r);
        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_r, src5_r, src6_r, src7_r);
        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_l, src1_l, src2_l, src3_l);
        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_l, src5_l, src6_l, src7_l);
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 7);

        src += 8;
        buf_ptr = (&buf[0] + 8);
        filter = &gt16x16_cnst[0];
    }

    src = &buf[0];
    buf_ptr = coeffs;
    filter = &gt16x16_cnst[0];

    for (i = 2; i--;) {
        LD_SH16(src, 8, in0, in8, in1, in9, in2, in10, in3, in11,
                in4, in12, in5, in13, in6, in14, in7, in15);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                           in8, in9, in10, in11, in12, in13, in14, in15);
        ILVR_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_r, src1_r, src2_r, src3_r);
        ILVR_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_r, src5_r, src6_r, src7_r);
        ILVL_H4_SH(in4, in0, in12, in8, in6, in2, in14, in10,
                   src0_l, src1_l, src2_l, src3_l);
        ILVL_H4_SH(in5, in1, in13, in9, in3, in7, in11, in15,
                   src4_l, src5_l, src6_l, src7_l);
        HEVC_IDCT16x16_COL(src0_r, src1_r, src2_r, src3_r, src4_r, src5_r,
                           src6_r, src7_r, src0_l, src1_l, src2_l, src3_l,
                           src4_l, src5_l, src6_l, src7_l, 12);

        src += 128;
        buf_ptr = coeffs + 8;
        filter = &gt16x16_cnst[0];
    }

    LD_SH8(coeffs, 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, coeffs, 16);

    LD_SH8((coeffs + 8), 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    LD_SH8((coeffs + 128), 16, in8, in9, in10, in11, in12, in13, in14, in15);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 128), 16);
    TRANSPOSE8x8_SH_SH(in8, in9, in10, in11, in12, in13, in14, in15,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 8), 16);

    LD_SH8((coeffs + 136), 16, in0, in1, in2, in3, in4, in5, in6, in7);
    TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                       vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7);
    ST_SH8(vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, (coeffs + 136), 16);
}

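/* Inverse transform of one 8-column strip of a 32x32 block.  The 32-point
 * transform is decomposed the usual even/odd way: coefficients 0/8/16/24
 * form the 4-point core, 4/12/20/28 complete the 8-point even part,
 * 2/6/.../30 extend it to 16 points (accumulated in tmp_buf), and the 16
 * odd coefficients are then added to and subtracted from the even half to
 * produce rows i and 31 - i. */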
static void hevc_idct_8x32_column_msa(int16_t *coeffs, uint8_t buf_pitch,
                                      uint8_t round)
{
    uint8_t i;
    const int16_t *filter_ptr0 = &gt32x32_cnst0[0];
    const int16_t *filter_ptr1 = &gt32x32_cnst1[0];
    const int16_t *filter_ptr2 = &gt32x32_cnst2[0];
    const int16_t *filter_ptr3 = &gt8x8_cnst[0];
    int16_t *src0 = (coeffs + buf_pitch);
    int16_t *src1 = (coeffs + 2 * buf_pitch);
    int16_t *src2 = (coeffs + 4 * buf_pitch);
    int16_t *src3 = (coeffs);
    int32_t cnst0, cnst1;
    int32_t tmp_buf[8 * 32 + 15];
    int32_t *tmp_buf_ptr = tmp_buf + 15;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8i16 src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l, src7_l;
    v8i16 filt0, filter0, filter1, filter2, filter3;
    v4i32 sum0_r, sum0_l, sum1_r, sum1_l, tmp0_r, tmp0_l, tmp1_r, tmp1_l;

    /* Align pointer to 64 byte boundary */
    tmp_buf_ptr = (int32_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* process coeff 4, 12, 20, 28 */
    LD_SH4(src2, 8 * buf_pitch, in0, in1, in2, in3);
    ILVR_H2_SH(in1, in0, in3, in2, src0_r, src1_r);
    ILVL_H2_SH(in1, in0, in3, in2, src0_l, src1_l);

    LD_SH2(src3, 16 * buf_pitch, in4, in6);
    LD_SH2((src3 + 8 * buf_pitch), 16 * buf_pitch, in5, in7);
    ILVR_H2_SH(in6, in4, in7, in5, src2_r, src3_r);
    ILVL_H2_SH(in6, in4, in7, in5, src2_l, src3_l);

    /* loop for all columns of constants */
    for (i = 0; i < 2; i++) {
        /* processing single column of constants */
        cnst0 = LW(filter_ptr2);
        cnst1 = LW(filter_ptr2 + 2);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + 2 * i * 8), 4);

        /* processing single column of constants */
        cnst0 = LW(filter_ptr2 + 4);
        cnst1 = LW(filter_ptr2 + 6);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH2_SW(src1_r, src1_l, filter1, filter1, sum0_r, sum0_l);
        ST_SW2(sum0_r, sum0_l, (tmp_buf_ptr + (2 * i + 1) * 8), 4);

        filter_ptr2 += 8;
    }

    /* process coeff 0, 8, 16, 24 */
    /* loop for all columns of constants */
    for (i = 0; i < 2; i++) {
        /* processing first column of filter constants */
        cnst0 = LW(filter_ptr3);
        cnst1 = LW(filter_ptr3 + 2);

        filter0 = (v8i16) __msa_fill_w(cnst0);
        filter1 = (v8i16) __msa_fill_w(cnst1);

        DOTP_SH4_SW(src2_r, src2_l, src3_r, src3_l, filter0, filter0, filter1,
                    filter1, sum0_r, sum0_l, tmp1_r, tmp1_l);

        sum1_r = sum0_r - tmp1_r;
        sum1_l = sum0_l - tmp1_l;
        sum0_r = sum0_r + tmp1_r;
        sum0_l = sum0_l + tmp1_l;

        HEVC_EVEN16_CALC(tmp_buf_ptr, sum0_r, sum0_l, i, (7 - i));
        HEVC_EVEN16_CALC(tmp_buf_ptr, sum1_r, sum1_l, (3 - i), (4 + i));

        filter_ptr3 += 8;
    }

    /* process coeff 2 6 10 14 18 22 26 30 */
    LD_SH8(src1, 4 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_r, src1_r, src2_r, src3_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_l, src1_l, src2_l, src3_l);

    /* loop for all columns of constants */
    for (i = 0; i < 8; i++) {
        /* processing single column of constants */
        filt0 = LD_SH(filter_ptr1);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);

        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r += sum0_r;
        tmp0_l += sum0_l;
        ST_SW2(tmp0_r, tmp0_l, (tmp_buf_ptr + i * 8), 4);
        tmp1_r -= sum0_r;
        tmp1_l -= sum0_l;
        ST_SW2(tmp1_r, tmp1_l, (tmp_buf_ptr + (15 - i) * 8), 4);

        filter_ptr1 += 8;
    }

    /* process coeff 1 3 5 7 9 11 13 15 17 19 21 23 25 27 29 31 */
    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    src0 += 16 * buf_pitch;
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_r, src1_r, src2_r, src3_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src0_l, src1_l, src2_l, src3_l);

    LD_SH8(src0, 2 * buf_pitch, in0, in1, in2, in3, in4, in5, in6, in7);
    ILVR_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src4_r, src5_r, src6_r, src7_r);
    ILVL_H4_SH(in1, in0, in3, in2, in5, in4, in7, in6,
               src4_l, src5_l, src6_l, src7_l);

    /* loop for all columns of filter constants */
    for (i = 0; i < 16; i++) {
        /* processing single column of constants */
        filt0 = LD_SH(filter_ptr0);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src0_r, src0_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src1_r, src1_l, src2_r, src2_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src3_r, src3_l, filter3, filter3, sum0_r, sum0_l);

        tmp1_r = sum0_r;
        tmp1_l = sum0_l;

        filt0 = LD_SH(filter_ptr0 + 8);
        SPLATI_W4_SH(filt0, filter0, filter1, filter2, filter3);
        DOTP_SH2_SW(src4_r, src4_l, filter0, filter0, sum0_r, sum0_l);
        DPADD_SH4_SW(src5_r, src5_l, src6_r, src6_l, filter1, filter1, filter2,
                     filter2, sum0_r, sum0_l, sum0_r, sum0_l);
        DPADD_SH2_SW(src7_r, src7_l, filter3, filter3, sum0_r, sum0_l);

        sum0_r += tmp1_r;
        sum0_l += tmp1_l;

        LD_SW2(tmp_buf_ptr + i * 8, 4, tmp0_r, tmp0_l);
        tmp1_r = tmp0_r;
        tmp1_l = tmp0_l;
        tmp0_r += sum0_r;
        tmp0_l += sum0_l;
        sum1_r = __msa_fill_w(round);
        SRAR_W2_SW(tmp0_r, tmp0_l, sum1_r);
        SAT_SW2_SW(tmp0_r, tmp0_l, 15);
        in0 = __msa_pckev_h((v8i16) tmp0_l, (v8i16) tmp0_r);
        ST_SH(in0, (coeffs + i * buf_pitch));
        tmp1_r -= sum0_r;
        tmp1_l -= sum0_l;
        SRAR_W2_SW(tmp1_r, tmp1_l, sum1_r);
        SAT_SW2_SW(tmp1_r, tmp1_l, 15);
        in0 = __msa_pckev_h((v8i16) tmp1_l, (v8i16) tmp1_r);
        ST_SH(in0, (coeffs + (31 - i) * buf_pitch));

        filter_ptr0 += 16;
    }
}

static void hevc_idct_transpose_32x8_to_8x32(int16_t *coeffs, int16_t *tmp_buf)
{
    uint8_t i;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        LD_SH8(coeffs + i * 8, 32, in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, tmp_buf + i * 8 * 8, 8);
    }
}

static void hevc_idct_transpose_8x32_to_32x8(int16_t *tmp_buf, int16_t *coeffs)
{
    uint8_t i;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (i = 0; i < 4; i++) {
        LD_SH8(tmp_buf + i * 8 * 8, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        TRANSPOSE8x8_SH_SH(in0, in1, in2, in3, in4, in5, in6, in7,
                           in0, in1, in2, in3, in4, in5, in6, in7);
        ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, coeffs + i * 8, 32);
    }
}

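/* A 32x32 inverse transform is handled as four vertical 8x32 strips followed
 * by four horizontal 32x8 strips; each row strip is transposed into an 8x32
 * layout, run through the same column routine, and transposed back. */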
static void hevc_idct_32x32_msa(int16_t *coeffs)
{
    uint8_t row_cnt, col_cnt;
    int16_t *src = coeffs;
    int16_t tmp_buf[8 * 32 + 31];
    int16_t *tmp_buf_ptr = tmp_buf + 31;
    uint8_t round;
    uint8_t buf_pitch;

    /* Align pointer to 64 byte boundary */
    tmp_buf_ptr = (int16_t *)(((uintptr_t) tmp_buf_ptr) & ~(uintptr_t) 63);

    /* column transform */
    round = 7;
    buf_pitch = 32;
    for (col_cnt = 0; col_cnt < 4; col_cnt++) {
        /* process 8x32 blocks */
        hevc_idct_8x32_column_msa((coeffs + col_cnt * 8), buf_pitch, round);
    }

    /* row transform */
    round = 12;
    buf_pitch = 8;
    for (row_cnt = 0; row_cnt < 4; row_cnt++) {
        /* process 32x8 blocks */
        src = (coeffs + 32 * 8 * row_cnt);

        hevc_idct_transpose_32x8_to_8x32(src, tmp_buf_ptr);
        hevc_idct_8x32_column_msa(tmp_buf_ptr, buf_pitch, round);
        hevc_idct_transpose_8x32_to_32x8(tmp_buf_ptr, src);
    }
}

static void hevc_idct_dc_4x4_msa(int16_t *coeffs)
{
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    ST_SH2(dst, dst, coeffs, 8);
}
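
/* For a DC-only block the two rounded shifts above fold both transform
 * passes: pass 1 reduces to (64 * dc + 64) >> 7 == (dc + 1) >> 1, and
 * pass 2 to (64 * val + 2048) >> 12 == (val + 32) >> 6.  The 8x8, 16x16
 * and 32x32 DC routines below use the same arithmetic. */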

static void hevc_idct_dc_8x8_msa(int16_t *coeffs)
{
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
}

static void hevc_idct_dc_16x16_msa(int16_t *coeffs)
{
    uint8_t loop;
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    for (loop = 4; loop--;) {
        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
        coeffs += 8 * 8;
    }
}

static void hevc_idct_dc_32x32_msa(int16_t *coeffs)
{
    uint8_t loop;
    int32_t val;
    v8i16 dst;

    val = (coeffs[0] + 1) >> 1;
    val = (val + 32) >> 6;
    dst = __msa_fill_h(val);

    for (loop = 16; loop--;) {
        ST_SH8(dst, dst, dst, dst, dst, dst, dst, dst, coeffs, 8);
        coeffs += 8 * 8;
    }
}

static void hevc_addblk_4x4_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint32_t dst0, dst1, dst2, dst3;
    v8i16 dst_r0, dst_l0, in0, in1;
    v4i32 dst_vec = { 0 };
    v16u8 zeros = { 0 };

    LD_SH2(coeffs, 8, in0, in1);
    LW4(dst, stride, dst0, dst1, dst2, dst3);
    INSERT_W4_SW(dst0, dst1, dst2, dst3, dst_vec);
    ILVRL_B2_SH(zeros, dst_vec, dst_r0, dst_l0);
    ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
    CLIP_SH2_0_255(dst_r0, dst_l0);
    dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
    ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
}
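
#if 0
/* Illustrative scalar equivalent of the add-block routines (a sketch, not
 * part of the original file): add the residual to the prediction in dst
 * and clip the result to 8 bits. */
static void hevc_addblk_ref(const int16_t *coeffs, uint8_t *dst,
                            int32_t stride, int32_t size)
{
    int32_t i, j;

    for (i = 0; i < size; i++) {
        for (j = 0; j < size; j++)
            dst[j] = av_clip_uint8(dst[j] + coeffs[i * size + j]);
        dst += stride;
    }
}
#endif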

static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t *temp_dst = dst;
    uint64_t dst0, dst1, dst2, dst3;
    v2i64 dst_vec0 = { 0 };
    v2i64 dst_vec1 = { 0 };
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v16u8 zeros = { 0 };

    LD_SH8(coeffs, 8, in0, in1, in2, in3, in4, in5, in6, in7);
    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    temp_dst += (4 * stride);

    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    ILVRL_B2_SH(zeros, dst_vec0, dst_r0, dst_l0);
    ILVRL_B2_SH(zeros, dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in0, dst_l0, in1, dst_r1, in2, dst_l1, in3,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);

    LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
    INSERT_D2_SD(dst0, dst1, dst_vec0);
    INSERT_D2_SD(dst2, dst3, dst_vec1);
    UNPCK_UB_SH(dst_vec0, dst_r0, dst_l0);
    UNPCK_UB_SH(dst_vec1, dst_r1, dst_l1);
    ADD4(dst_r0, in4, dst_l0, in5, dst_r1, in6, dst_l1, in7,
         dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
    ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
}

static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t loop_cnt;
    uint8_t *temp_dst = dst;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    /* Pre-load for next iteration */
    LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
    temp_dst += (4 * stride);
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    coeffs += 64;

    for (loop_cnt = 3; loop_cnt--;) {
        UNPCK_UB_SH(dst4, dst_r0, dst_l0);
        UNPCK_UB_SH(dst5, dst_r1, dst_l1);
        UNPCK_UB_SH(dst6, dst_r2, dst_l2);
        UNPCK_UB_SH(dst7, dst_r3, dst_l3);

        dst_r0 += in0;
        dst_l0 += in1;
        dst_r1 += in2;
        dst_l1 += in3;
        dst_r2 += in4;
        dst_l2 += in5;
        dst_r3 += in6;
        dst_l3 += in7;

        /* Pre-load for next iteration */
        LD_UB4(temp_dst, stride, dst4, dst5, dst6, dst7);
        temp_dst += (4 * stride);
        LD_SH4(coeffs, 16, in0, in2, in4, in6);
        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
        coeffs += 64;

        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                    dst_r3, dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
}
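
/* The 16x16 and 32x32 add-block loops are software-pipelined: each
 * iteration pre-loads the prediction and coefficients for the next round
 * before clipping and storing the current one, hiding load latency behind
 * the arithmetic. */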

static void hevc_addblk_32x32_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
{
    uint8_t loop_cnt;
    uint8_t *temp_dst = dst;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 dst_r0, dst_l0, dst_r1, dst_l1, dst_r2, dst_l2, dst_r3, dst_l3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
    coeffs += 64;

    for (loop_cnt = 14; loop_cnt--;) {
        UNPCK_UB_SH(dst4, dst_r0, dst_l0);
        UNPCK_UB_SH(dst5, dst_r1, dst_l1);
        UNPCK_UB_SH(dst6, dst_r2, dst_l2);
        UNPCK_UB_SH(dst7, dst_r3, dst_l3);

        dst_r0 += in0;
        dst_l0 += in1;
        dst_r1 += in2;
        dst_l1 += in3;
        dst_r2 += in4;
        dst_l2 += in5;
        dst_r3 += in6;
        dst_l3 += in7;

        /* Pre-load for next iteration */
        LD_UB2(temp_dst, 16, dst4, dst5);
        temp_dst += stride;
        LD_UB2(temp_dst, 16, dst6, dst7);
        temp_dst += stride;
        LD_SH4(coeffs, 16, in0, in2, in4, in6);
        LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);
        coeffs += 64;

        CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
        CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
        PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                    dst_r3, dst0, dst1, dst2, dst3);
        ST_UB2(dst0, dst1, dst, 16);
        dst += stride;
        ST_UB2(dst2, dst3, dst, 16);
        dst += stride;
    }

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    /* Pre-load for next iteration */
    LD_UB2(temp_dst, 16, dst4, dst5);
    temp_dst += stride;
    LD_UB2(temp_dst, 16, dst6, dst7);
    temp_dst += stride;
    LD_SH4(coeffs, 16, in0, in2, in4, in6);
    LD_SH4((coeffs + 8), 16, in1, in3, in5, in7);

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
    dst += stride;

    UNPCK_UB_SH(dst4, dst_r0, dst_l0);
    UNPCK_UB_SH(dst5, dst_r1, dst_l1);
    UNPCK_UB_SH(dst6, dst_r2, dst_l2);
    UNPCK_UB_SH(dst7, dst_r3, dst_l3);

    dst_r0 += in0;
    dst_l0 += in1;
    dst_r1 += in2;
    dst_l1 += in3;
    dst_r2 += in4;
    dst_l2 += in5;
    dst_r3 += in6;
    dst_l3 += in7;

    CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
    CLIP_SH4_0_255(dst_r2, dst_l2, dst_r3, dst_l3);
    PCKEV_B4_UB(dst_l0, dst_r0, dst_l1, dst_r1, dst_l2, dst_r2, dst_l3,
                dst_r3, dst0, dst1, dst2, dst3);
    ST_UB2(dst0, dst1, dst, 16);
    dst += stride;
    ST_UB2(dst2, dst3, dst, 16);
}

static void hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    v8i16 in0, in1, dst0, dst1;
    v4i32 in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3;

    LD_SH2(coeffs, 8, in0, in1);
    UNPCK_SH_SW(in0, in_r0, in_l0);
    UNPCK_SH_SW(in1, in_r1, in_l1);
    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
                          7);
    TRANSPOSE4x4_SW_SW(res0, res1, res2, res3, in_r0, in_l0, in_r1, in_l1);
    HEVC_IDCT_LUMA4x4_COL(in_r0, in_l0, in_r1, in_l1, res0, res1, res2, res3,
                          12);

    /* Pack and transpose */
    PCKEV_H2_SH(res2, res0, res3, res1, dst0, dst1);
    ILVRL_H2_SW(dst1, dst0, res0, res1);
    ILVRL_W2_SH(res1, res0, dst0, dst1);

    ST_SH2(dst0, dst1, coeffs, 8);
}

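/* Exported wrappers matching the hevcdsp function-pointer signatures; the
 * idct wrappers accept (and ignore) col_limit, which these MSA kernels do
 * not exploit. */
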
void ff_hevc_idct_4x4_msa(int16_t *coeffs, int col_limit)
{
    hevc_idct_4x4_msa(coeffs);
}

void ff_hevc_idct_8x8_msa(int16_t *coeffs, int col_limit)
{
    hevc_idct_8x8_msa(coeffs);
}

void ff_hevc_idct_16x16_msa(int16_t *coeffs, int col_limit)
{
    hevc_idct_16x16_msa(coeffs);
}

void ff_hevc_idct_32x32_msa(int16_t *coeffs, int col_limit)
{
    hevc_idct_32x32_msa(coeffs);
}

void ff_hevc_addblk_4x4_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_4x4_msa(coeffs, dst, stride);
}

void ff_hevc_addblk_8x8_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_8x8_msa(coeffs, dst, stride);
}

void ff_hevc_addblk_16x16_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_16x16_msa(coeffs, dst, stride);
}

void ff_hevc_addblk_32x32_msa(uint8_t *dst, int16_t *coeffs, ptrdiff_t stride)
{
    hevc_addblk_32x32_msa(coeffs, dst, stride);
}

void ff_hevc_idct_dc_4x4_msa(int16_t *coeffs)
{
    hevc_idct_dc_4x4_msa(coeffs);
}

void ff_hevc_idct_dc_8x8_msa(int16_t *coeffs)
{
    hevc_idct_dc_8x8_msa(coeffs);
}

void ff_hevc_idct_dc_16x16_msa(int16_t *coeffs)
{
    hevc_idct_dc_16x16_msa(coeffs);
}

void ff_hevc_idct_dc_32x32_msa(int16_t *coeffs)
{
    hevc_idct_dc_32x32_msa(coeffs);
}

void ff_hevc_idct_luma_4x4_msa(int16_t *coeffs)
{
    hevc_idct_luma_4x4_msa(coeffs);
}