FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
jrevdct.c
Go to the documentation of this file.
1 /*
2  * This file is part of the Independent JPEG Group's software.
3  *
4  * The authors make NO WARRANTY or representation, either express or implied,
5  * with respect to this software, its quality, accuracy, merchantability, or
6  * fitness for a particular purpose. This software is provided "AS IS", and
7  * you, its user, assume the entire risk as to its quality and accuracy.
8  *
9  * This software is copyright (C) 1991, 1992, Thomas G. Lane.
10  * All Rights Reserved except as specified below.
11  *
12  * Permission is hereby granted to use, copy, modify, and distribute this
13  * software (or portions thereof) for any purpose, without fee, subject to
14  * these conditions:
15  * (1) If any part of the source code for this software is distributed, then
16  * this README file must be included, with this copyright and no-warranty
17  * notice unaltered; and any additions, deletions, or changes to the original
18  * files must be clearly indicated in accompanying documentation.
19  * (2) If only executable code is distributed, then the accompanying
20  * documentation must state that "this software is based in part on the work
21  * of the Independent JPEG Group".
22  * (3) Permission for use of this software is granted only if the user accepts
23  * full responsibility for any undesirable consequences; the authors accept
24  * NO LIABILITY for damages of any kind.
25  *
26  * These conditions apply to any software derived from or based on the IJG
27  * code, not just to the unmodified library. If you use our work, you ought
28  * to acknowledge us.
29  *
30  * Permission is NOT granted for the use of any IJG author's name or company
31  * name in advertising or publicity relating to this software or products
32  * derived from it. This software may be referred to only as "the Independent
33  * JPEG Group's software".
34  *
35  * We specifically permit and encourage the use of this software as the basis
36  * of commercial products, provided that all warranty or liability claims are
37  * assumed by the product vendor.
38  *
39  * This file contains the basic inverse-DCT transformation subroutine.
40  *
41  * This implementation is based on an algorithm described in
42  * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
43  * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
44  * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
45  * The primary algorithm described there uses 11 multiplies and 29 adds.
46  * We use their alternate method with 12 multiplies and 32 adds.
47  * The advantage of this method is that no data path contains more than one
48  * multiplication; this allows a very simple and accurate implementation in
49  * scaled fixed-point arithmetic, with a minimal number of shifts.
50  *
51  * I've made lots of modifications to attempt to take advantage of the
52  * sparse nature of the DCT matrices we're getting. Although the logic
53  * is cumbersome, it's straightforward and the resulting code is much
54  * faster.
55  *
56  * A better way to do this would be to pass in the DCT block as a sparse
57  * matrix, perhaps with the different cases encoded.
58  */
59 
60 /**
61  * @file
62  * Independent JPEG Group's LLM (Loeffler-Ligtenberg-Moschytz) IDCT.
63  */
64 
65 #include "libavutil/common.h"
66 #include "libavutil/intreadwrite.h"
67 
68 #include "dct.h"
69 #include "idctdsp.h"
70 
71 #define EIGHT_BIT_SAMPLES
72 
73 #define DCTSIZE 8
74 #define DCTSIZE2 64
75 
76 #define GLOBAL
77 
78 #define RIGHT_SHIFT(x, n) ((x) >> (n))
79 
80 typedef int16_t DCTBLOCK[DCTSIZE2];
81 
82 #define CONST_BITS 13
83 
84 /*
85  * This routine is specialized to the case DCTSIZE = 8.
86  */
87 
88 #if DCTSIZE != 8
89  Sorry, this code only copes with 8x8 DCTs. /* deliberate syntax err */
90 #endif
91 
92 
93 /*
94  * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
95  * on each column. Direct algorithms are also available, but they are
96  * much more complex and seem not to be any faster when reduced to code.
97  *
98  * The reasoning behind this scaling is as follows:
99  *
100  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
101  * larger than the true IDCT outputs. The final outputs are therefore
102  * a factor of N larger than desired; since N=8 this can be cured by
103  * a simple right shift at the end of the algorithm. The advantage of
104  * this arrangement is that we save two multiplications per 1-D IDCT,
105  * because the y0 and y4 inputs need not be divided by sqrt(N).
106  *
107  * We have to do addition and subtraction of the integer inputs, which
108  * is no problem, and multiplication by fractional constants, which is
109  * a problem to do in integer arithmetic. We multiply all the constants
110  * by CONST_SCALE and convert them to integer constants (thus retaining
111  * CONST_BITS bits of precision in the constants). After doing a
112  * multiplication we have to divide the product by CONST_SCALE, with proper
113  * rounding, to produce the correct output. This division can be done
114  * cheaply as a right shift of CONST_BITS bits. We postpone shifting
115  * as long as possible so that partial sums can be added together with
116  * full fractional precision.
117  *
118  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
119  * they are represented to better-than-integral precision. These outputs
120  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
121  * with the recommended scaling. (To scale up 12-bit sample data further, an
122  * intermediate int32 array would be needed.)
123  *
124  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
125  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
126  * shows that the values given below are the most effective.
127  */
128 
129 #ifdef EIGHT_BIT_SAMPLES
130 #define PASS1_BITS 2
131 #else
132 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
133 #endif
134 
135 #define ONE ((int32_t) 1)
136 
137 #define CONST_SCALE (ONE << CONST_BITS)
138 
139 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
140  * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
141  * you will pay a significant penalty in run time. In that case, figure
142  * the correct integer constant values and insert them by hand.
143  */
144 
145 /* FIX is no longer used: all the scaled constants below are precomputed. */
146 #define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5))
147 
148 /* Descale and correctly round an int32_t value that's scaled by N bits.
149  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
150  * the fudge factor is correct for either sign of X.
151  */
152 
153 #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
154 
155 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
156  * For 8-bit samples with the recommended scaling, all the variable
157  * and constant values involved are no more than 16 bits wide, so a
158  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
159  * this provides a useful speedup on many machines.
160  * There is no way to specify a 16x16->32 multiply in portable C, but
161  * some C compilers will do the right thing if you provide the correct
162  * combination of casts.
163  * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
164  */
165 
166 #ifdef EIGHT_BIT_SAMPLES
167 #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
168 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const)))
169 #endif
170 #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
171 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const)))
172 #endif
173 #endif
174 
175 #ifndef MULTIPLY /* default definition */
176 #define MULTIPLY(var,const) ((var) * (const))
177 #endif
178 
179 
180 /*
181  Unlike our decoder, where we approximate the FIX constants, we need to use
182 exact ones here; otherwise successive P-frames drift too much with reference-frame coding.
183 */
184 #define FIX_0_211164243 1730
185 #define FIX_0_275899380 2260
186 #define FIX_0_298631336 2446
187 #define FIX_0_390180644 3196
188 #define FIX_0_509795579 4176
189 #define FIX_0_541196100 4433
190 #define FIX_0_601344887 4926
191 #define FIX_0_765366865 6270
192 #define FIX_0_785694958 6436
193 #define FIX_0_899976223 7373
194 #define FIX_1_061594337 8697
195 #define FIX_1_111140466 9102
196 #define FIX_1_175875602 9633
197 #define FIX_1_306562965 10703
198 #define FIX_1_387039845 11363
199 #define FIX_1_451774981 11893
200 #define FIX_1_501321110 12299
201 #define FIX_1_662939225 13623
202 #define FIX_1_847759065 15137
203 #define FIX_1_961570560 16069
204 #define FIX_2_053119869 16819
205 #define FIX_2_172734803 17799
206 #define FIX_2_562915447 20995
207 #define FIX_3_072711026 25172
208 
209 /*
210  * Perform the inverse DCT on one block of coefficients.
211  */
212 
214 {
215  int32_t tmp0, tmp1, tmp2, tmp3;
216  int32_t tmp10, tmp11, tmp12, tmp13;
217  int32_t z1, z2, z3, z4, z5;
218  int32_t d0, d1, d2, d3, d4, d5, d6, d7;
219  register int16_t *dataptr;
220  int rowctr;
221 
222  /* Pass 1: process rows. */
223  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
224  /* furthermore, we scale the results by 2**PASS1_BITS. */
225 
226  dataptr = data;
227 
228  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
229  /* Due to quantization, we will usually find that many of the input
230  * coefficients are zero, especially the AC terms. We can exploit this
231  * by short-circuiting the IDCT calculation for any row in which all
232  * the AC terms are zero. In that case each output is equal to the
233  * DC coefficient (with scale factor as needed).
234  * With typical images and quantization tables, half or more of the
235  * row DCT calculations can be simplified this way.
236  */
237 
238  register uint8_t *idataptr = (uint8_t*)dataptr;
239 
240  /* WARNING: we do the same permutation as MMX idct to simplify the
241  video core */
242  d0 = dataptr[0];
243  d2 = dataptr[1];
244  d4 = dataptr[2];
245  d6 = dataptr[3];
246  d1 = dataptr[4];
247  d3 = dataptr[5];
248  d5 = dataptr[6];
249  d7 = dataptr[7];
250 
251  if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
252  /* AC terms all zero */
253  if (d0) {
254  /* Compute a 32 bit value to assign. */
255  int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
256  register int v = (dcval & 0xffff) | ((dcval * (1 << 16)) & 0xffff0000);
257 
258  AV_WN32A(&idataptr[ 0], v);
259  AV_WN32A(&idataptr[ 4], v);
260  AV_WN32A(&idataptr[ 8], v);
261  AV_WN32A(&idataptr[12], v);
262  }
263 
264  dataptr += DCTSIZE; /* advance pointer to next row */
265  continue;
266  }
267 
268  /* Even part: reverse the even part of the forward DCT. */
269  /* The rotator is sqrt(2)*c(-6). */
270 {
271  if (d6) {
272  if (d2) {
273  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
274  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
275  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
276  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
277 
278  tmp0 = (d0 + d4) * CONST_SCALE;
279  tmp1 = (d0 - d4) * CONST_SCALE;
280 
281  tmp10 = tmp0 + tmp3;
282  tmp13 = tmp0 - tmp3;
283  tmp11 = tmp1 + tmp2;
284  tmp12 = tmp1 - tmp2;
285  } else {
286  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
287  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
288  tmp3 = MULTIPLY(d6, FIX_0_541196100);
289 
290  tmp0 = (d0 + d4) * CONST_SCALE;
291  tmp1 = (d0 - d4) * CONST_SCALE;
292 
293  tmp10 = tmp0 + tmp3;
294  tmp13 = tmp0 - tmp3;
295  tmp11 = tmp1 + tmp2;
296  tmp12 = tmp1 - tmp2;
297  }
298  } else {
299  if (d2) {
300  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
301  tmp2 = MULTIPLY(d2, FIX_0_541196100);
302  tmp3 = MULTIPLY(d2, FIX_1_306562965);
303 
304  tmp0 = (d0 + d4) * CONST_SCALE;
305  tmp1 = (d0 - d4) * CONST_SCALE;
306 
307  tmp10 = tmp0 + tmp3;
308  tmp13 = tmp0 - tmp3;
309  tmp11 = tmp1 + tmp2;
310  tmp12 = tmp1 - tmp2;
311  } else {
312  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
313  tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
314  tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
315  }
316  }
317 
318  /* Odd part per figure 8; the matrix is unitary and hence its
319  * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
320  */
321 
322  if (d7) {
323  if (d5) {
324  if (d3) {
325  if (d1) {
326  /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
327  z1 = d7 + d1;
328  z2 = d5 + d3;
329  z3 = d7 + d3;
330  z4 = d5 + d1;
331  z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
332 
333  tmp0 = MULTIPLY(d7, FIX_0_298631336);
334  tmp1 = MULTIPLY(d5, FIX_2_053119869);
335  tmp2 = MULTIPLY(d3, FIX_3_072711026);
336  tmp3 = MULTIPLY(d1, FIX_1_501321110);
337  z1 = MULTIPLY(-z1, FIX_0_899976223);
338  z2 = MULTIPLY(-z2, FIX_2_562915447);
339  z3 = MULTIPLY(-z3, FIX_1_961570560);
340  z4 = MULTIPLY(-z4, FIX_0_390180644);
341 
342  z3 += z5;
343  z4 += z5;
344 
345  tmp0 += z1 + z3;
346  tmp1 += z2 + z4;
347  tmp2 += z2 + z3;
348  tmp3 += z1 + z4;
349  } else {
350  /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
351  z2 = d5 + d3;
352  z3 = d7 + d3;
353  z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
354 
355  tmp0 = MULTIPLY(d7, FIX_0_298631336);
356  tmp1 = MULTIPLY(d5, FIX_2_053119869);
357  tmp2 = MULTIPLY(d3, FIX_3_072711026);
358  z1 = MULTIPLY(-d7, FIX_0_899976223);
359  z2 = MULTIPLY(-z2, FIX_2_562915447);
360  z3 = MULTIPLY(-z3, FIX_1_961570560);
361  z4 = MULTIPLY(-d5, FIX_0_390180644);
362 
363  z3 += z5;
364  z4 += z5;
365 
366  tmp0 += z1 + z3;
367  tmp1 += z2 + z4;
368  tmp2 += z2 + z3;
369  tmp3 = z1 + z4;
370  }
371  } else {
372  if (d1) {
373  /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
374  z1 = d7 + d1;
375  z4 = d5 + d1;
376  z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
377 
378  tmp0 = MULTIPLY(d7, FIX_0_298631336);
379  tmp1 = MULTIPLY(d5, FIX_2_053119869);
380  tmp3 = MULTIPLY(d1, FIX_1_501321110);
381  z1 = MULTIPLY(-z1, FIX_0_899976223);
382  z2 = MULTIPLY(-d5, FIX_2_562915447);
383  z3 = MULTIPLY(-d7, FIX_1_961570560);
384  z4 = MULTIPLY(-z4, FIX_0_390180644);
385 
386  z3 += z5;
387  z4 += z5;
388 
389  tmp0 += z1 + z3;
390  tmp1 += z2 + z4;
391  tmp2 = z2 + z3;
392  tmp3 += z1 + z4;
393  } else {
394  /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
395  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
396  z1 = MULTIPLY(-d7, FIX_0_899976223);
397  z3 = MULTIPLY(-d7, FIX_1_961570560);
398  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
399  z2 = MULTIPLY(-d5, FIX_2_562915447);
400  z4 = MULTIPLY(-d5, FIX_0_390180644);
401  z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
402 
403  z3 += z5;
404  z4 += z5;
405 
406  tmp0 += z3;
407  tmp1 += z4;
408  tmp2 = z2 + z3;
409  tmp3 = z1 + z4;
410  }
411  }
412  } else {
413  if (d3) {
414  if (d1) {
415  /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
416  z1 = d7 + d1;
417  z3 = d7 + d3;
418  z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
419 
420  tmp0 = MULTIPLY(d7, FIX_0_298631336);
421  tmp2 = MULTIPLY(d3, FIX_3_072711026);
422  tmp3 = MULTIPLY(d1, FIX_1_501321110);
423  z1 = MULTIPLY(-z1, FIX_0_899976223);
424  z2 = MULTIPLY(-d3, FIX_2_562915447);
425  z3 = MULTIPLY(-z3, FIX_1_961570560);
426  z4 = MULTIPLY(-d1, FIX_0_390180644);
427 
428  z3 += z5;
429  z4 += z5;
430 
431  tmp0 += z1 + z3;
432  tmp1 = z2 + z4;
433  tmp2 += z2 + z3;
434  tmp3 += z1 + z4;
435  } else {
436  /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
437  z3 = d7 + d3;
438 
439  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
440  z1 = MULTIPLY(-d7, FIX_0_899976223);
441  tmp2 = MULTIPLY(d3, FIX_0_509795579);
442  z2 = MULTIPLY(-d3, FIX_2_562915447);
443  z5 = MULTIPLY(z3, FIX_1_175875602);
444  z3 = MULTIPLY(-z3, FIX_0_785694958);
445 
446  tmp0 += z3;
447  tmp1 = z2 + z5;
448  tmp2 += z3;
449  tmp3 = z1 + z5;
450  }
451  } else {
452  if (d1) {
453  /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
454  z1 = d7 + d1;
455  z5 = MULTIPLY(z1, FIX_1_175875602);
456 
457  z1 = MULTIPLY(z1, FIX_0_275899380);
458  z3 = MULTIPLY(-d7, FIX_1_961570560);
459  tmp0 = MULTIPLY(-d7, FIX_1_662939225);
460  z4 = MULTIPLY(-d1, FIX_0_390180644);
461  tmp3 = MULTIPLY(d1, FIX_1_111140466);
462 
463  tmp0 += z1;
464  tmp1 = z4 + z5;
465  tmp2 = z3 + z5;
466  tmp3 += z1;
467  } else {
468  /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
469  tmp0 = MULTIPLY(-d7, FIX_1_387039845);
470  tmp1 = MULTIPLY(d7, FIX_1_175875602);
471  tmp2 = MULTIPLY(-d7, FIX_0_785694958);
472  tmp3 = MULTIPLY(d7, FIX_0_275899380);
473  }
474  }
475  }
476  } else {
477  if (d5) {
478  if (d3) {
479  if (d1) {
480  /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
481  z2 = d5 + d3;
482  z4 = d5 + d1;
483  z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
484 
485  tmp1 = MULTIPLY(d5, FIX_2_053119869);
486  tmp2 = MULTIPLY(d3, FIX_3_072711026);
487  tmp3 = MULTIPLY(d1, FIX_1_501321110);
488  z1 = MULTIPLY(-d1, FIX_0_899976223);
489  z2 = MULTIPLY(-z2, FIX_2_562915447);
490  z3 = MULTIPLY(-d3, FIX_1_961570560);
491  z4 = MULTIPLY(-z4, FIX_0_390180644);
492 
493  z3 += z5;
494  z4 += z5;
495 
496  tmp0 = z1 + z3;
497  tmp1 += z2 + z4;
498  tmp2 += z2 + z3;
499  tmp3 += z1 + z4;
500  } else {
501  /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
502  z2 = d5 + d3;
503 
504  z5 = MULTIPLY(z2, FIX_1_175875602);
505  tmp1 = MULTIPLY(d5, FIX_1_662939225);
506  z4 = MULTIPLY(-d5, FIX_0_390180644);
507  z2 = MULTIPLY(-z2, FIX_1_387039845);
508  tmp2 = MULTIPLY(d3, FIX_1_111140466);
509  z3 = MULTIPLY(-d3, FIX_1_961570560);
510 
511  tmp0 = z3 + z5;
512  tmp1 += z2;
513  tmp2 += z2;
514  tmp3 = z4 + z5;
515  }
516  } else {
517  if (d1) {
518  /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
519  z4 = d5 + d1;
520 
521  z5 = MULTIPLY(z4, FIX_1_175875602);
522  z1 = MULTIPLY(-d1, FIX_0_899976223);
523  tmp3 = MULTIPLY(d1, FIX_0_601344887);
524  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
525  z2 = MULTIPLY(-d5, FIX_2_562915447);
526  z4 = MULTIPLY(z4, FIX_0_785694958);
527 
528  tmp0 = z1 + z5;
529  tmp1 += z4;
530  tmp2 = z2 + z5;
531  tmp3 += z4;
532  } else {
533  /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
534  tmp0 = MULTIPLY(d5, FIX_1_175875602);
535  tmp1 = MULTIPLY(d5, FIX_0_275899380);
536  tmp2 = MULTIPLY(-d5, FIX_1_387039845);
537  tmp3 = MULTIPLY(d5, FIX_0_785694958);
538  }
539  }
540  } else {
541  if (d3) {
542  if (d1) {
543  /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
544  z5 = d1 + d3;
545  tmp3 = MULTIPLY(d1, FIX_0_211164243);
546  tmp2 = MULTIPLY(-d3, FIX_1_451774981);
547  z1 = MULTIPLY(d1, FIX_1_061594337);
548  z2 = MULTIPLY(-d3, FIX_2_172734803);
549  z4 = MULTIPLY(z5, FIX_0_785694958);
550  z5 = MULTIPLY(z5, FIX_1_175875602);
551 
552  tmp0 = z1 - z4;
553  tmp1 = z2 + z4;
554  tmp2 += z5;
555  tmp3 += z5;
556  } else {
557  /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
558  tmp0 = MULTIPLY(-d3, FIX_0_785694958);
559  tmp1 = MULTIPLY(-d3, FIX_1_387039845);
560  tmp2 = MULTIPLY(-d3, FIX_0_275899380);
561  tmp3 = MULTIPLY(d3, FIX_1_175875602);
562  }
563  } else {
564  if (d1) {
565  /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
566  tmp0 = MULTIPLY(d1, FIX_0_275899380);
567  tmp1 = MULTIPLY(d1, FIX_0_785694958);
568  tmp2 = MULTIPLY(d1, FIX_1_175875602);
569  tmp3 = MULTIPLY(d1, FIX_1_387039845);
570  } else {
571  /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
572  tmp0 = tmp1 = tmp2 = tmp3 = 0;
573  }
574  }
575  }
576  }
577 }
578  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
579 
580  dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
581  dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
582  dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
583  dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
584  dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
585  dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
586  dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
587  dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
588 
589  dataptr += DCTSIZE; /* advance pointer to next row */
590  }
591 
592  /* Pass 2: process columns. */
593  /* Note that we must descale the results by a factor of 8 == 2**3, */
594  /* and also undo the PASS1_BITS scaling. */
595 
596  dataptr = data;
597  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
598  /* Columns of zeroes can be exploited in the same way as we did with rows.
599  * However, the row calculation has created many nonzero AC terms, so the
600  * simplification applies less often (typically 5% to 10% of the time).
601  * On machines with very fast multiplication, it's possible that the
602  * test takes more time than it's worth. In that case this section
603  * may be commented out.
604  */
605 
606  d0 = dataptr[DCTSIZE*0];
607  d1 = dataptr[DCTSIZE*1];
608  d2 = dataptr[DCTSIZE*2];
609  d3 = dataptr[DCTSIZE*3];
610  d4 = dataptr[DCTSIZE*4];
611  d5 = dataptr[DCTSIZE*5];
612  d6 = dataptr[DCTSIZE*6];
613  d7 = dataptr[DCTSIZE*7];
614 
615  /* Even part: reverse the even part of the forward DCT. */
616  /* The rotator is sqrt(2)*c(-6). */
617  if (d6) {
618  if (d2) {
619  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
620  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
621  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
622  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
623 
624  tmp0 = (d0 + d4) * CONST_SCALE;
625  tmp1 = (d0 - d4) * CONST_SCALE;
626 
627  tmp10 = tmp0 + tmp3;
628  tmp13 = tmp0 - tmp3;
629  tmp11 = tmp1 + tmp2;
630  tmp12 = tmp1 - tmp2;
631  } else {
632  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
633  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
634  tmp3 = MULTIPLY(d6, FIX_0_541196100);
635 
636  tmp0 = (d0 + d4) * CONST_SCALE;
637  tmp1 = (d0 - d4) * CONST_SCALE;
638 
639  tmp10 = tmp0 + tmp3;
640  tmp13 = tmp0 - tmp3;
641  tmp11 = tmp1 + tmp2;
642  tmp12 = tmp1 - tmp2;
643  }
644  } else {
645  if (d2) {
646  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
647  tmp2 = MULTIPLY(d2, FIX_0_541196100);
648  tmp3 = MULTIPLY(d2, FIX_1_306562965);
649 
650  tmp0 = (d0 + d4) * CONST_SCALE;
651  tmp1 = (d0 - d4) * CONST_SCALE;
652 
653  tmp10 = tmp0 + tmp3;
654  tmp13 = tmp0 - tmp3;
655  tmp11 = tmp1 + tmp2;
656  tmp12 = tmp1 - tmp2;
657  } else {
658  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
659  tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
660  tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
661  }
662  }
663 
664  /* Odd part per figure 8; the matrix is unitary and hence its
665  * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
666  */
667  if (d7) {
668  if (d5) {
669  if (d3) {
670  if (d1) {
671  /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
672  z1 = d7 + d1;
673  z2 = d5 + d3;
674  z3 = d7 + d3;
675  z4 = d5 + d1;
676  z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
677 
678  tmp0 = MULTIPLY(d7, FIX_0_298631336);
679  tmp1 = MULTIPLY(d5, FIX_2_053119869);
680  tmp2 = MULTIPLY(d3, FIX_3_072711026);
681  tmp3 = MULTIPLY(d1, FIX_1_501321110);
682  z1 = MULTIPLY(-z1, FIX_0_899976223);
683  z2 = MULTIPLY(-z2, FIX_2_562915447);
684  z3 = MULTIPLY(-z3, FIX_1_961570560);
685  z4 = MULTIPLY(-z4, FIX_0_390180644);
686 
687  z3 += z5;
688  z4 += z5;
689 
690  tmp0 += z1 + z3;
691  tmp1 += z2 + z4;
692  tmp2 += z2 + z3;
693  tmp3 += z1 + z4;
694  } else {
695  /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
696  z2 = d5 + d3;
697  z3 = d7 + d3;
698  z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
699 
700  tmp0 = MULTIPLY(d7, FIX_0_298631336);
701  tmp1 = MULTIPLY(d5, FIX_2_053119869);
702  tmp2 = MULTIPLY(d3, FIX_3_072711026);
703  z1 = MULTIPLY(-d7, FIX_0_899976223);
704  z2 = MULTIPLY(-z2, FIX_2_562915447);
705  z3 = MULTIPLY(-z3, FIX_1_961570560);
706  z4 = MULTIPLY(-d5, FIX_0_390180644);
707 
708  z3 += z5;
709  z4 += z5;
710 
711  tmp0 += z1 + z3;
712  tmp1 += z2 + z4;
713  tmp2 += z2 + z3;
714  tmp3 = z1 + z4;
715  }
716  } else {
717  if (d1) {
718  /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
719  z1 = d7 + d1;
720  z3 = d7;
721  z4 = d5 + d1;
722  z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
723 
724  tmp0 = MULTIPLY(d7, FIX_0_298631336);
725  tmp1 = MULTIPLY(d5, FIX_2_053119869);
726  tmp3 = MULTIPLY(d1, FIX_1_501321110);
727  z1 = MULTIPLY(-z1, FIX_0_899976223);
728  z2 = MULTIPLY(-d5, FIX_2_562915447);
729  z3 = MULTIPLY(-d7, FIX_1_961570560);
730  z4 = MULTIPLY(-z4, FIX_0_390180644);
731 
732  z3 += z5;
733  z4 += z5;
734 
735  tmp0 += z1 + z3;
736  tmp1 += z2 + z4;
737  tmp2 = z2 + z3;
738  tmp3 += z1 + z4;
739  } else {
740  /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
741  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
742  z1 = MULTIPLY(-d7, FIX_0_899976223);
743  z3 = MULTIPLY(-d7, FIX_1_961570560);
744  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
745  z2 = MULTIPLY(-d5, FIX_2_562915447);
746  z4 = MULTIPLY(-d5, FIX_0_390180644);
747  z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
748 
749  z3 += z5;
750  z4 += z5;
751 
752  tmp0 += z3;
753  tmp1 += z4;
754  tmp2 = z2 + z3;
755  tmp3 = z1 + z4;
756  }
757  }
758  } else {
759  if (d3) {
760  if (d1) {
761  /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
762  z1 = d7 + d1;
763  z3 = d7 + d3;
764  z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
765 
766  tmp0 = MULTIPLY(d7, FIX_0_298631336);
767  tmp2 = MULTIPLY(d3, FIX_3_072711026);
768  tmp3 = MULTIPLY(d1, FIX_1_501321110);
769  z1 = MULTIPLY(-z1, FIX_0_899976223);
770  z2 = MULTIPLY(-d3, FIX_2_562915447);
771  z3 = MULTIPLY(-z3, FIX_1_961570560);
772  z4 = MULTIPLY(-d1, FIX_0_390180644);
773 
774  z3 += z5;
775  z4 += z5;
776 
777  tmp0 += z1 + z3;
778  tmp1 = z2 + z4;
779  tmp2 += z2 + z3;
780  tmp3 += z1 + z4;
781  } else {
782  /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
783  z3 = d7 + d3;
784 
785  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
786  z1 = MULTIPLY(-d7, FIX_0_899976223);
787  tmp2 = MULTIPLY(d3, FIX_0_509795579);
788  z2 = MULTIPLY(-d3, FIX_2_562915447);
789  z5 = MULTIPLY(z3, FIX_1_175875602);
790  z3 = MULTIPLY(-z3, FIX_0_785694958);
791 
792  tmp0 += z3;
793  tmp1 = z2 + z5;
794  tmp2 += z3;
795  tmp3 = z1 + z5;
796  }
797  } else {
798  if (d1) {
799  /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
800  z1 = d7 + d1;
801  z5 = MULTIPLY(z1, FIX_1_175875602);
802 
803  z1 = MULTIPLY(z1, FIX_0_275899380);
804  z3 = MULTIPLY(-d7, FIX_1_961570560);
805  tmp0 = MULTIPLY(-d7, FIX_1_662939225);
806  z4 = MULTIPLY(-d1, FIX_0_390180644);
807  tmp3 = MULTIPLY(d1, FIX_1_111140466);
808 
809  tmp0 += z1;
810  tmp1 = z4 + z5;
811  tmp2 = z3 + z5;
812  tmp3 += z1;
813  } else {
814  /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
815  tmp0 = MULTIPLY(-d7, FIX_1_387039845);
816  tmp1 = MULTIPLY(d7, FIX_1_175875602);
817  tmp2 = MULTIPLY(-d7, FIX_0_785694958);
818  tmp3 = MULTIPLY(d7, FIX_0_275899380);
819  }
820  }
821  }
822  } else {
823  if (d5) {
824  if (d3) {
825  if (d1) {
826  /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
827  z2 = d5 + d3;
828  z4 = d5 + d1;
829  z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
830 
831  tmp1 = MULTIPLY(d5, FIX_2_053119869);
832  tmp2 = MULTIPLY(d3, FIX_3_072711026);
833  tmp3 = MULTIPLY(d1, FIX_1_501321110);
834  z1 = MULTIPLY(-d1, FIX_0_899976223);
835  z2 = MULTIPLY(-z2, FIX_2_562915447);
836  z3 = MULTIPLY(-d3, FIX_1_961570560);
837  z4 = MULTIPLY(-z4, FIX_0_390180644);
838 
839  z3 += z5;
840  z4 += z5;
841 
842  tmp0 = z1 + z3;
843  tmp1 += z2 + z4;
844  tmp2 += z2 + z3;
845  tmp3 += z1 + z4;
846  } else {
847  /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
848  z2 = d5 + d3;
849 
850  z5 = MULTIPLY(z2, FIX_1_175875602);
851  tmp1 = MULTIPLY(d5, FIX_1_662939225);
852  z4 = MULTIPLY(-d5, FIX_0_390180644);
853  z2 = MULTIPLY(-z2, FIX_1_387039845);
854  tmp2 = MULTIPLY(d3, FIX_1_111140466);
855  z3 = MULTIPLY(-d3, FIX_1_961570560);
856 
857  tmp0 = z3 + z5;
858  tmp1 += z2;
859  tmp2 += z2;
860  tmp3 = z4 + z5;
861  }
862  } else {
863  if (d1) {
864  /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
865  z4 = d5 + d1;
866 
867  z5 = MULTIPLY(z4, FIX_1_175875602);
868  z1 = MULTIPLY(-d1, FIX_0_899976223);
869  tmp3 = MULTIPLY(d1, FIX_0_601344887);
870  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
871  z2 = MULTIPLY(-d5, FIX_2_562915447);
872  z4 = MULTIPLY(z4, FIX_0_785694958);
873 
874  tmp0 = z1 + z5;
875  tmp1 += z4;
876  tmp2 = z2 + z5;
877  tmp3 += z4;
878  } else {
879  /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
880  tmp0 = MULTIPLY(d5, FIX_1_175875602);
881  tmp1 = MULTIPLY(d5, FIX_0_275899380);
882  tmp2 = MULTIPLY(-d5, FIX_1_387039845);
883  tmp3 = MULTIPLY(d5, FIX_0_785694958);
884  }
885  }
886  } else {
887  if (d3) {
888  if (d1) {
889  /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
890  z5 = d1 + d3;
891  tmp3 = MULTIPLY(d1, FIX_0_211164243);
892  tmp2 = MULTIPLY(-d3, FIX_1_451774981);
893  z1 = MULTIPLY(d1, FIX_1_061594337);
894  z2 = MULTIPLY(-d3, FIX_2_172734803);
895  z4 = MULTIPLY(z5, FIX_0_785694958);
896  z5 = MULTIPLY(z5, FIX_1_175875602);
897 
898  tmp0 = z1 - z4;
899  tmp1 = z2 + z4;
900  tmp2 += z5;
901  tmp3 += z5;
902  } else {
903  /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
904  tmp0 = MULTIPLY(-d3, FIX_0_785694958);
905  tmp1 = MULTIPLY(-d3, FIX_1_387039845);
906  tmp2 = MULTIPLY(-d3, FIX_0_275899380);
907  tmp3 = MULTIPLY(d3, FIX_1_175875602);
908  }
909  } else {
910  if (d1) {
911  /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
912  tmp0 = MULTIPLY(d1, FIX_0_275899380);
913  tmp1 = MULTIPLY(d1, FIX_0_785694958);
914  tmp2 = MULTIPLY(d1, FIX_1_175875602);
915  tmp3 = MULTIPLY(d1, FIX_1_387039845);
916  } else {
917  /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
918  tmp0 = tmp1 = tmp2 = tmp3 = 0;
919  }
920  }
921  }
922  }
923 
924  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
925 
926  dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3,
928  dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3,
930  dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2,
932  dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2,
934  dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1,
936  dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1,
938  dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0,
940  dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0,
942 
943  dataptr++; /* advance pointer to next column */
944  }
945 }
946 
947 #undef DCTSIZE
948 #define DCTSIZE 4
949 #define DCTSTRIDE 8
950 
952 {
953  int32_t tmp0, tmp1, tmp2, tmp3;
954  int32_t tmp10, tmp11, tmp12, tmp13;
955  int32_t z1;
956  int32_t d0, d2, d4, d6;
957  register int16_t *dataptr;
958  int rowctr;
959 
960  /* Pass 1: process rows. */
961  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
962  /* furthermore, we scale the results by 2**PASS1_BITS. */
963 
964  data[0] += 4;
965 
966  dataptr = data;
967 
968  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
969  /* Due to quantization, we will usually find that many of the input
970  * coefficients are zero, especially the AC terms. We can exploit this
971  * by short-circuiting the IDCT calculation for any row in which all
972  * the AC terms are zero. In that case each output is equal to the
973  * DC coefficient (with scale factor as needed).
974  * With typical images and quantization tables, half or more of the
975  * row DCT calculations can be simplified this way.
976  */
977 
978  register uint8_t *idataptr = (uint8_t*)dataptr;
979 
980  d0 = dataptr[0];
981  d2 = dataptr[1];
982  d4 = dataptr[2];
983  d6 = dataptr[3];
984 
985  if ((d2 | d4 | d6) == 0) {
986  /* AC terms all zero */
987  if (d0) {
988  /* Compute a 32 bit value to assign. */
989  int16_t dcval = (int16_t) (d0 << PASS1_BITS);
990  register int v = (dcval & 0xffff) | ((dcval << 16) & 0xffff0000);
991 
992  AV_WN32A(&idataptr[0], v);
993  AV_WN32A(&idataptr[4], v);
994  }
995 
996  dataptr += DCTSTRIDE; /* advance pointer to next row */
997  continue;
998  }
999 
1000  /* Even part: reverse the even part of the forward DCT. */
1001  /* The rotator is sqrt(2)*c(-6). */
1002  if (d6) {
1003  if (d2) {
1004  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1005  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1006  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1007  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1008 
1009  tmp0 = (d0 + d4) << CONST_BITS;
1010  tmp1 = (d0 - d4) << CONST_BITS;
1011 
1012  tmp10 = tmp0 + tmp3;
1013  tmp13 = tmp0 - tmp3;
1014  tmp11 = tmp1 + tmp2;
1015  tmp12 = tmp1 - tmp2;
1016  } else {
1017  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1018  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1019  tmp3 = MULTIPLY(d6, FIX_0_541196100);
1020 
1021  tmp0 = (d0 + d4) << CONST_BITS;
1022  tmp1 = (d0 - d4) << CONST_BITS;
1023 
1024  tmp10 = tmp0 + tmp3;
1025  tmp13 = tmp0 - tmp3;
1026  tmp11 = tmp1 + tmp2;
1027  tmp12 = tmp1 - tmp2;
1028  }
1029  } else {
1030  if (d2) {
1031  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1032  tmp2 = MULTIPLY(d2, FIX_0_541196100);
1033  tmp3 = MULTIPLY(d2, FIX_1_306562965);
1034 
1035  tmp0 = (d0 + d4) << CONST_BITS;
1036  tmp1 = (d0 - d4) << CONST_BITS;
1037 
1038  tmp10 = tmp0 + tmp3;
1039  tmp13 = tmp0 - tmp3;
1040  tmp11 = tmp1 + tmp2;
1041  tmp12 = tmp1 - tmp2;
1042  } else {
1043  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1044  tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1045  tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1046  }
1047  }
1048 
1049  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1050 
1051  dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1052  dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1053  dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1054  dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1055 
1056  dataptr += DCTSTRIDE; /* advance pointer to next row */
1057  }
1058 
1059  /* Pass 2: process columns. */
1060  /* Note that we must descale the results by a factor of 8 == 2**3, */
1061  /* and also undo the PASS1_BITS scaling. */
1062 
1063  dataptr = data;
1064  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1065  /* Columns of zeroes can be exploited in the same way as we did with rows.
1066  * However, the row calculation has created many nonzero AC terms, so the
1067  * simplification applies less often (typically 5% to 10% of the time).
1068  * On machines with very fast multiplication, it's possible that the
1069  * test takes more time than it's worth. In that case this section
1070  * may be commented out.
1071  */
1072 
1073  d0 = dataptr[DCTSTRIDE*0];
1074  d2 = dataptr[DCTSTRIDE*1];
1075  d4 = dataptr[DCTSTRIDE*2];
1076  d6 = dataptr[DCTSTRIDE*3];
1077 
1078  /* Even part: reverse the even part of the forward DCT. */
1079  /* The rotator is sqrt(2)*c(-6). */
1080  if (d6) {
1081  if (d2) {
1082  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1083  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1084  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1085  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1086 
1087  tmp0 = (d0 + d4) << CONST_BITS;
1088  tmp1 = (d0 - d4) << CONST_BITS;
1089 
1090  tmp10 = tmp0 + tmp3;
1091  tmp13 = tmp0 - tmp3;
1092  tmp11 = tmp1 + tmp2;
1093  tmp12 = tmp1 - tmp2;
1094  } else {
1095  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1096  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1097  tmp3 = MULTIPLY(d6, FIX_0_541196100);
1098 
1099  tmp0 = (d0 + d4) << CONST_BITS;
1100  tmp1 = (d0 - d4) << CONST_BITS;
1101 
1102  tmp10 = tmp0 + tmp3;
1103  tmp13 = tmp0 - tmp3;
1104  tmp11 = tmp1 + tmp2;
1105  tmp12 = tmp1 - tmp2;
1106  }
1107  } else {
1108  if (d2) {
1109  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1110  tmp2 = MULTIPLY(d2, FIX_0_541196100);
1111  tmp3 = MULTIPLY(d2, FIX_1_306562965);
1112 
1113  tmp0 = (d0 + d4) << CONST_BITS;
1114  tmp1 = (d0 - d4) << CONST_BITS;
1115 
1116  tmp10 = tmp0 + tmp3;
1117  tmp13 = tmp0 - tmp3;
1118  tmp11 = tmp1 + tmp2;
1119  tmp12 = tmp1 - tmp2;
1120  } else {
1121  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1122  tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
1123  tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
1124  }
1125  }
1126 
1127  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1128 
1129  dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
1130  dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
1131  dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
1132  dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
1133 
1134  dataptr++; /* advance pointer to next column */
1135  }
1136 }
1137 
1139  int d00, d01, d10, d11;
1140 
1141  data[0] += 4;
1142  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
1143  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
1144  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
1145  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
1146 
1147  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
1148  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
1149  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
1150  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
1151 }
1152 
1154  data[0] = (data[0] + 4)>>3;
1155 }
1156 
/* The wrappers that follow do not use FIX or CONST_BITS, so both macro
 * definitions are removed at this point. */
1157 #undef FIX
1158 #undef CONST_BITS
1159 
/**
 * Inverse-transform @p block in place with ff_j_rev_dct(), then pass the
 * result to ff_put_pixels_clamped_c() together with @p dest and @p line_size.
 *
 * @param dest      destination pixel buffer handed to the put routine
 * @param line_size stride argument forwarded unchanged to the put routine
 *                  (presumably the byte distance between rows of dest;
 *                  confirm against ff_put_pixels_clamped_c())
 * @param block     coefficient block; overwritten by the inverse DCT
 */
void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
1165 
/**
 * Inverse-transform @p block in place with ff_j_rev_dct(), then pass the
 * result to ff_add_pixels_clamped_c() together with @p dest and @p line_size.
 *
 * @param dest      pixel buffer handed to the add routine
 * @param line_size stride argument forwarded unchanged to the add routine
 *                  (presumably the byte distance between rows of dest;
 *                  confirm against ff_add_pixels_clamped_c())
 * @param block     coefficient block; overwritten by the inverse DCT
 */
void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    ff_j_rev_dct(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
void ff_j_rev_dct1(DCTBLOCK data)
Definition: jrevdct.c:1153
#define FIX_0_899976223
Definition: jrevdct.c:193
ptrdiff_t const GLvoid * data
Definition: opengl_enc.c:101
#define DCTSIZE
Definition: jrevdct.c:948
#define FIX_1_501321110
Definition: jrevdct.c:200
#define FIX_1_847759065
Definition: jrevdct.c:202
void ff_j_rev_dct4(DCTBLOCK data)
Definition: jrevdct.c:951
#define DESCALE(x, n)
Definition: jrevdct.c:153
#define CONST_SCALE
Definition: jrevdct.c:137
#define AV_WN32A(p, v)
Definition: intreadwrite.h:538
#define FIX_0_541196100
Definition: jrevdct.c:189
#define FIX_0_765366865
Definition: jrevdct.c:191
static int16_t block[64]
Definition: dct.c:115
uint8_t
#define FIX_2_562915447
Definition: jrevdct.c:206
#define FIX_2_172734803
Definition: jrevdct.c:205
int16_t DCTBLOCK[DCTSIZE2]
Definition: jrevdct.c:80
void ff_add_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size)
Definition: idctdsp.c:157
#define FIX_1_451774981
Definition: jrevdct.c:199
#define DCTSIZE2
Definition: jrevdct.c:74
#define CONST_BITS
Definition: jrevdct.c:82
#define FIX_1_662939225
Definition: jrevdct.c:201
void ff_j_rev_dct2(DCTBLOCK data)
Definition: jrevdct.c:1138
int32_t
#define FIX_0_390180644
Definition: jrevdct.c:187
void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *av_restrict pixels, ptrdiff_t line_size)
Definition: idctdsp.c:83
#define FIX_0_298631336
Definition: jrevdct.c:186
#define FIX_0_509795579
Definition: jrevdct.c:188
#define FIX_1_175875602
Definition: jrevdct.c:196
#define FIX_1_306562965
Definition: jrevdct.c:197
#define PASS1_BITS
Definition: jrevdct.c:130
void ff_j_rev_dct(DCTBLOCK data)
Definition: jrevdct.c:213
#define FIX_0_211164243
Definition: jrevdct.c:184
#define FIX_1_387039845
Definition: jrevdct.c:198
void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: jrevdct.c:1160
#define FIX_2_053119869
Definition: jrevdct.c:204
#define FIX_0_275899380
Definition: jrevdct.c:185
#define FIX_0_785694958
Definition: jrevdct.c:192
#define FIX_1_961570560
Definition: jrevdct.c:203
common internal and external API header
#define FIX_3_072711026
Definition: jrevdct.c:207
#define DCTSTRIDE
Definition: jrevdct.c:949
#define FIX_1_111140466
Definition: jrevdct.c:195
void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: jrevdct.c:1166
#define MULTIPLY(var, const)
Definition: jrevdct.c:176
#define FIX_0_601344887
Definition: jrevdct.c:190
#define FIX_1_061594337
Definition: jrevdct.c:194