FFmpeg
jrevdct.c
Go to the documentation of this file.
1 /*
2  * This file is part of the Independent JPEG Group's software.
3  *
4  * The authors make NO WARRANTY or representation, either express or implied,
5  * with respect to this software, its quality, accuracy, merchantability, or
6  * fitness for a particular purpose. This software is provided "AS IS", and
7  * you, its user, assume the entire risk as to its quality and accuracy.
8  *
9  * This software is copyright (C) 1991, 1992, Thomas G. Lane.
10  * All Rights Reserved except as specified below.
11  *
12  * Permission is hereby granted to use, copy, modify, and distribute this
13  * software (or portions thereof) for any purpose, without fee, subject to
14  * these conditions:
15  * (1) If any part of the source code for this software is distributed, then
16  * this README file must be included, with this copyright and no-warranty
17  * notice unaltered; and any additions, deletions, or changes to the original
18  * files must be clearly indicated in accompanying documentation.
19  * (2) If only executable code is distributed, then the accompanying
20  * documentation must state that "this software is based in part on the work
21  * of the Independent JPEG Group".
22  * (3) Permission for use of this software is granted only if the user accepts
23  * full responsibility for any undesirable consequences; the authors accept
24  * NO LIABILITY for damages of any kind.
25  *
26  * These conditions apply to any software derived from or based on the IJG
27  * code, not just to the unmodified library. If you use our work, you ought
28  * to acknowledge us.
29  *
30  * Permission is NOT granted for the use of any IJG author's name or company
31  * name in advertising or publicity relating to this software or products
32  * derived from it. This software may be referred to only as "the Independent
33  * JPEG Group's software".
34  *
35  * We specifically permit and encourage the use of this software as the basis
36  * of commercial products, provided that all warranty or liability claims are
37  * assumed by the product vendor.
38  *
39  * This file contains the basic inverse-DCT transformation subroutine.
40  *
41  * This implementation is based on an algorithm described in
42  * C. Loeffler, A. Ligtenberg and G. Moschytz, "Practical Fast 1-D DCT
43  * Algorithms with 11 Multiplications", Proc. Int'l. Conf. on Acoustics,
44  * Speech, and Signal Processing 1989 (ICASSP '89), pp. 988-991.
45  * The primary algorithm described there uses 11 multiplies and 29 adds.
46  * We use their alternate method with 12 multiplies and 32 adds.
47  * The advantage of this method is that no data path contains more than one
48  * multiplication; this allows a very simple and accurate implementation in
49  * scaled fixed-point arithmetic, with a minimal number of shifts.
50  *
51  * I've made lots of modifications to attempt to take advantage of the
52  * sparse nature of the DCT matrices we're getting. Although the logic
53  * is cumbersome, it's straightforward and the resulting code is much
54  * faster.
55  *
56  * A better way to do this would be to pass in the DCT block as a sparse
57  * matrix, perhaps with the different cases encoded.
58  */
59 
60 /**
61  * @file
62  * Independent JPEG Group's LLM idct.
63  */
64 
65 #include <stddef.h>
66 #include <stdint.h>
67 
68 #include "libavutil/intreadwrite.h"
69 
70 #include "dct.h"
71 #include "idctdsp.h"
72 
/* Build for 8-bit samples: selects PASS1_BITS == 2 and permits the
 * 16x16->32 bit MULTIPLY variants further down in this file. */
#define EIGHT_BIT_SAMPLES

/* Edge length of a DCT block and the number of coefficients in it. */
#define DCTSIZE 8
#define DCTSIZE2 64

/* Expands to nothing; not referenced in the visible part of this file. */
#define GLOBAL

/* Plain right shift. DESCALE relies on this rounding towards minus
 * infinity for negative operands (i.e. an arithmetic shift). */
#define RIGHT_SHIFT(x, n) ((x) >> (n))

/* One block of DCTSIZE2 16-bit coefficients, transformed in place. */
typedef int16_t DCTBLOCK[DCTSIZE2];

/* Number of fractional bits kept in the fixed-point FIX_* constants. */
#define CONST_BITS 13

/*
 * This routine is specialized to the case DCTSIZE = 8.
 * The check is evaluated here, before DCTSIZE is redefined further down
 * for the 4x4 variant.
 */

#if DCTSIZE != 8
#error "Sorry, this code only copes with 8x8 DCTs."
#endif
93 
94 
95 /*
96  * A 2-D IDCT can be done by 1-D IDCT on each row followed by 1-D IDCT
97  * on each column. Direct algorithms are also available, but they are
98  * much more complex and seem not to be any faster when reduced to code.
99  *
100  * The rationale for the scaling is as follows:
101  *
102  * Each 1-D IDCT step produces outputs which are a factor of sqrt(N)
103  * larger than the true IDCT outputs. The final outputs are therefore
104  * a factor of N larger than desired; since N=8 this can be cured by
105  * a simple right shift at the end of the algorithm. The advantage of
106  * this arrangement is that we save two multiplications per 1-D IDCT,
107  * because the y0 and y4 inputs need not be divided by sqrt(N).
108  *
109  * We have to do addition and subtraction of the integer inputs, which
110  * is no problem, and multiplication by fractional constants, which is
111  * a problem to do in integer arithmetic. We multiply all the constants
112  * by CONST_SCALE and convert them to integer constants (thus retaining
113  * CONST_BITS bits of precision in the constants). After doing a
114  * multiplication we have to divide the product by CONST_SCALE, with proper
115  * rounding, to produce the correct output. This division can be done
116  * cheaply as a right shift of CONST_BITS bits. We postpone shifting
117  * as long as possible so that partial sums can be added together with
118  * full fractional precision.
119  *
120  * The outputs of the first pass are scaled up by PASS1_BITS bits so that
121  * they are represented to better-than-integral precision. These outputs
122  * require BITS_IN_JSAMPLE + PASS1_BITS + 3 bits; this fits in a 16-bit word
123  * with the recommended scaling. (To scale up 12-bit sample data further, an
124  * intermediate int32 array would be needed.)
125  *
126  * To avoid overflow of the 32-bit intermediate results in pass 2, we must
127  * have BITS_IN_JSAMPLE + CONST_BITS + PASS1_BITS <= 26. Error analysis
128  * shows that the values given below are the most effective.
129  */
130 
131 #ifdef EIGHT_BIT_SAMPLES
132 #define PASS1_BITS 2
133 #else
134 #define PASS1_BITS 1 /* lose a little precision to avoid overflow */
135 #endif
136 
137 #define ONE ((int32_t) 1)
138 
139 #define CONST_SCALE (ONE << CONST_BITS)
140 
141 /* Convert a positive real constant to an integer scaled by CONST_SCALE.
142  * IMPORTANT: if your compiler doesn't do this arithmetic at compile time,
143  * you will pay a significant penalty in run time. In that case, figure
144  * the correct integer constant values and insert them by hand.
145  */
146 
147 /* FIX is no longer used: all the constants are precomputed (see FIX_* below). */
148 #define FIX(x) ((int32_t) ((x) * CONST_SCALE + 0.5))
149 
150 /* Descale and correctly round an int32_t value that's scaled by N bits.
151  * We assume RIGHT_SHIFT rounds towards minus infinity, so adding
152  * the fudge factor is correct for either sign of X.
153  */
154 
155 #define DESCALE(x,n) RIGHT_SHIFT((x) + (ONE << ((n)-1)), n)
156 
157 /* Multiply an int32_t variable by an int32_t constant to yield an int32_t result.
158  * For 8-bit samples with the recommended scaling, all the variable
159  * and constant values involved are no more than 16 bits wide, so a
160  * 16x16->32 bit multiply can be used instead of a full 32x32 multiply;
161  * this provides a useful speedup on many machines.
162  * There is no way to specify a 16x16->32 multiply in portable C, but
163  * some C compilers will do the right thing if you provide the correct
164  * combination of casts.
165  * NB: for 12-bit samples, a full 32-bit multiplication will be needed.
166  */
167 
168 #ifdef EIGHT_BIT_SAMPLES
169 #ifdef SHORTxSHORT_32 /* may work if 'int' is 32 bits */
170 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int16_t) (const)))
171 #endif
172 #ifdef SHORTxLCONST_32 /* known to work with Microsoft C 6.0 */
173 #define MULTIPLY(var,const) (((int16_t) (var)) * ((int32_t) (const)))
174 #endif
175 #endif
176 
177 #ifndef MULTIPLY /* default definition */
178 #define MULTIPLY(var,const) ((var) * (const))
179 #endif
180 
181 
182 /*
183  Unlike our decoder, where we approximate the FIX constants, we need the exact
184 values here; otherwise successive P-frames drift too much with reference-frame coding.
185 */
186 #define FIX_0_211164243 1730
187 #define FIX_0_275899380 2260
188 #define FIX_0_298631336 2446
189 #define FIX_0_390180644 3196
190 #define FIX_0_509795579 4176
191 #define FIX_0_541196100 4433
192 #define FIX_0_601344887 4926
193 #define FIX_0_765366865 6270
194 #define FIX_0_785694958 6436
195 #define FIX_0_899976223 7373
196 #define FIX_1_061594337 8697
197 #define FIX_1_111140466 9102
198 #define FIX_1_175875602 9633
199 #define FIX_1_306562965 10703
200 #define FIX_1_387039845 11363
201 #define FIX_1_451774981 11893
202 #define FIX_1_501321110 12299
203 #define FIX_1_662939225 13623
204 #define FIX_1_847759065 15137
205 #define FIX_1_961570560 16069
206 #define FIX_2_053119869 16819
207 #define FIX_2_172734803 17799
208 #define FIX_2_562915447 20995
209 #define FIX_3_072711026 25172
210 
211 /*
212  * Perform the inverse DCT on one block of coefficients.
213  */
214 
216 {
217  int32_t tmp0, tmp1, tmp2, tmp3;
218  int32_t tmp10, tmp11, tmp12, tmp13;
219  int32_t z1, z2, z3, z4, z5;
220  int32_t d0, d1, d2, d3, d4, d5, d6, d7;
221  register int16_t *dataptr;
222  int rowctr;
223 
224  /* Pass 1: process rows. */
225  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
226  /* furthermore, we scale the results by 2**PASS1_BITS. */
227 
228  dataptr = data;
229 
230  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
231  /* Due to quantization, we will usually find that many of the input
232  * coefficients are zero, especially the AC terms. We can exploit this
233  * by short-circuiting the IDCT calculation for any row in which all
234  * the AC terms are zero. In that case each output is equal to the
235  * DC coefficient (with scale factor as needed).
236  * With typical images and quantization tables, half or more of the
237  * row DCT calculations can be simplified this way.
238  */
239 
240  register uint8_t *idataptr = (uint8_t*)dataptr;
241 
242  /* WARNING: we do the same permutation as MMX idct to simplify the
243  video core */
244  d0 = dataptr[0];
245  d2 = dataptr[1];
246  d4 = dataptr[2];
247  d6 = dataptr[3];
248  d1 = dataptr[4];
249  d3 = dataptr[5];
250  d5 = dataptr[6];
251  d7 = dataptr[7];
252 
253  if ((d1 | d2 | d3 | d4 | d5 | d6 | d7) == 0) {
254  /* AC terms all zero */
255  if (d0) {
256  /* Compute a 32 bit value to assign. */
257  int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
258  register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16);
259 
260  AV_WN32A(&idataptr[ 0], v);
261  AV_WN32A(&idataptr[ 4], v);
262  AV_WN32A(&idataptr[ 8], v);
263  AV_WN32A(&idataptr[12], v);
264  }
265 
266  dataptr += DCTSIZE; /* advance pointer to next row */
267  continue;
268  }
269 
270  /* Even part: reverse the even part of the forward DCT. */
271  /* The rotator is sqrt(2)*c(-6). */
272 {
273  if (d6) {
274  if (d2) {
275  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
276  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
277  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
278  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
279 
280  tmp0 = (d0 + d4) * CONST_SCALE;
281  tmp1 = (d0 - d4) * CONST_SCALE;
282 
283  tmp10 = tmp0 + tmp3;
284  tmp13 = tmp0 - tmp3;
285  tmp11 = tmp1 + tmp2;
286  tmp12 = tmp1 - tmp2;
287  } else {
288  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
289  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
290  tmp3 = MULTIPLY(d6, FIX_0_541196100);
291 
292  tmp0 = (d0 + d4) * CONST_SCALE;
293  tmp1 = (d0 - d4) * CONST_SCALE;
294 
295  tmp10 = tmp0 + tmp3;
296  tmp13 = tmp0 - tmp3;
297  tmp11 = tmp1 + tmp2;
298  tmp12 = tmp1 - tmp2;
299  }
300  } else {
301  if (d2) {
302  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
303  tmp2 = MULTIPLY(d2, FIX_0_541196100);
304  tmp3 = MULTIPLY(d2, FIX_1_306562965);
305 
306  tmp0 = (d0 + d4) * CONST_SCALE;
307  tmp1 = (d0 - d4) * CONST_SCALE;
308 
309  tmp10 = tmp0 + tmp3;
310  tmp13 = tmp0 - tmp3;
311  tmp11 = tmp1 + tmp2;
312  tmp12 = tmp1 - tmp2;
313  } else {
314  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
315  tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
316  tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
317  }
318  }
319 
320  /* Odd part per figure 8; the matrix is unitary and hence its
321  * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
322  */
323 
324  if (d7) {
325  if (d5) {
326  if (d3) {
327  if (d1) {
328  /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
329  z1 = d7 + d1;
330  z2 = d5 + d3;
331  z3 = d7 + d3;
332  z4 = d5 + d1;
333  z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
334 
335  tmp0 = MULTIPLY(d7, FIX_0_298631336);
336  tmp1 = MULTIPLY(d5, FIX_2_053119869);
337  tmp2 = MULTIPLY(d3, FIX_3_072711026);
338  tmp3 = MULTIPLY(d1, FIX_1_501321110);
339  z1 = MULTIPLY(-z1, FIX_0_899976223);
340  z2 = MULTIPLY(-z2, FIX_2_562915447);
341  z3 = MULTIPLY(-z3, FIX_1_961570560);
342  z4 = MULTIPLY(-z4, FIX_0_390180644);
343 
344  z3 += z5;
345  z4 += z5;
346 
347  tmp0 += z1 + z3;
348  tmp1 += z2 + z4;
349  tmp2 += z2 + z3;
350  tmp3 += z1 + z4;
351  } else {
352  /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
353  z2 = d5 + d3;
354  z3 = d7 + d3;
355  z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
356 
357  tmp0 = MULTIPLY(d7, FIX_0_298631336);
358  tmp1 = MULTIPLY(d5, FIX_2_053119869);
359  tmp2 = MULTIPLY(d3, FIX_3_072711026);
360  z1 = MULTIPLY(-d7, FIX_0_899976223);
361  z2 = MULTIPLY(-z2, FIX_2_562915447);
362  z3 = MULTIPLY(-z3, FIX_1_961570560);
363  z4 = MULTIPLY(-d5, FIX_0_390180644);
364 
365  z3 += z5;
366  z4 += z5;
367 
368  tmp0 += z1 + z3;
369  tmp1 += z2 + z4;
370  tmp2 += z2 + z3;
371  tmp3 = z1 + z4;
372  }
373  } else {
374  if (d1) {
375  /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
376  z1 = d7 + d1;
377  z4 = d5 + d1;
378  z5 = MULTIPLY(d7 + z4, FIX_1_175875602);
379 
380  tmp0 = MULTIPLY(d7, FIX_0_298631336);
381  tmp1 = MULTIPLY(d5, FIX_2_053119869);
382  tmp3 = MULTIPLY(d1, FIX_1_501321110);
383  z1 = MULTIPLY(-z1, FIX_0_899976223);
384  z2 = MULTIPLY(-d5, FIX_2_562915447);
385  z3 = MULTIPLY(-d7, FIX_1_961570560);
386  z4 = MULTIPLY(-z4, FIX_0_390180644);
387 
388  z3 += z5;
389  z4 += z5;
390 
391  tmp0 += z1 + z3;
392  tmp1 += z2 + z4;
393  tmp2 = z2 + z3;
394  tmp3 += z1 + z4;
395  } else {
396  /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
397  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
398  z1 = MULTIPLY(-d7, FIX_0_899976223);
399  z3 = MULTIPLY(-d7, FIX_1_961570560);
400  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
401  z2 = MULTIPLY(-d5, FIX_2_562915447);
402  z4 = MULTIPLY(-d5, FIX_0_390180644);
403  z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
404 
405  z3 += z5;
406  z4 += z5;
407 
408  tmp0 += z3;
409  tmp1 += z4;
410  tmp2 = z2 + z3;
411  tmp3 = z1 + z4;
412  }
413  }
414  } else {
415  if (d3) {
416  if (d1) {
417  /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
418  z1 = d7 + d1;
419  z3 = d7 + d3;
420  z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
421 
422  tmp0 = MULTIPLY(d7, FIX_0_298631336);
423  tmp2 = MULTIPLY(d3, FIX_3_072711026);
424  tmp3 = MULTIPLY(d1, FIX_1_501321110);
425  z1 = MULTIPLY(-z1, FIX_0_899976223);
426  z2 = MULTIPLY(-d3, FIX_2_562915447);
427  z3 = MULTIPLY(-z3, FIX_1_961570560);
428  z4 = MULTIPLY(-d1, FIX_0_390180644);
429 
430  z3 += z5;
431  z4 += z5;
432 
433  tmp0 += z1 + z3;
434  tmp1 = z2 + z4;
435  tmp2 += z2 + z3;
436  tmp3 += z1 + z4;
437  } else {
438  /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
439  z3 = d7 + d3;
440 
441  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
442  z1 = MULTIPLY(-d7, FIX_0_899976223);
443  tmp2 = MULTIPLY(d3, FIX_0_509795579);
444  z2 = MULTIPLY(-d3, FIX_2_562915447);
445  z5 = MULTIPLY(z3, FIX_1_175875602);
446  z3 = MULTIPLY(-z3, FIX_0_785694958);
447 
448  tmp0 += z3;
449  tmp1 = z2 + z5;
450  tmp2 += z3;
451  tmp3 = z1 + z5;
452  }
453  } else {
454  if (d1) {
455  /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
456  z1 = d7 + d1;
457  z5 = MULTIPLY(z1, FIX_1_175875602);
458 
459  z1 = MULTIPLY(z1, FIX_0_275899380);
460  z3 = MULTIPLY(-d7, FIX_1_961570560);
461  tmp0 = MULTIPLY(-d7, FIX_1_662939225);
462  z4 = MULTIPLY(-d1, FIX_0_390180644);
463  tmp3 = MULTIPLY(d1, FIX_1_111140466);
464 
465  tmp0 += z1;
466  tmp1 = z4 + z5;
467  tmp2 = z3 + z5;
468  tmp3 += z1;
469  } else {
470  /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
471  tmp0 = MULTIPLY(-d7, FIX_1_387039845);
472  tmp1 = MULTIPLY(d7, FIX_1_175875602);
473  tmp2 = MULTIPLY(-d7, FIX_0_785694958);
474  tmp3 = MULTIPLY(d7, FIX_0_275899380);
475  }
476  }
477  }
478  } else {
479  if (d5) {
480  if (d3) {
481  if (d1) {
482  /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
483  z2 = d5 + d3;
484  z4 = d5 + d1;
485  z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
486 
487  tmp1 = MULTIPLY(d5, FIX_2_053119869);
488  tmp2 = MULTIPLY(d3, FIX_3_072711026);
489  tmp3 = MULTIPLY(d1, FIX_1_501321110);
490  z1 = MULTIPLY(-d1, FIX_0_899976223);
491  z2 = MULTIPLY(-z2, FIX_2_562915447);
492  z3 = MULTIPLY(-d3, FIX_1_961570560);
493  z4 = MULTIPLY(-z4, FIX_0_390180644);
494 
495  z3 += z5;
496  z4 += z5;
497 
498  tmp0 = z1 + z3;
499  tmp1 += z2 + z4;
500  tmp2 += z2 + z3;
501  tmp3 += z1 + z4;
502  } else {
503  /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
504  z2 = d5 + d3;
505 
506  z5 = MULTIPLY(z2, FIX_1_175875602);
507  tmp1 = MULTIPLY(d5, FIX_1_662939225);
508  z4 = MULTIPLY(-d5, FIX_0_390180644);
509  z2 = MULTIPLY(-z2, FIX_1_387039845);
510  tmp2 = MULTIPLY(d3, FIX_1_111140466);
511  z3 = MULTIPLY(-d3, FIX_1_961570560);
512 
513  tmp0 = z3 + z5;
514  tmp1 += z2;
515  tmp2 += z2;
516  tmp3 = z4 + z5;
517  }
518  } else {
519  if (d1) {
520  /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
521  z4 = d5 + d1;
522 
523  z5 = MULTIPLY(z4, FIX_1_175875602);
524  z1 = MULTIPLY(-d1, FIX_0_899976223);
525  tmp3 = MULTIPLY(d1, FIX_0_601344887);
526  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
527  z2 = MULTIPLY(-d5, FIX_2_562915447);
528  z4 = MULTIPLY(z4, FIX_0_785694958);
529 
530  tmp0 = z1 + z5;
531  tmp1 += z4;
532  tmp2 = z2 + z5;
533  tmp3 += z4;
534  } else {
535  /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
536  tmp0 = MULTIPLY(d5, FIX_1_175875602);
537  tmp1 = MULTIPLY(d5, FIX_0_275899380);
538  tmp2 = MULTIPLY(-d5, FIX_1_387039845);
539  tmp3 = MULTIPLY(d5, FIX_0_785694958);
540  }
541  }
542  } else {
543  if (d3) {
544  if (d1) {
545  /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
546  z5 = d1 + d3;
547  tmp3 = MULTIPLY(d1, FIX_0_211164243);
548  tmp2 = MULTIPLY(-d3, FIX_1_451774981);
549  z1 = MULTIPLY(d1, FIX_1_061594337);
550  z2 = MULTIPLY(-d3, FIX_2_172734803);
551  z4 = MULTIPLY(z5, FIX_0_785694958);
552  z5 = MULTIPLY(z5, FIX_1_175875602);
553 
554  tmp0 = z1 - z4;
555  tmp1 = z2 + z4;
556  tmp2 += z5;
557  tmp3 += z5;
558  } else {
559  /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
560  tmp0 = MULTIPLY(-d3, FIX_0_785694958);
561  tmp1 = MULTIPLY(-d3, FIX_1_387039845);
562  tmp2 = MULTIPLY(-d3, FIX_0_275899380);
563  tmp3 = MULTIPLY(d3, FIX_1_175875602);
564  }
565  } else {
566  if (d1) {
567  /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
568  tmp0 = MULTIPLY(d1, FIX_0_275899380);
569  tmp1 = MULTIPLY(d1, FIX_0_785694958);
570  tmp2 = MULTIPLY(d1, FIX_1_175875602);
571  tmp3 = MULTIPLY(d1, FIX_1_387039845);
572  } else {
573  /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
574  tmp0 = tmp1 = tmp2 = tmp3 = 0;
575  }
576  }
577  }
578  }
579 }
580  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
581 
582  dataptr[0] = (int16_t) DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
583  dataptr[7] = (int16_t) DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
584  dataptr[1] = (int16_t) DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
585  dataptr[6] = (int16_t) DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
586  dataptr[2] = (int16_t) DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
587  dataptr[5] = (int16_t) DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
588  dataptr[3] = (int16_t) DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
589  dataptr[4] = (int16_t) DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);
590 
591  dataptr += DCTSIZE; /* advance pointer to next row */
592  }
593 
594  /* Pass 2: process columns. */
595  /* Note that we must descale the results by a factor of 8 == 2**3, */
596  /* and also undo the PASS1_BITS scaling. */
597 
598  dataptr = data;
599  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
600  /* Columns of zeroes can be exploited in the same way as we did with rows.
601  * However, the row calculation has created many nonzero AC terms, so the
602  * simplification applies less often (typically 5% to 10% of the time).
603  * On machines with very fast multiplication, it's possible that the
604  * test takes more time than it's worth. In that case this section
605  * may be commented out.
606  */
607 
608  d0 = dataptr[DCTSIZE*0];
609  d1 = dataptr[DCTSIZE*1];
610  d2 = dataptr[DCTSIZE*2];
611  d3 = dataptr[DCTSIZE*3];
612  d4 = dataptr[DCTSIZE*4];
613  d5 = dataptr[DCTSIZE*5];
614  d6 = dataptr[DCTSIZE*6];
615  d7 = dataptr[DCTSIZE*7];
616 
617  /* Even part: reverse the even part of the forward DCT. */
618  /* The rotator is sqrt(2)*c(-6). */
619  if (d6) {
620  if (d2) {
621  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
622  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
623  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
624  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
625 
626  tmp0 = (d0 + d4) * CONST_SCALE;
627  tmp1 = (d0 - d4) * CONST_SCALE;
628 
629  tmp10 = tmp0 + tmp3;
630  tmp13 = tmp0 - tmp3;
631  tmp11 = tmp1 + tmp2;
632  tmp12 = tmp1 - tmp2;
633  } else {
634  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
635  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
636  tmp3 = MULTIPLY(d6, FIX_0_541196100);
637 
638  tmp0 = (d0 + d4) * CONST_SCALE;
639  tmp1 = (d0 - d4) * CONST_SCALE;
640 
641  tmp10 = tmp0 + tmp3;
642  tmp13 = tmp0 - tmp3;
643  tmp11 = tmp1 + tmp2;
644  tmp12 = tmp1 - tmp2;
645  }
646  } else {
647  if (d2) {
648  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
649  tmp2 = MULTIPLY(d2, FIX_0_541196100);
650  tmp3 = MULTIPLY(d2, FIX_1_306562965);
651 
652  tmp0 = (d0 + d4) * CONST_SCALE;
653  tmp1 = (d0 - d4) * CONST_SCALE;
654 
655  tmp10 = tmp0 + tmp3;
656  tmp13 = tmp0 - tmp3;
657  tmp11 = tmp1 + tmp2;
658  tmp12 = tmp1 - tmp2;
659  } else {
660  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
661  tmp10 = tmp13 = (d0 + d4) * CONST_SCALE;
662  tmp11 = tmp12 = (d0 - d4) * CONST_SCALE;
663  }
664  }
665 
666  /* Odd part per figure 8; the matrix is unitary and hence its
667  * transpose is its inverse. i0..i3 are y7,y5,y3,y1 respectively.
668  */
669  if (d7) {
670  if (d5) {
671  if (d3) {
672  if (d1) {
673  /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
674  z1 = d7 + d1;
675  z2 = d5 + d3;
676  z3 = d7 + d3;
677  z4 = d5 + d1;
678  z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
679 
680  tmp0 = MULTIPLY(d7, FIX_0_298631336);
681  tmp1 = MULTIPLY(d5, FIX_2_053119869);
682  tmp2 = MULTIPLY(d3, FIX_3_072711026);
683  tmp3 = MULTIPLY(d1, FIX_1_501321110);
684  z1 = MULTIPLY(-z1, FIX_0_899976223);
685  z2 = MULTIPLY(-z2, FIX_2_562915447);
686  z3 = MULTIPLY(-z3, FIX_1_961570560);
687  z4 = MULTIPLY(-z4, FIX_0_390180644);
688 
689  z3 += z5;
690  z4 += z5;
691 
692  tmp0 += z1 + z3;
693  tmp1 += z2 + z4;
694  tmp2 += z2 + z3;
695  tmp3 += z1 + z4;
696  } else {
697  /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
698  z2 = d5 + d3;
699  z3 = d7 + d3;
700  z5 = MULTIPLY(z3 + d5, FIX_1_175875602);
701 
702  tmp0 = MULTIPLY(d7, FIX_0_298631336);
703  tmp1 = MULTIPLY(d5, FIX_2_053119869);
704  tmp2 = MULTIPLY(d3, FIX_3_072711026);
705  z1 = MULTIPLY(-d7, FIX_0_899976223);
706  z2 = MULTIPLY(-z2, FIX_2_562915447);
707  z3 = MULTIPLY(-z3, FIX_1_961570560);
708  z4 = MULTIPLY(-d5, FIX_0_390180644);
709 
710  z3 += z5;
711  z4 += z5;
712 
713  tmp0 += z1 + z3;
714  tmp1 += z2 + z4;
715  tmp2 += z2 + z3;
716  tmp3 = z1 + z4;
717  }
718  } else {
719  if (d1) {
720  /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
721  z1 = d7 + d1;
722  z3 = d7;
723  z4 = d5 + d1;
724  z5 = MULTIPLY(z3 + z4, FIX_1_175875602);
725 
726  tmp0 = MULTIPLY(d7, FIX_0_298631336);
727  tmp1 = MULTIPLY(d5, FIX_2_053119869);
728  tmp3 = MULTIPLY(d1, FIX_1_501321110);
729  z1 = MULTIPLY(-z1, FIX_0_899976223);
730  z2 = MULTIPLY(-d5, FIX_2_562915447);
731  z3 = MULTIPLY(-d7, FIX_1_961570560);
732  z4 = MULTIPLY(-z4, FIX_0_390180644);
733 
734  z3 += z5;
735  z4 += z5;
736 
737  tmp0 += z1 + z3;
738  tmp1 += z2 + z4;
739  tmp2 = z2 + z3;
740  tmp3 += z1 + z4;
741  } else {
742  /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
743  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
744  z1 = MULTIPLY(-d7, FIX_0_899976223);
745  z3 = MULTIPLY(-d7, FIX_1_961570560);
746  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
747  z2 = MULTIPLY(-d5, FIX_2_562915447);
748  z4 = MULTIPLY(-d5, FIX_0_390180644);
749  z5 = MULTIPLY(d5 + d7, FIX_1_175875602);
750 
751  z3 += z5;
752  z4 += z5;
753 
754  tmp0 += z3;
755  tmp1 += z4;
756  tmp2 = z2 + z3;
757  tmp3 = z1 + z4;
758  }
759  }
760  } else {
761  if (d3) {
762  if (d1) {
763  /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
764  z1 = d7 + d1;
765  z3 = d7 + d3;
766  z5 = MULTIPLY(z3 + d1, FIX_1_175875602);
767 
768  tmp0 = MULTIPLY(d7, FIX_0_298631336);
769  tmp2 = MULTIPLY(d3, FIX_3_072711026);
770  tmp3 = MULTIPLY(d1, FIX_1_501321110);
771  z1 = MULTIPLY(-z1, FIX_0_899976223);
772  z2 = MULTIPLY(-d3, FIX_2_562915447);
773  z3 = MULTIPLY(-z3, FIX_1_961570560);
774  z4 = MULTIPLY(-d1, FIX_0_390180644);
775 
776  z3 += z5;
777  z4 += z5;
778 
779  tmp0 += z1 + z3;
780  tmp1 = z2 + z4;
781  tmp2 += z2 + z3;
782  tmp3 += z1 + z4;
783  } else {
784  /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
785  z3 = d7 + d3;
786 
787  tmp0 = MULTIPLY(-d7, FIX_0_601344887);
788  z1 = MULTIPLY(-d7, FIX_0_899976223);
789  tmp2 = MULTIPLY(d3, FIX_0_509795579);
790  z2 = MULTIPLY(-d3, FIX_2_562915447);
791  z5 = MULTIPLY(z3, FIX_1_175875602);
792  z3 = MULTIPLY(-z3, FIX_0_785694958);
793 
794  tmp0 += z3;
795  tmp1 = z2 + z5;
796  tmp2 += z3;
797  tmp3 = z1 + z5;
798  }
799  } else {
800  if (d1) {
801  /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
802  z1 = d7 + d1;
803  z5 = MULTIPLY(z1, FIX_1_175875602);
804 
805  z1 = MULTIPLY(z1, FIX_0_275899380);
806  z3 = MULTIPLY(-d7, FIX_1_961570560);
807  tmp0 = MULTIPLY(-d7, FIX_1_662939225);
808  z4 = MULTIPLY(-d1, FIX_0_390180644);
809  tmp3 = MULTIPLY(d1, FIX_1_111140466);
810 
811  tmp0 += z1;
812  tmp1 = z4 + z5;
813  tmp2 = z3 + z5;
814  tmp3 += z1;
815  } else {
816  /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
817  tmp0 = MULTIPLY(-d7, FIX_1_387039845);
818  tmp1 = MULTIPLY(d7, FIX_1_175875602);
819  tmp2 = MULTIPLY(-d7, FIX_0_785694958);
820  tmp3 = MULTIPLY(d7, FIX_0_275899380);
821  }
822  }
823  }
824  } else {
825  if (d5) {
826  if (d3) {
827  if (d1) {
828  /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
829  z2 = d5 + d3;
830  z4 = d5 + d1;
831  z5 = MULTIPLY(d3 + z4, FIX_1_175875602);
832 
833  tmp1 = MULTIPLY(d5, FIX_2_053119869);
834  tmp2 = MULTIPLY(d3, FIX_3_072711026);
835  tmp3 = MULTIPLY(d1, FIX_1_501321110);
836  z1 = MULTIPLY(-d1, FIX_0_899976223);
837  z2 = MULTIPLY(-z2, FIX_2_562915447);
838  z3 = MULTIPLY(-d3, FIX_1_961570560);
839  z4 = MULTIPLY(-z4, FIX_0_390180644);
840 
841  z3 += z5;
842  z4 += z5;
843 
844  tmp0 = z1 + z3;
845  tmp1 += z2 + z4;
846  tmp2 += z2 + z3;
847  tmp3 += z1 + z4;
848  } else {
849  /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
850  z2 = d5 + d3;
851 
852  z5 = MULTIPLY(z2, FIX_1_175875602);
853  tmp1 = MULTIPLY(d5, FIX_1_662939225);
854  z4 = MULTIPLY(-d5, FIX_0_390180644);
855  z2 = MULTIPLY(-z2, FIX_1_387039845);
856  tmp2 = MULTIPLY(d3, FIX_1_111140466);
857  z3 = MULTIPLY(-d3, FIX_1_961570560);
858 
859  tmp0 = z3 + z5;
860  tmp1 += z2;
861  tmp2 += z2;
862  tmp3 = z4 + z5;
863  }
864  } else {
865  if (d1) {
866  /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
867  z4 = d5 + d1;
868 
869  z5 = MULTIPLY(z4, FIX_1_175875602);
870  z1 = MULTIPLY(-d1, FIX_0_899976223);
871  tmp3 = MULTIPLY(d1, FIX_0_601344887);
872  tmp1 = MULTIPLY(-d5, FIX_0_509795579);
873  z2 = MULTIPLY(-d5, FIX_2_562915447);
874  z4 = MULTIPLY(z4, FIX_0_785694958);
875 
876  tmp0 = z1 + z5;
877  tmp1 += z4;
878  tmp2 = z2 + z5;
879  tmp3 += z4;
880  } else {
881  /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
882  tmp0 = MULTIPLY(d5, FIX_1_175875602);
883  tmp1 = MULTIPLY(d5, FIX_0_275899380);
884  tmp2 = MULTIPLY(-d5, FIX_1_387039845);
885  tmp3 = MULTIPLY(d5, FIX_0_785694958);
886  }
887  }
888  } else {
889  if (d3) {
890  if (d1) {
891  /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
892  z5 = d1 + d3;
893  tmp3 = MULTIPLY(d1, FIX_0_211164243);
894  tmp2 = MULTIPLY(-d3, FIX_1_451774981);
895  z1 = MULTIPLY(d1, FIX_1_061594337);
896  z2 = MULTIPLY(-d3, FIX_2_172734803);
897  z4 = MULTIPLY(z5, FIX_0_785694958);
898  z5 = MULTIPLY(z5, FIX_1_175875602);
899 
900  tmp0 = z1 - z4;
901  tmp1 = z2 + z4;
902  tmp2 += z5;
903  tmp3 += z5;
904  } else {
905  /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
906  tmp0 = MULTIPLY(-d3, FIX_0_785694958);
907  tmp1 = MULTIPLY(-d3, FIX_1_387039845);
908  tmp2 = MULTIPLY(-d3, FIX_0_275899380);
909  tmp3 = MULTIPLY(d3, FIX_1_175875602);
910  }
911  } else {
912  if (d1) {
913  /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
914  tmp0 = MULTIPLY(d1, FIX_0_275899380);
915  tmp1 = MULTIPLY(d1, FIX_0_785694958);
916  tmp2 = MULTIPLY(d1, FIX_1_175875602);
917  tmp3 = MULTIPLY(d1, FIX_1_387039845);
918  } else {
919  /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
920  tmp0 = tmp1 = tmp2 = tmp3 = 0;
921  }
922  }
923  }
924  }
925 
926  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
927 
928  dataptr[DCTSIZE*0] = (int16_t) DESCALE(tmp10 + tmp3,
930  dataptr[DCTSIZE*7] = (int16_t) DESCALE(tmp10 - tmp3,
932  dataptr[DCTSIZE*1] = (int16_t) DESCALE(tmp11 + tmp2,
934  dataptr[DCTSIZE*6] = (int16_t) DESCALE(tmp11 - tmp2,
936  dataptr[DCTSIZE*2] = (int16_t) DESCALE(tmp12 + tmp1,
938  dataptr[DCTSIZE*5] = (int16_t) DESCALE(tmp12 - tmp1,
940  dataptr[DCTSIZE*3] = (int16_t) DESCALE(tmp13 + tmp0,
942  dataptr[DCTSIZE*4] = (int16_t) DESCALE(tmp13 - tmp0,
944 
945  dataptr++; /* advance pointer to next column */
946  }
947 }
948 
949 #undef DCTSIZE
950 #define DCTSIZE 4
951 #define DCTSTRIDE 8
952 
954 {
955  int32_t tmp0, tmp1, tmp2, tmp3;
956  int32_t tmp10, tmp11, tmp12, tmp13;
957  int32_t z1;
958  int32_t d0, d2, d4, d6;
959  register int16_t *dataptr;
960  int rowctr;
961 
962  /* Pass 1: process rows. */
963  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
964  /* furthermore, we scale the results by 2**PASS1_BITS. */
965 
966  data[0] += 4;
967 
968  dataptr = data;
969 
970  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
971  /* Due to quantization, we will usually find that many of the input
972  * coefficients are zero, especially the AC terms. We can exploit this
973  * by short-circuiting the IDCT calculation for any row in which all
974  * the AC terms are zero. In that case each output is equal to the
975  * DC coefficient (with scale factor as needed).
976  * With typical images and quantization tables, half or more of the
977  * row DCT calculations can be simplified this way.
978  */
979 
980  register uint8_t *idataptr = (uint8_t*)dataptr;
981 
982  d0 = dataptr[0];
983  d2 = dataptr[1];
984  d4 = dataptr[2];
985  d6 = dataptr[3];
986 
987  if ((d2 | d4 | d6) == 0) {
988  /* AC terms all zero */
989  if (d0) {
990  /* Compute a 32 bit value to assign. */
991  int16_t dcval = (int16_t) (d0 * (1 << PASS1_BITS));
992  register unsigned v = (dcval & 0xffff) | ((uint32_t)dcval << 16);
993 
994  AV_WN32A(&idataptr[0], v);
995  AV_WN32A(&idataptr[4], v);
996  }
997 
998  dataptr += DCTSTRIDE; /* advance pointer to next row */
999  continue;
1000  }
1001 
1002  /* Even part: reverse the even part of the forward DCT. */
1003  /* The rotator is sqrt(2)*c(-6). */
1004  if (d6) {
1005  if (d2) {
1006  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1007  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1008  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1009  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1010 
1011  tmp0 = (d0 + d4) * (1 << CONST_BITS);
1012  tmp1 = (d0 - d4) * (1 << CONST_BITS);
1013 
1014  tmp10 = tmp0 + tmp3;
1015  tmp13 = tmp0 - tmp3;
1016  tmp11 = tmp1 + tmp2;
1017  tmp12 = tmp1 - tmp2;
1018  } else {
1019  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1020  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1021  tmp3 = MULTIPLY(d6, FIX_0_541196100);
1022 
1023  tmp0 = (d0 + d4) * (1 << CONST_BITS);
1024  tmp1 = (d0 - d4) * (1 << CONST_BITS);
1025 
1026  tmp10 = tmp0 + tmp3;
1027  tmp13 = tmp0 - tmp3;
1028  tmp11 = tmp1 + tmp2;
1029  tmp12 = tmp1 - tmp2;
1030  }
1031  } else {
1032  if (d2) {
1033  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1034  tmp2 = MULTIPLY(d2, FIX_0_541196100);
1035  tmp3 = MULTIPLY(d2, FIX_1_306562965);
1036 
1037  tmp0 = (d0 + d4) * (1 << CONST_BITS);
1038  tmp1 = (d0 - d4) * (1 << CONST_BITS);
1039 
1040  tmp10 = tmp0 + tmp3;
1041  tmp13 = tmp0 - tmp3;
1042  tmp11 = tmp1 + tmp2;
1043  tmp12 = tmp1 - tmp2;
1044  } else {
1045  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1046  tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
1047  tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
1048  }
1049  }
1050 
1051  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1052 
1053  dataptr[0] = (int16_t) DESCALE(tmp10, CONST_BITS-PASS1_BITS);
1054  dataptr[1] = (int16_t) DESCALE(tmp11, CONST_BITS-PASS1_BITS);
1055  dataptr[2] = (int16_t) DESCALE(tmp12, CONST_BITS-PASS1_BITS);
1056  dataptr[3] = (int16_t) DESCALE(tmp13, CONST_BITS-PASS1_BITS);
1057 
1058  dataptr += DCTSTRIDE; /* advance pointer to next row */
1059  }
1060 
1061  /* Pass 2: process columns. */
1062  /* Note that we must descale the results by a factor of 8 == 2**3, */
1063  /* and also undo the PASS1_BITS scaling. */
1064 
1065  dataptr = data;
1066  for (rowctr = DCTSIZE-1; rowctr >= 0; rowctr--) {
1067  /* Columns of zeroes can be exploited in the same way as we did with rows.
1068  * However, the row calculation has created many nonzero AC terms, so the
1069  * simplification applies less often (typically 5% to 10% of the time).
1070  * On machines with very fast multiplication, it's possible that the
1071  * test takes more time than it's worth. In that case this section
1072  * may be commented out.
1073  */
1074 
1075  d0 = dataptr[DCTSTRIDE*0];
1076  d2 = dataptr[DCTSTRIDE*1];
1077  d4 = dataptr[DCTSTRIDE*2];
1078  d6 = dataptr[DCTSTRIDE*3];
1079 
1080  /* Even part: reverse the even part of the forward DCT. */
1081  /* The rotator is sqrt(2)*c(-6). */
1082  if (d6) {
1083  if (d2) {
1084  /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
1085  z1 = MULTIPLY(d2 + d6, FIX_0_541196100);
1086  tmp2 = z1 + MULTIPLY(-d6, FIX_1_847759065);
1087  tmp3 = z1 + MULTIPLY(d2, FIX_0_765366865);
1088 
1089  tmp0 = (d0 + d4) * (1 << CONST_BITS);
1090  tmp1 = (d0 - d4) * (1 << CONST_BITS);
1091 
1092  tmp10 = tmp0 + tmp3;
1093  tmp13 = tmp0 - tmp3;
1094  tmp11 = tmp1 + tmp2;
1095  tmp12 = tmp1 - tmp2;
1096  } else {
1097  /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
1098  tmp2 = MULTIPLY(-d6, FIX_1_306562965);
1099  tmp3 = MULTIPLY(d6, FIX_0_541196100);
1100 
1101  tmp0 = (d0 + d4) * (1 << CONST_BITS);
1102  tmp1 = (d0 - d4) * (1 << CONST_BITS);
1103 
1104  tmp10 = tmp0 + tmp3;
1105  tmp13 = tmp0 - tmp3;
1106  tmp11 = tmp1 + tmp2;
1107  tmp12 = tmp1 - tmp2;
1108  }
1109  } else {
1110  if (d2) {
1111  /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
1112  tmp2 = MULTIPLY(d2, FIX_0_541196100);
1113  tmp3 = MULTIPLY(d2, FIX_1_306562965);
1114 
1115  tmp0 = (d0 + d4) * (1 << CONST_BITS);
1116  tmp1 = (d0 - d4) * (1 << CONST_BITS);
1117 
1118  tmp10 = tmp0 + tmp3;
1119  tmp13 = tmp0 - tmp3;
1120  tmp11 = tmp1 + tmp2;
1121  tmp12 = tmp1 - tmp2;
1122  } else {
1123  /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
1124  tmp10 = tmp13 = (d0 + d4) * (1 << CONST_BITS);
1125  tmp11 = tmp12 = (d0 - d4) * (1 << CONST_BITS);
1126  }
1127  }
1128 
1129  /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
1130 
1131  dataptr[DCTSTRIDE*0] = tmp10 >> (CONST_BITS+PASS1_BITS+3);
1132  dataptr[DCTSTRIDE*1] = tmp11 >> (CONST_BITS+PASS1_BITS+3);
1133  dataptr[DCTSTRIDE*2] = tmp12 >> (CONST_BITS+PASS1_BITS+3);
1134  dataptr[DCTSTRIDE*3] = tmp13 >> (CONST_BITS+PASS1_BITS+3);
1135 
1136  dataptr++; /* advance pointer to next column */
1137  }
1138 }
1139 
1141  int d00, d01, d10, d11;
1142 
1143  data[0] += 4;
1144  d00 = data[0+0*DCTSTRIDE] + data[1+0*DCTSTRIDE];
1145  d01 = data[0+0*DCTSTRIDE] - data[1+0*DCTSTRIDE];
1146  d10 = data[0+1*DCTSTRIDE] + data[1+1*DCTSTRIDE];
1147  d11 = data[0+1*DCTSTRIDE] - data[1+1*DCTSTRIDE];
1148 
1149  data[0+0*DCTSTRIDE]= (d00 + d10)>>3;
1150  data[1+0*DCTSTRIDE]= (d01 + d11)>>3;
1151  data[0+1*DCTSTRIDE]= (d00 - d10)>>3;
1152  data[1+1*DCTSTRIDE]= (d01 - d11)>>3;
1153 }
1154 
1156  data[0] = (data[0] + 4)>>3;
1157 }
1158 
1159 #undef FIX
1160 #undef CONST_BITS
1161 
/**
 * Inverse-transform a coefficient block and store the result to an image.
 *
 * @param dest      destination pixel buffer, passed through to
 *                  ff_put_pixels_clamped_c()
 * @param line_size stride of dest, passed through to
 *                  ff_put_pixels_clamped_c()
 * @param block     coefficient block; transformed in place by
 *                  ff_j_rev_dct() before being stored
 */
void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* The transform must run before the store: without it the raw DCT
     * coefficients would be written out as if they were pixel values. */
    ff_j_rev_dct(block);
    ff_put_pixels_clamped_c(block, dest, line_size);
}
1167 
/**
 * Inverse-transform a coefficient block and add the result to an image.
 *
 * @param dest      destination pixel buffer, passed through to
 *                  ff_add_pixels_clamped_c()
 * @param line_size stride of dest, passed through to
 *                  ff_add_pixels_clamped_c()
 * @param block     coefficient block; transformed in place by
 *                  ff_j_rev_dct() before being added
 */
void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
{
    /* The transform must run before the add: without it the raw DCT
     * coefficients would be accumulated into dest instead of the
     * reconstructed values. */
    ff_j_rev_dct(block);
    ff_add_pixels_clamped_c(block, dest, line_size);
}
DESCALE
#define DESCALE(x, n)
Definition: jrevdct.c:155
DCTSTRIDE
#define DCTSTRIDE
Definition: jrevdct.c:951
DCTSIZE
#define DCTSIZE
Definition: jrevdct.c:950
ff_jref_idct_add
void ff_jref_idct_add(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: jrevdct.c:1168
FIX_1_061594337
#define FIX_1_061594337
Definition: jrevdct.c:196
FIX_1_501321110
#define FIX_1_501321110
Definition: jrevdct.c:202
FIX_0_899976223
#define FIX_0_899976223
Definition: jrevdct.c:195
ff_j_rev_dct4
void ff_j_rev_dct4(DCTBLOCK data)
Definition: jrevdct.c:953
FIX_1_847759065
#define FIX_1_847759065
Definition: jrevdct.c:204
FIX_2_562915447
#define FIX_2_562915447
Definition: jrevdct.c:208
data
const char data[16]
Definition: mxf.c:149
FIX_0_765366865
#define FIX_0_765366865
Definition: jrevdct.c:193
FIX_0_541196100
#define FIX_0_541196100
Definition: jrevdct.c:191
AV_WN32A
#define AV_WN32A(p, v)
Definition: intreadwrite.h:534
ff_j_rev_dct1
void ff_j_rev_dct1(DCTBLOCK data)
Definition: jrevdct.c:1155
CONST_SCALE
#define CONST_SCALE
Definition: jrevdct.c:139
FIX_2_172734803
#define FIX_2_172734803
Definition: jrevdct.c:207
FIX_1_451774981
#define FIX_1_451774981
Definition: jrevdct.c:201
dct.h
intreadwrite.h
FIX_1_662939225
#define FIX_1_662939225
Definition: jrevdct.c:203
DCTSIZE2
#define DCTSIZE2
Definition: jrevdct.c:76
FIX_0_509795579
#define FIX_0_509795579
Definition: jrevdct.c:190
DCTBLOCK
int16_t DCTBLOCK[DCTSIZE2]
Definition: jrevdct.c:82
FIX_0_298631336
#define FIX_0_298631336
Definition: jrevdct.c:188
CONST_BITS
#define CONST_BITS
Definition: jrevdct.c:84
FIX_0_211164243
#define FIX_0_211164243
Definition: jrevdct.c:186
FIX_0_390180644
#define FIX_0_390180644
Definition: jrevdct.c:189
FIX_1_961570560
#define FIX_1_961570560
Definition: jrevdct.c:205
FIX_1_175875602
#define FIX_1_175875602
Definition: jrevdct.c:198
FIX_2_053119869
#define FIX_2_053119869
Definition: jrevdct.c:206
FIX_3_072711026
#define FIX_3_072711026
Definition: jrevdct.c:209
code
and forward the test the status of outputs and forward it to the corresponding return FFERROR_NOT_READY If the filters stores internally one or a few frame for some it can consider them to be part of the FIFO and delay acknowledging a status change accordingly Example code
Definition: filter_design.txt:178
ff_add_pixels_clamped_c
void ff_add_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels, ptrdiff_t line_size)
Definition: idctdsp.c:147
ff_j_rev_dct2
void ff_j_rev_dct2(DCTBLOCK data)
Definition: jrevdct.c:1140
ff_jref_idct_put
void ff_jref_idct_put(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
Definition: jrevdct.c:1162
FIX_0_275899380
#define FIX_0_275899380
Definition: jrevdct.c:187
idctdsp.h
FIX_0_601344887
#define FIX_0_601344887
Definition: jrevdct.c:192
ff_j_rev_dct
void ff_j_rev_dct(DCTBLOCK data)
Definition: jrevdct.c:215
FIX_1_387039845
#define FIX_1_387039845
Definition: jrevdct.c:200
FIX_1_306562965
#define FIX_1_306562965
Definition: jrevdct.c:199
FIX_0_785694958
#define FIX_0_785694958
Definition: jrevdct.c:194
ff_put_pixels_clamped_c
void ff_put_pixels_clamped_c(const int16_t *block, uint8_t *restrict pixels, ptrdiff_t line_size)
Definition: idctdsp.c:73
PASS1_BITS
#define PASS1_BITS
Definition: jrevdct.c:132
int32_t
int32_t
Definition: audioconvert.c:56
block
The exact code depends on how similar the blocks are and how related they are to the block
Definition: filter_design.txt:207
FIX_1_111140466
#define FIX_1_111140466
Definition: jrevdct.c:197
MULTIPLY
#define MULTIPLY(var, const)
Definition: jrevdct.c:178