FFmpeg
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
dct-test.c
Go to the documentation of this file.
1 /*
2  * (c) 2001 Fabrice Bellard
3  * 2007 Marc Hoffman <marc.hoffman@analog.com>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21 
22 /**
23  * @file
24  * DCT test (c) 2001 Fabrice Bellard
25  * Started from sample code by Juan J. Sierralta P.
26  */
27 
28 #include "config.h"
29 #include <stdlib.h>
30 #include <stdio.h>
31 #include <string.h>
32 #if HAVE_UNISTD_H
33 #include <unistd.h>
34 #endif
35 #include <math.h>
36 
37 #include "libavutil/cpu.h"
38 #include "libavutil/common.h"
39 #include "libavutil/lfg.h"
40 #include "libavutil/time.h"
41 
42 #include "dct.h"
43 #include "simple_idct.h"
44 #include "aandcttab.h"
45 #include "faandct.h"
46 #include "faanidct.h"
47 #include "x86/idct_xvid.h"
48 #include "dctref.h"
49 
50 // ALTIVEC
51 void ff_fdct_altivec(int16_t *block);
52 
53 // ARM
54 void ff_j_rev_dct_arm(int16_t *data);
55 void ff_simple_idct_arm(int16_t *data);
56 void ff_simple_idct_armv5te(int16_t *data);
57 void ff_simple_idct_armv6(int16_t *data);
58 void ff_simple_idct_neon(int16_t *data);
59 
/**
 * Description of one DCT/IDCT implementation under test.
 *
 * The tables below initialise entries positionally, so the member order
 * (name, func, format, mm_support, nonspec) must not change.
 */
struct algo {
    const char *name;               /* label printed in reports; NULL terminates a table */
    void (*func)(int16_t *block);   /* transform applied in place to a 64-entry block */
    enum formattag {                /* input layout, interpreted by permute()/dct_error() */
        NO_PERM,
        MMX_PERM,
        MMX_SIMPLE_PERM,
        SCALE_PERM,
        SSE2_PERM,
        PARTTRANS_PERM,
        TRANSPOSE_PERM
    } format;
    int mm_support;                 /* AV_CPU_FLAG_* bits that must be present to run func */
    int nonspec;                    /* non-zero: exceeding the accuracy limits is not an error */
};
68 
69 static int cpu_flags;
70 
71 static const struct algo fdct_tab[] = {
72  { "REF-DBL", ff_ref_fdct, NO_PERM },
73  { "FAAN", ff_faandct, NO_PERM },
74  { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
75  { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
76 
77 #if HAVE_MMX_INLINE
78  { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
79 #endif
80 #if HAVE_MMXEXT_INLINE
82 #endif
83 #if HAVE_SSE2_INLINE
85 #endif
86 
87 #if HAVE_ALTIVEC
88  { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
89 #endif
90 
91  { 0 }
92 };
93 
94 static void ff_prores_idct_wrap(int16_t *dst){
95  DECLARE_ALIGNED(16, static int16_t, qmat)[64];
96  int i;
97 
98  for(i=0; i<64; i++){
99  qmat[i]=4;
100  }
101  ff_prores_idct(dst, qmat);
102  for(i=0; i<64; i++) {
103  dst[i] -= 512;
104  }
105 }
#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                int16_t *block, int16_t *qmat);

/*
 * Adapter for ff_prores_idct_put_10_sse2(): the coefficients are copied to a
 * scratch block, the transform writes its result back over dst with a line
 * size of 16, and 512 is then subtracted from each of the 64 values.
 * The quantiser matrix is filled with 4 on every call.
 */
static void ff_prores_idct_put_10_sse2_wrap(int16_t *dst)
{
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
    int n = 0;

    while (n < 64) {
        qmat[n] = 4;
        tmp[n]  = dst[n];
        n++;
    }

    ff_prores_idct_put_10_sse2(dst, 16, tmp, qmat);

    for (n = 0; n < 64; n++)
        dst[n] -= 512;
}
#endif
126 
127 static const struct algo idct_tab[] = {
128  { "FAANI", ff_faanidct, NO_PERM },
129  { "REF-DBL", ff_ref_idct, NO_PERM },
130  { "INT", ff_j_rev_dct, MMX_PERM },
131  { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
132  { "PR-C", ff_prores_idct_wrap, NO_PERM, 0, 1 },
133 
134 #if HAVE_MMX_INLINE
136  { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
137 #endif
138 #if HAVE_MMXEXT_INLINE
139  { "XVID-MMXEXT", ff_idct_xvid_mmxext, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
140 #endif
141 #if HAVE_SSE2_INLINE
142  { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
143 #if ARCH_X86_64 && HAVE_YASM
144  { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
145 #endif
146 #endif
147 
148 #if ARCH_ARM
149  { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
150  { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
151 #endif
152 #if HAVE_ARMV5TE
153  { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM, AV_CPU_FLAG_ARMV5TE },
154 #endif
155 #if HAVE_ARMV6
156  { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM, AV_CPU_FLAG_ARMV6 },
157 #endif
158 #if HAVE_NEON && ARCH_ARM
160 #endif
161 
162  { 0 }
163 };
164 
165 #define AANSCALE_BITS 12
166 
167 #define NB_ITS 20000
168 #define NB_ITS_SPEED 50000
169 
170 static short idct_mmx_perm[64];
171 
172 static short idct_simple_mmx_perm[64] = {
173  0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
174  0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
175  0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
176  0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
177  0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
178  0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
179  0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
180  0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
181 };
182 
183 static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
184 
185 static void idct_mmx_init(void)
186 {
187  int i;
188 
189  /* the mmx/mmxext idct uses a reordered input, so we patch scan tables */
190  for (i = 0; i < 64; i++) {
191  idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
192  }
193 }
194 
195 DECLARE_ALIGNED(16, static int16_t, block)[64];
196 DECLARE_ALIGNED(8, static int16_t, block1)[64];
197 
198 static void init_block(int16_t block[64], int test, int is_idct, AVLFG *prng, int vals)
199 {
200  int i, j;
201 
202  memset(block, 0, 64 * sizeof(*block));
203 
204  switch (test) {
205  case 0:
206  for (i = 0; i < 64; i++)
207  block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
208  if (is_idct) {
209  ff_ref_fdct(block);
210  for (i = 0; i < 64; i++)
211  block[i] >>= 3;
212  }
213  break;
214  case 1:
215  j = av_lfg_get(prng) % 10 + 1;
216  for (i = 0; i < j; i++) {
217  int idx = av_lfg_get(prng) % 64;
218  block[idx] = av_lfg_get(prng) % (2*vals) -vals;
219  }
220  break;
221  case 2:
222  block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
223  block[63] = (block[0] & 1) ^ 1;
224  break;
225  }
226 }
227 
228 static void permute(int16_t dst[64], const int16_t src[64], int perm)
229 {
230  int i;
231 
232  if (perm == MMX_PERM) {
233  for (i = 0; i < 64; i++)
234  dst[idct_mmx_perm[i]] = src[i];
235  } else if (perm == MMX_SIMPLE_PERM) {
236  for (i = 0; i < 64; i++)
237  dst[idct_simple_mmx_perm[i]] = src[i];
238  } else if (perm == SSE2_PERM) {
239  for (i = 0; i < 64; i++)
240  dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
241  } else if (perm == PARTTRANS_PERM) {
242  for (i = 0; i < 64; i++)
243  dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
244  } else if (perm == TRANSPOSE_PERM) {
245  for (i = 0; i < 64; i++)
246  dst[(i>>3) | ((i<<3)&0x38)] = src[i];
247  } else {
248  for (i = 0; i < 64; i++)
249  dst[i] = src[i];
250  }
251 }
252 
253 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
254 {
255  void (*ref)(int16_t *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
256  int it, i, scale;
257  int err_inf, v;
258  int64_t err2, ti, ti1, it1, err_sum = 0;
259  int64_t sysErr[64], sysErrMax = 0;
260  int maxout = 0;
261  int blockSumErrMax = 0, blockSumErr;
262  AVLFG prng;
263  const int vals=1<<bits;
264  double omse, ome;
265  int spec_err;
266 
267  av_lfg_init(&prng, 1);
268 
269  err_inf = 0;
270  err2 = 0;
271  for (i = 0; i < 64; i++)
272  sysErr[i] = 0;
273  for (it = 0; it < NB_ITS; it++) {
274  init_block(block1, test, is_idct, &prng, vals);
275  permute(block, block1, dct->format);
276 
277  dct->func(block);
278  emms_c();
279 
280  if (dct->format == SCALE_PERM) {
281  for (i = 0; i < 64; i++) {
282  scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
283  block[i] = (block[i] * scale) >> AANSCALE_BITS;
284  }
285  }
286 
287  ref(block1);
288  if (!strcmp(dct->name, "PR-SSE2"))
289  for (i = 0; i < 64; i++)
290  block1[i] = av_clip(block1[i], 4-512, 1019-512);
291 
292  blockSumErr = 0;
293  for (i = 0; i < 64; i++) {
294  int err = block[i] - block1[i];
295  err_sum += err;
296  v = abs(err);
297  if (v > err_inf)
298  err_inf = v;
299  err2 += v * v;
300  sysErr[i] += block[i] - block1[i];
301  blockSumErr += v;
302  if (abs(block[i]) > maxout)
303  maxout = abs(block[i]);
304  }
305  if (blockSumErrMax < blockSumErr)
306  blockSumErrMax = blockSumErr;
307  }
308  for (i = 0; i < 64; i++)
309  sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
310 
311  for (i = 0; i < 64; i++) {
312  if (i % 8 == 0)
313  printf("\n");
314  printf("%7d ", (int) sysErr[i]);
315  }
316  printf("\n");
317 
318  omse = (double) err2 / NB_ITS / 64;
319  ome = (double) err_sum / NB_ITS / 64;
320 
321  spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
322 
323  printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
324  is_idct ? "IDCT" : "DCT", dct->name, err_inf,
325  omse, ome, (double) sysErrMax / NB_ITS,
326  maxout, blockSumErrMax);
327 
328  if (spec_err && !dct->nonspec)
329  return 1;
330 
331  if (!speed)
332  return 0;
333 
334  /* speed test */
335 
336  init_block(block, test, is_idct, &prng, vals);
337  permute(block1, block, dct->format);
338 
339  ti = av_gettime_relative();
340  it1 = 0;
341  do {
342  for (it = 0; it < NB_ITS_SPEED; it++) {
343  memcpy(block, block1, sizeof(block));
344  dct->func(block);
345  }
346  emms_c();
347  it1 += NB_ITS_SPEED;
348  ti1 = av_gettime_relative() - ti;
349  } while (ti1 < 1000000);
350 
351  printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
352  (double) it1 * 1000.0 / (double) ti1);
353 
354  return 0;
355 }
356 
359 
360 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
361 {
362  static int init;
363  static double c8[8][8];
364  static double c4[4][4];
365  double block1[64], block2[64], block3[64];
366  double s, sum, v;
367  int i, j, k;
368 
369  if (!init) {
370  init = 1;
371 
372  for (i = 0; i < 8; i++) {
373  sum = 0;
374  for (j = 0; j < 8; j++) {
375  s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
376  c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
377  sum += c8[i][j] * c8[i][j];
378  }
379  }
380 
381  for (i = 0; i < 4; i++) {
382  sum = 0;
383  for (j = 0; j < 4; j++) {
384  s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
385  c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
386  sum += c4[i][j] * c4[i][j];
387  }
388  }
389  }
390 
391  /* butterfly */
392  s = 0.5 * sqrt(2.0);
393  for (i = 0; i < 4; i++) {
394  for (j = 0; j < 8; j++) {
395  block1[8 * (2 * i) + j] =
396  (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
397  block1[8 * (2 * i + 1) + j] =
398  (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
399  }
400  }
401 
402  /* idct8 on lines */
403  for (i = 0; i < 8; i++) {
404  for (j = 0; j < 8; j++) {
405  sum = 0;
406  for (k = 0; k < 8; k++)
407  sum += c8[k][j] * block1[8 * i + k];
408  block2[8 * i + j] = sum;
409  }
410  }
411 
412  /* idct4 */
413  for (i = 0; i < 8; i++) {
414  for (j = 0; j < 4; j++) {
415  /* top */
416  sum = 0;
417  for (k = 0; k < 4; k++)
418  sum += c4[k][j] * block2[8 * (2 * k) + i];
419  block3[8 * (2 * j) + i] = sum;
420 
421  /* bottom */
422  sum = 0;
423  for (k = 0; k < 4; k++)
424  sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
425  block3[8 * (2 * j + 1) + i] = sum;
426  }
427  }
428 
429  /* clamp and store the result */
430  for (i = 0; i < 8; i++) {
431  for (j = 0; j < 8; j++) {
432  v = block3[8 * i + j];
433  if (v < 0) v = 0;
434  else if (v > 255) v = 255;
435  dest[i * linesize + j] = (int) rint(v);
436  }
437  }
438 }
439 
440 static void idct248_error(const char *name,
441  void (*idct248_put)(uint8_t *dest, int line_size,
442  int16_t *block),
443  int speed)
444 {
445  int it, i, it1, ti, ti1, err_max, v;
446  AVLFG prng;
447 
448  av_lfg_init(&prng, 1);
449 
450  /* just one test to see if code is correct (precision is less
451  important here) */
452  err_max = 0;
453  for (it = 0; it < NB_ITS; it++) {
454  /* XXX: use forward transform to generate values */
455  for (i = 0; i < 64; i++)
456  block1[i] = av_lfg_get(&prng) % 256 - 128;
457  block1[0] += 1024;
458 
459  for (i = 0; i < 64; i++)
460  block[i] = block1[i];
461  idct248_ref(img_dest1, 8, block);
462 
463  for (i = 0; i < 64; i++)
464  block[i] = block1[i];
465  idct248_put(img_dest, 8, block);
466 
467  for (i = 0; i < 64; i++) {
468  v = abs((int) img_dest[i] - (int) img_dest1[i]);
469  if (v == 255)
470  printf("%d %d\n", img_dest[i], img_dest1[i]);
471  if (v > err_max)
472  err_max = v;
473  }
474 #if 0
475  printf("ref=\n");
476  for(i=0;i<8;i++) {
477  int j;
478  for(j=0;j<8;j++) {
479  printf(" %3d", img_dest1[i*8+j]);
480  }
481  printf("\n");
482  }
483 
484  printf("out=\n");
485  for(i=0;i<8;i++) {
486  int j;
487  for(j=0;j<8;j++) {
488  printf(" %3d", img_dest[i*8+j]);
489  }
490  printf("\n");
491  }
492 #endif
493  }
494  printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
495 
496  if (!speed)
497  return;
498 
499  ti = av_gettime_relative();
500  it1 = 0;
501  do {
502  for (it = 0; it < NB_ITS_SPEED; it++) {
503  for (i = 0; i < 64; i++)
504  block[i] = block1[i];
505  idct248_put(img_dest, 8, block);
506  }
507  emms_c();
508  it1 += NB_ITS_SPEED;
509  ti1 = av_gettime_relative() - ti;
510  } while (ti1 < 1000000);
511 
512  printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
513  (double) it1 * 1000.0 / (double) ti1);
514 }
515 
/* Write the command line summary to stdout. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>] [<bits>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "bits Number of time domain bits to use, 8 is default\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    /* the text contains no conversion specifications, so fputs prints it verbatim */
    fputs(usage, stdout);
}
527 
528 #if !HAVE_GETOPT
529 #include "compat/getopt.c"
530 #endif
531 
532 int main(int argc, char **argv)
533 {
534  int test_idct = 0, test_248_dct = 0;
535  int c, i;
536  int test = 1;
537  int speed = 0;
538  int err = 0;
539  int bits=8;
540 
542 
543  ff_ref_dct_init();
544  idct_mmx_init();
545 
546  for (;;) {
547  c = getopt(argc, argv, "ih4t");
548  if (c == -1)
549  break;
550  switch (c) {
551  case 'i':
552  test_idct = 1;
553  break;
554  case '4':
555  test_248_dct = 1;
556  break;
557  case 't':
558  speed = 1;
559  break;
560  default:
561  case 'h':
562  help();
563  return 0;
564  }
565  }
566 
567  if (optind < argc)
568  test = atoi(argv[optind]);
569  if(optind+1 < argc) bits= atoi(argv[optind+1]);
570 
571  printf("ffmpeg DCT/IDCT test\n");
572 
573  if (test_248_dct) {
574  idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
575  } else {
576  const struct algo *algos = test_idct ? idct_tab : fdct_tab;
577  for (i = 0; algos[i].name; i++)
578  if (!(~cpu_flags & algos[i].mm_support)) {
579  err |= dct_error(&algos[i], test, test_idct, speed, bits);
580  }
581  }
582 
583  if (err)
584  printf("Error: %d.\n", err);
585 
586  return !!err;
587 }