00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include "config.h"
00029 #include <stdlib.h>
00030 #include <stdio.h>
00031 #include <string.h>
00032 #if HAVE_UNISTD_H
00033 #include <unistd.h>
00034 #endif
00035 #include <math.h>
00036
00037 #include "libavutil/cpu.h"
00038 #include "libavutil/common.h"
00039 #include "libavutil/lfg.h"
00040 #include "libavutil/time.h"
00041
00042 #include "simple_idct.h"
00043 #include "aandcttab.h"
00044 #include "faandct.h"
00045 #include "faanidct.h"
00046 #include "x86/idct_xvid.h"
00047 #include "dctref.h"
00048
00049 #undef printf
00050
00051 void ff_mmx_idct(DCTELEM *data);
00052 void ff_mmxext_idct(DCTELEM *data);
00053
00054
00055 void ff_bfin_idct(DCTELEM *block);
00056 void ff_bfin_fdct(DCTELEM *block);
00057
00058
00059 void ff_fdct_altivec(DCTELEM *block);
00060
00061
00062 void ff_j_rev_dct_arm(DCTELEM *data);
00063 void ff_simple_idct_arm(DCTELEM *data);
00064 void ff_simple_idct_armv5te(DCTELEM *data);
00065 void ff_simple_idct_armv6(DCTELEM *data);
00066 void ff_simple_idct_neon(DCTELEM *data);
00067
00068 void ff_simple_idct_axp(DCTELEM *data);
00069
00070 struct algo {
00071 const char *name;
00072 void (*func)(DCTELEM *block);
00073 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
00074 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
00075 int mm_support;
00076 int nonspec;
00077 };
00078
00079 static int cpu_flags;
00080
00081 static const struct algo fdct_tab[] = {
00082 { "REF-DBL", ff_ref_fdct, NO_PERM },
00083 { "FAAN", ff_faandct, NO_PERM },
00084 { "IJG-AAN-INT", ff_fdct_ifast, SCALE_PERM },
00085 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
00086
00087 #if HAVE_MMX_INLINE
00088 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
00089 { "MMXEXT", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMXEXT },
00090 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
00091 #endif
00092
00093 #if HAVE_ALTIVEC
00094 { "altivecfdct", ff_fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
00095 #endif
00096
00097 #if ARCH_BFIN
00098 { "BFINfdct", ff_bfin_fdct, NO_PERM },
00099 #endif
00100
00101 { 0 }
00102 };
00103
#if ARCH_X86_64 && HAVE_MMX && HAVE_YASM
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                DCTELEM *block, int16_t *qmat);

/**
 * Adapter giving ff_prores_idct_put_10_sse2() the in-place
 * void (*)(DCTELEM *) signature used by the algorithm tables.
 *
 * The coefficients are copied into a scratch block, a flat quantisation
 * matrix (every entry 4) is supplied, and the result is written back over
 * the caller's buffer with a line size of 16.
 *
 * @param dst 64 coefficients on input, overwritten with the output
 */
static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst)
{
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static int16_t, tmp)[64];
    int i;

    for (i = 0; i < 64; i++) {
        qmat[i] = 4;
        tmp[i]  = dst[i];
    }

    /* The output parameter is uint16_t *, while the buffer is DCTELEM *;
     * convert explicitly instead of relying on an implicit conversion
     * between incompatible pointer types. */
    ff_prores_idct_put_10_sse2((uint16_t *) dst, 16, tmp, qmat);
}
#endif
00120
00121 static const struct algo idct_tab[] = {
00122 { "FAANI", ff_faanidct, NO_PERM },
00123 { "REF-DBL", ff_ref_idct, NO_PERM },
00124 { "INT", ff_j_rev_dct, MMX_PERM },
00125 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
00126
00127 #if HAVE_MMX_INLINE
00128 #if CONFIG_GPL
00129 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
00130 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
00131 #endif
00132 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
00133 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
00134 { "XVID-MMXEXT", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMXEXT, 1 },
00135 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
00136 #if ARCH_X86_64 && HAVE_YASM
00137 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
00138 #endif
00139 #endif
00140
00141 #if ARCH_BFIN
00142 { "BFINidct", ff_bfin_idct, NO_PERM },
00143 #endif
00144
00145 #if ARCH_ARM
00146 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
00147 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
00148 #endif
00149 #if HAVE_ARMV5TE
00150 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
00151 #endif
00152 #if HAVE_ARMV6
00153 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
00154 #endif
00155 #if HAVE_NEON
00156 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
00157 #endif
00158
00159 #if ARCH_ALPHA
00160 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
00161 #endif
00162
00163 { 0 }
00164 };
00165
/* Fixed-point precision used by dct_error() when descaling the output of
 * SCALE_PERM transforms with ff_aanscales[]. */
#define AANSCALE_BITS 12

/* Number of test blocks per accuracy run. */
#define NB_ITS        20000
/* Number of transform calls between two clock reads in the speed tests. */
#define NB_ITS_SPEED  50000
00170
/* Destination index for each source coefficient under MMX_PERM;
 * filled in at startup by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each source coefficient under MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column for each source column under SSE2_PERM
 * (the row bits of the index are kept as they are). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
00185
00186 static void idct_mmx_init(void)
00187 {
00188 int i;
00189
00190
00191 for (i = 0; i < 64; i++) {
00192 idct_mmx_perm[i] = (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
00193 }
00194 }
00195
00196 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00197 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00198
00199 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
00200 {
00201 int i, j;
00202
00203 memset(block, 0, 64 * sizeof(*block));
00204
00205 switch (test) {
00206 case 0:
00207 for (i = 0; i < 64; i++)
00208 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
00209 if (is_idct) {
00210 ff_ref_fdct(block);
00211 for (i = 0; i < 64; i++)
00212 block[i] >>= 3;
00213 }
00214 break;
00215 case 1:
00216 j = av_lfg_get(prng) % 10 + 1;
00217 for (i = 0; i < j; i++) {
00218 int idx = av_lfg_get(prng) % 64;
00219 block[idx] = av_lfg_get(prng) % (2*vals) -vals;
00220 }
00221 break;
00222 case 2:
00223 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
00224 block[63] = (block[0] & 1) ^ 1;
00225 break;
00226 }
00227 }
00228
00229 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
00230 {
00231 int i;
00232
00233 if (perm == MMX_PERM) {
00234 for (i = 0; i < 64; i++)
00235 dst[idct_mmx_perm[i]] = src[i];
00236 } else if (perm == MMX_SIMPLE_PERM) {
00237 for (i = 0; i < 64; i++)
00238 dst[idct_simple_mmx_perm[i]] = src[i];
00239 } else if (perm == SSE2_PERM) {
00240 for (i = 0; i < 64; i++)
00241 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
00242 } else if (perm == PARTTRANS_PERM) {
00243 for (i = 0; i < 64; i++)
00244 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
00245 } else if (perm == TRANSPOSE_PERM) {
00246 for (i = 0; i < 64; i++)
00247 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
00248 } else {
00249 for (i = 0; i < 64; i++)
00250 dst[i] = src[i];
00251 }
00252 }
00253
00254 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
00255 {
00256 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
00257 int it, i, scale;
00258 int err_inf, v;
00259 int64_t err2, ti, ti1, it1, err_sum = 0;
00260 int64_t sysErr[64], sysErrMax = 0;
00261 int maxout = 0;
00262 int blockSumErrMax = 0, blockSumErr;
00263 AVLFG prng;
00264 const int vals=1<<bits;
00265 double omse, ome;
00266 int spec_err;
00267
00268 av_lfg_init(&prng, 1);
00269
00270 err_inf = 0;
00271 err2 = 0;
00272 for (i = 0; i < 64; i++)
00273 sysErr[i] = 0;
00274 for (it = 0; it < NB_ITS; it++) {
00275 init_block(block1, test, is_idct, &prng, vals);
00276 permute(block, block1, dct->format);
00277
00278 dct->func(block);
00279 emms_c();
00280
00281 if (dct->format == SCALE_PERM) {
00282 for (i = 0; i < 64; i++) {
00283 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00284 block[i] = (block[i] * scale) >> AANSCALE_BITS;
00285 }
00286 }
00287
00288 ref(block1);
00289
00290 blockSumErr = 0;
00291 for (i = 0; i < 64; i++) {
00292 int err = block[i] - block1[i];
00293 err_sum += err;
00294 v = abs(err);
00295 if (v > err_inf)
00296 err_inf = v;
00297 err2 += v * v;
00298 sysErr[i] += block[i] - block1[i];
00299 blockSumErr += v;
00300 if (abs(block[i]) > maxout)
00301 maxout = abs(block[i]);
00302 }
00303 if (blockSumErrMax < blockSumErr)
00304 blockSumErrMax = blockSumErr;
00305 }
00306 for (i = 0; i < 64; i++)
00307 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
00308
00309 for (i = 0; i < 64; i++) {
00310 if (i % 8 == 0)
00311 printf("\n");
00312 printf("%7d ", (int) sysErr[i]);
00313 }
00314 printf("\n");
00315
00316 omse = (double) err2 / NB_ITS / 64;
00317 ome = (double) err_sum / NB_ITS / 64;
00318
00319 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
00320
00321 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00322 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
00323 omse, ome, (double) sysErrMax / NB_ITS,
00324 maxout, blockSumErrMax);
00325
00326 if (spec_err && !dct->nonspec)
00327 return 1;
00328
00329 if (!speed)
00330 return 0;
00331
00332
00333
00334 init_block(block, test, is_idct, &prng, vals);
00335 permute(block1, block, dct->format);
00336
00337 ti = av_gettime();
00338 it1 = 0;
00339 do {
00340 for (it = 0; it < NB_ITS_SPEED; it++) {
00341 memcpy(block, block1, sizeof(block));
00342 dct->func(block);
00343 }
00344 emms_c();
00345 it1 += NB_ITS_SPEED;
00346 ti1 = av_gettime() - ti;
00347 } while (ti1 < 1000000);
00348
00349 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
00350 (double) it1 * 1000.0 / (double) ti1);
00351
00352 return 0;
00353 }
00354
00355 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00356 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00357
00358 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00359 {
00360 static int init;
00361 static double c8[8][8];
00362 static double c4[4][4];
00363 double block1[64], block2[64], block3[64];
00364 double s, sum, v;
00365 int i, j, k;
00366
00367 if (!init) {
00368 init = 1;
00369
00370 for (i = 0; i < 8; i++) {
00371 sum = 0;
00372 for (j = 0; j < 8; j++) {
00373 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
00374 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00375 sum += c8[i][j] * c8[i][j];
00376 }
00377 }
00378
00379 for (i = 0; i < 4; i++) {
00380 sum = 0;
00381 for (j = 0; j < 4; j++) {
00382 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
00383 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00384 sum += c4[i][j] * c4[i][j];
00385 }
00386 }
00387 }
00388
00389
00390 s = 0.5 * sqrt(2.0);
00391 for (i = 0; i < 4; i++) {
00392 for (j = 0; j < 8; j++) {
00393 block1[8 * (2 * i) + j] =
00394 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
00395 block1[8 * (2 * i + 1) + j] =
00396 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
00397 }
00398 }
00399
00400
00401 for (i = 0; i < 8; i++) {
00402 for (j = 0; j < 8; j++) {
00403 sum = 0;
00404 for (k = 0; k < 8; k++)
00405 sum += c8[k][j] * block1[8 * i + k];
00406 block2[8 * i + j] = sum;
00407 }
00408 }
00409
00410
00411 for (i = 0; i < 8; i++) {
00412 for (j = 0; j < 4; j++) {
00413
00414 sum = 0;
00415 for (k = 0; k < 4; k++)
00416 sum += c4[k][j] * block2[8 * (2 * k) + i];
00417 block3[8 * (2 * j) + i] = sum;
00418
00419
00420 sum = 0;
00421 for (k = 0; k < 4; k++)
00422 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
00423 block3[8 * (2 * j + 1) + i] = sum;
00424 }
00425 }
00426
00427
00428 for (i = 0; i < 8; i++) {
00429 for (j = 0; j < 8; j++) {
00430 v = block3[8 * i + j];
00431 if (v < 0) v = 0;
00432 else if (v > 255) v = 255;
00433 dest[i * linesize + j] = (int) rint(v);
00434 }
00435 }
00436 }
00437
00438 static void idct248_error(const char *name,
00439 void (*idct248_put)(uint8_t *dest, int line_size,
00440 int16_t *block),
00441 int speed)
00442 {
00443 int it, i, it1, ti, ti1, err_max, v;
00444 AVLFG prng;
00445
00446 av_lfg_init(&prng, 1);
00447
00448
00449
00450 err_max = 0;
00451 for (it = 0; it < NB_ITS; it++) {
00452
00453 for (i = 0; i < 64; i++)
00454 block1[i] = av_lfg_get(&prng) % 256 - 128;
00455 block1[0] += 1024;
00456
00457 for (i = 0; i < 64; i++)
00458 block[i] = block1[i];
00459 idct248_ref(img_dest1, 8, block);
00460
00461 for (i = 0; i < 64; i++)
00462 block[i] = block1[i];
00463 idct248_put(img_dest, 8, block);
00464
00465 for (i = 0; i < 64; i++) {
00466 v = abs((int) img_dest[i] - (int) img_dest1[i]);
00467 if (v == 255)
00468 printf("%d %d\n", img_dest[i], img_dest1[i]);
00469 if (v > err_max)
00470 err_max = v;
00471 }
00472 #if 0
00473 printf("ref=\n");
00474 for(i=0;i<8;i++) {
00475 int j;
00476 for(j=0;j<8;j++) {
00477 printf(" %3d", img_dest1[i*8+j]);
00478 }
00479 printf("\n");
00480 }
00481
00482 printf("out=\n");
00483 for(i=0;i<8;i++) {
00484 int j;
00485 for(j=0;j<8;j++) {
00486 printf(" %3d", img_dest[i*8+j]);
00487 }
00488 printf("\n");
00489 }
00490 #endif
00491 }
00492 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
00493
00494 if (!speed)
00495 return;
00496
00497 ti = av_gettime();
00498 it1 = 0;
00499 do {
00500 for (it = 0; it < NB_ITS_SPEED; it++) {
00501 for (i = 0; i < 64; i++)
00502 block[i] = block1[i];
00503 idct248_put(img_dest, 8, block);
00504 }
00505 emms_c();
00506 it1 += NB_ITS_SPEED;
00507 ti1 = av_gettime() - ti;
00508 } while (ti1 < 1000000);
00509
00510 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
00511 (double) it1 * 1000.0 / (double) ti1);
00512 }
00513
/** Print the command line usage summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>] [<bits>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "bits Number of time domain bits to use, 8 is default\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    printf("%s", usage);
}
00525
00526 #if !HAVE_GETOPT
00527 #include "compat/getopt.c"
00528 #endif
00529
00530 int main(int argc, char **argv)
00531 {
00532 int test_idct = 0, test_248_dct = 0;
00533 int c, i;
00534 int test = 1;
00535 int speed = 0;
00536 int err = 0;
00537 int bits=8;
00538
00539 cpu_flags = av_get_cpu_flags();
00540
00541 ff_ref_dct_init();
00542 idct_mmx_init();
00543
00544 for (;;) {
00545 c = getopt(argc, argv, "ih4t");
00546 if (c == -1)
00547 break;
00548 switch (c) {
00549 case 'i':
00550 test_idct = 1;
00551 break;
00552 case '4':
00553 test_248_dct = 1;
00554 break;
00555 case 't':
00556 speed = 1;
00557 break;
00558 default:
00559 case 'h':
00560 help();
00561 return 0;
00562 }
00563 }
00564
00565 if (optind < argc)
00566 test = atoi(argv[optind]);
00567 if(optind+1 < argc) bits= atoi(argv[optind+1]);
00568
00569 printf("ffmpeg DCT/IDCT test\n");
00570
00571 if (test_248_dct) {
00572 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
00573 } else {
00574 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
00575 for (i = 0; algos[i].name; i++)
00576 if (!(~cpu_flags & algos[i].mm_support)) {
00577 err |= dct_error(&algos[i], test, test_idct, speed, bits);
00578 }
00579 }
00580
00581 return err;
00582 }