00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00028 #include <stdlib.h>
00029 #include <stdio.h>
00030 #include <string.h>
00031 #include <sys/time.h>
00032 #include <unistd.h>
00033 #include <math.h>
00034
00035 #include "libavutil/cpu.h"
00036 #include "libavutil/common.h"
00037 #include "libavutil/lfg.h"
00038
00039 #include "simple_idct.h"
00040 #include "aandcttab.h"
00041 #include "faandct.h"
00042 #include "faanidct.h"
00043 #include "x86/idct_xvid.h"
00044 #include "dctref.h"
00045
00046 #undef printf
00047
00048 void ff_mmx_idct(DCTELEM *data);
00049 void ff_mmxext_idct(DCTELEM *data);
00050
00051 void odivx_idct_c(short *block);
00052
00053
00054 void ff_bfin_idct(DCTELEM *block);
00055 void ff_bfin_fdct(DCTELEM *block);
00056
00057
00058 void fdct_altivec(DCTELEM *block);
00059
00060
00061
00062 void ff_j_rev_dct_arm(DCTELEM *data);
00063 void ff_simple_idct_arm(DCTELEM *data);
00064 void ff_simple_idct_armv5te(DCTELEM *data);
00065 void ff_simple_idct_armv6(DCTELEM *data);
00066 void ff_simple_idct_neon(DCTELEM *data);
00067
00068 void ff_simple_idct_axp(DCTELEM *data);
00069
00070 struct algo {
00071 const char *name;
00072 void (*func)(DCTELEM *block);
00073 enum formattag { NO_PERM, MMX_PERM, MMX_SIMPLE_PERM, SCALE_PERM,
00074 SSE2_PERM, PARTTRANS_PERM, TRANSPOSE_PERM } format;
00075 int mm_support;
00076 int nonspec;
00077 };
00078
00079 #ifndef FAAN_POSTSCALE
00080 #define FAAN_SCALE SCALE_PERM
00081 #else
00082 #define FAAN_SCALE NO_PERM
00083 #endif
00084
00085 static int cpu_flags;
00086
00087 static const struct algo fdct_tab[] = {
00088 { "REF-DBL", ff_ref_fdct, NO_PERM },
00089 { "FAAN", ff_faandct, FAAN_SCALE },
00090 { "IJG-AAN-INT", fdct_ifast, SCALE_PERM },
00091 { "IJG-LLM-INT", ff_jpeg_fdct_islow_8, NO_PERM },
00092
00093 #if HAVE_MMX
00094 { "MMX", ff_fdct_mmx, NO_PERM, AV_CPU_FLAG_MMX },
00095 { "MMX2", ff_fdct_mmx2, NO_PERM, AV_CPU_FLAG_MMX2 },
00096 { "SSE2", ff_fdct_sse2, NO_PERM, AV_CPU_FLAG_SSE2 },
00097 #endif
00098
00099 #if HAVE_ALTIVEC
00100 { "altivecfdct", fdct_altivec, NO_PERM, AV_CPU_FLAG_ALTIVEC },
00101 #endif
00102
00103 #if ARCH_BFIN
00104 { "BFINfdct", ff_bfin_fdct, NO_PERM },
00105 #endif
00106
00107 { 0 }
00108 };
00109
#if HAVE_MMX
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
                                DCTELEM *block, int16_t *qmat);

/**
 * Adapter that gives ff_prores_idct_put_10_sse2() the in-place
 * void (*)(DCTELEM *) signature stored in struct algo.
 *
 * The 64 input coefficients are copied to a scratch block, every entry of
 * the quantisation matrix is set to 4, and the routine writes its result
 * back over the caller's buffer with a line size of 16.
 *
 * The parameter is declared as DCTELEM * so that the function's type
 * matches the table's function-pointer type; calling it through a pointer
 * to an incompatible function type is undefined behaviour.
 *
 * @param dst  64 coefficients on entry, overwritten with the output.
 */
static void ff_prores_idct_put_10_sse2_wrap(DCTELEM *dst)
{
    /* NOTE(review): the SSE2 routine presumably wants 16-byte aligned
     * operands; plain stack arrays do not guarantee that, so the scratch
     * buffers are declared with explicit alignment. */
    DECLARE_ALIGNED(16, static int16_t, qmat)[64];
    DECLARE_ALIGNED(16, static DCTELEM, tmp)[64];
    int i;

    for (i = 0; i < 64; i++) {
        qmat[i] = 4;
        tmp[i]  = dst[i];
    }
    ff_prores_idct_put_10_sse2((uint16_t *) dst, 16, tmp, qmat);
}
#endif
00125
00126 static const struct algo idct_tab[] = {
00127 { "FAANI", ff_faanidct, NO_PERM },
00128 { "REF-DBL", ff_ref_idct, NO_PERM },
00129 { "INT", j_rev_dct, MMX_PERM },
00130 { "SIMPLE-C", ff_simple_idct_8, NO_PERM },
00131
00132 #if HAVE_MMX
00133 #if CONFIG_GPL
00134 { "LIBMPEG2-MMX", ff_mmx_idct, MMX_PERM, AV_CPU_FLAG_MMX, 1 },
00135 { "LIBMPEG2-MMX2", ff_mmxext_idct, MMX_PERM, AV_CPU_FLAG_MMX2, 1 },
00136 #endif
00137 { "SIMPLE-MMX", ff_simple_idct_mmx, MMX_SIMPLE_PERM, AV_CPU_FLAG_MMX },
00138 { "XVID-MMX", ff_idct_xvid_mmx, NO_PERM, AV_CPU_FLAG_MMX, 1 },
00139 { "XVID-MMX2", ff_idct_xvid_mmx2, NO_PERM, AV_CPU_FLAG_MMX2, 1 },
00140 { "XVID-SSE2", ff_idct_xvid_sse2, SSE2_PERM, AV_CPU_FLAG_SSE2, 1 },
00141 #if ARCH_X86_64
00142 { "PR-SSE2", ff_prores_idct_put_10_sse2_wrap, TRANSPOSE_PERM, AV_CPU_FLAG_SSE2, 1 },
00143 #endif
00144 #endif
00145
00146 #if ARCH_BFIN
00147 { "BFINidct", ff_bfin_idct, NO_PERM },
00148 #endif
00149
00150 #if ARCH_ARM
00151 { "SIMPLE-ARM", ff_simple_idct_arm, NO_PERM },
00152 { "INT-ARM", ff_j_rev_dct_arm, MMX_PERM },
00153 #endif
00154 #if HAVE_ARMV5TE
00155 { "SIMPLE-ARMV5TE", ff_simple_idct_armv5te,NO_PERM },
00156 #endif
00157 #if HAVE_ARMV6
00158 { "SIMPLE-ARMV6", ff_simple_idct_armv6, MMX_PERM },
00159 #endif
00160 #if HAVE_NEON
00161 { "SIMPLE-NEON", ff_simple_idct_neon, PARTTRANS_PERM },
00162 #endif
00163
00164 #if ARCH_ALPHA
00165 { "SIMPLE-ALPHA", ff_simple_idct_axp, NO_PERM },
00166 #endif
00167
00168 { 0 }
00169 };
00170
00171 #define AANSCALE_BITS 12
00172
00173 static uint8_t cropTbl[256 + 2 * MAX_NEG_CROP];
00174
/**
 * Current time of day as reported by gettimeofday().
 *
 * @return seconds * 1000000 + microseconds, as a 64-bit value.
 */
static int64_t gettime(void)
{
    struct timeval now;
    int64_t usec;

    gettimeofday(&now, NULL);
    usec  = (int64_t) now.tv_sec * 1000000;
    usec += now.tv_usec;
    return usec;
}

/* Number of blocks processed by each accuracy measurement. */
#define NB_ITS 20000
/* Number of calls between two clock reads in the speed loops. */
#define NB_ITS_SPEED 50000
00184
/* Destination index for each source index under MMX_PERM;
 * computed at start-up by idct_mmx_init(). */
static short idct_mmx_perm[64];

/* Destination index for each source index under MMX_SIMPLE_PERM. */
static short idct_simple_mmx_perm[64] = {
    0x00, 0x08, 0x04, 0x09,   0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19,   0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29,   0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B,   0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B,   0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39,   0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B,   0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B,   0x33, 0x3E, 0x37, 0x3F,
};

/* Destination column for each source column under SSE2_PERM
 * (the row bits of the index are left untouched by permute()). */
static const uint8_t idct_sse2_row_perm[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };

/**
 * Fill idct_mmx_perm[]: the row bits (0x38) of every index are kept, the
 * two upper column bits move down by one position and the lowest column
 * bit moves up to bit 2.
 */
static void idct_mmx_init(void)
{
    int src;

    for (src = 0; src < 64; src++) {
        const int row_bits  = src & 0x38;
        const int upper_col = (src & 6) >> 1;
        const int lowest    = (src & 1) << 2;

        idct_mmx_perm[src] = row_bits | upper_col | lowest;
    }
}
00209
00210 DECLARE_ALIGNED(16, static DCTELEM, block)[64];
00211 DECLARE_ALIGNED(8, static DCTELEM, block1)[64];
00212
00213 static inline void mmx_emms(void)
00214 {
00215 #if HAVE_MMX
00216 if (cpu_flags & AV_CPU_FLAG_MMX)
00217 __asm__ volatile ("emms\n\t");
00218 #endif
00219 }
00220
00221 static void init_block(DCTELEM block[64], int test, int is_idct, AVLFG *prng, int vals)
00222 {
00223 int i, j;
00224
00225 memset(block, 0, 64 * sizeof(*block));
00226
00227 switch (test) {
00228 case 0:
00229 for (i = 0; i < 64; i++)
00230 block[i] = (av_lfg_get(prng) % (2*vals)) -vals;
00231 if (is_idct) {
00232 ff_ref_fdct(block);
00233 for (i = 0; i < 64; i++)
00234 block[i] >>= 3;
00235 }
00236 break;
00237 case 1:
00238 j = av_lfg_get(prng) % 10 + 1;
00239 for (i = 0; i < j; i++)
00240 block[av_lfg_get(prng) % 64] = av_lfg_get(prng) % (2*vals) -vals;
00241 break;
00242 case 2:
00243 block[ 0] = av_lfg_get(prng) % (16*vals) - (8*vals);
00244 block[63] = (block[0] & 1) ^ 1;
00245 break;
00246 }
00247 }
00248
00249 static void permute(DCTELEM dst[64], const DCTELEM src[64], int perm)
00250 {
00251 int i;
00252
00253 if (perm == MMX_PERM) {
00254 for (i = 0; i < 64; i++)
00255 dst[idct_mmx_perm[i]] = src[i];
00256 } else if (perm == MMX_SIMPLE_PERM) {
00257 for (i = 0; i < 64; i++)
00258 dst[idct_simple_mmx_perm[i]] = src[i];
00259 } else if (perm == SSE2_PERM) {
00260 for (i = 0; i < 64; i++)
00261 dst[(i & 0x38) | idct_sse2_row_perm[i & 7]] = src[i];
00262 } else if (perm == PARTTRANS_PERM) {
00263 for (i = 0; i < 64; i++)
00264 dst[(i & 0x24) | ((i & 3) << 3) | ((i >> 3) & 3)] = src[i];
00265 } else if (perm == TRANSPOSE_PERM) {
00266 for (i = 0; i < 64; i++)
00267 dst[(i>>3) | ((i<<3)&0x38)] = src[i];
00268 } else {
00269 for (i = 0; i < 64; i++)
00270 dst[i] = src[i];
00271 }
00272 }
00273
00274 static int dct_error(const struct algo *dct, int test, int is_idct, int speed, const int bits)
00275 {
00276 void (*ref)(DCTELEM *block) = is_idct ? ff_ref_idct : ff_ref_fdct;
00277 int it, i, scale;
00278 int err_inf, v;
00279 int64_t err2, ti, ti1, it1, err_sum = 0;
00280 int64_t sysErr[64], sysErrMax = 0;
00281 int maxout = 0;
00282 int blockSumErrMax = 0, blockSumErr;
00283 AVLFG prng;
00284 const int vals=1<<bits;
00285 double omse, ome;
00286 int spec_err;
00287
00288 av_lfg_init(&prng, 1);
00289
00290 err_inf = 0;
00291 err2 = 0;
00292 for (i = 0; i < 64; i++)
00293 sysErr[i] = 0;
00294 for (it = 0; it < NB_ITS; it++) {
00295 init_block(block1, test, is_idct, &prng, vals);
00296 permute(block, block1, dct->format);
00297
00298 dct->func(block);
00299 mmx_emms();
00300
00301 if (dct->format == SCALE_PERM) {
00302 for (i = 0; i < 64; i++) {
00303 scale = 8 * (1 << (AANSCALE_BITS + 11)) / ff_aanscales[i];
00304 block[i] = (block[i] * scale) >> AANSCALE_BITS;
00305 }
00306 }
00307
00308 ref(block1);
00309
00310 blockSumErr = 0;
00311 for (i = 0; i < 64; i++) {
00312 int err = block[i] - block1[i];
00313 err_sum += err;
00314 v = abs(err);
00315 if (v > err_inf)
00316 err_inf = v;
00317 err2 += v * v;
00318 sysErr[i] += block[i] - block1[i];
00319 blockSumErr += v;
00320 if (abs(block[i]) > maxout)
00321 maxout = abs(block[i]);
00322 }
00323 if (blockSumErrMax < blockSumErr)
00324 blockSumErrMax = blockSumErr;
00325 }
00326 for (i = 0; i < 64; i++)
00327 sysErrMax = FFMAX(sysErrMax, FFABS(sysErr[i]));
00328
00329 for (i = 0; i < 64; i++) {
00330 if (i % 8 == 0)
00331 printf("\n");
00332 printf("%7d ", (int) sysErr[i]);
00333 }
00334 printf("\n");
00335
00336 omse = (double) err2 / NB_ITS / 64;
00337 ome = (double) err_sum / NB_ITS / 64;
00338
00339 spec_err = is_idct && (err_inf > 1 || omse > 0.02 || fabs(ome) > 0.0015);
00340
00341 printf("%s %s: max_err=%d omse=%0.8f ome=%0.8f syserr=%0.8f maxout=%d blockSumErr=%d\n",
00342 is_idct ? "IDCT" : "DCT", dct->name, err_inf,
00343 omse, ome, (double) sysErrMax / NB_ITS,
00344 maxout, blockSumErrMax);
00345
00346 if (spec_err && !dct->nonspec)
00347 return 1;
00348
00349 if (!speed)
00350 return 0;
00351
00352
00353
00354 init_block(block, test, is_idct, &prng, vals);
00355 permute(block1, block, dct->format);
00356
00357 ti = gettime();
00358 it1 = 0;
00359 do {
00360 for (it = 0; it < NB_ITS_SPEED; it++) {
00361 memcpy(block, block1, sizeof(block));
00362 dct->func(block);
00363 }
00364 it1 += NB_ITS_SPEED;
00365 ti1 = gettime() - ti;
00366 } while (ti1 < 1000000);
00367 mmx_emms();
00368
00369 printf("%s %s: %0.1f kdct/s\n", is_idct ? "IDCT" : "DCT", dct->name,
00370 (double) it1 * 1000.0 / (double) ti1);
00371
00372 return 0;
00373 }
00374
00375 DECLARE_ALIGNED(8, static uint8_t, img_dest)[64];
00376 DECLARE_ALIGNED(8, static uint8_t, img_dest1)[64];
00377
00378 static void idct248_ref(uint8_t *dest, int linesize, int16_t *block)
00379 {
00380 static int init;
00381 static double c8[8][8];
00382 static double c4[4][4];
00383 double block1[64], block2[64], block3[64];
00384 double s, sum, v;
00385 int i, j, k;
00386
00387 if (!init) {
00388 init = 1;
00389
00390 for (i = 0; i < 8; i++) {
00391 sum = 0;
00392 for (j = 0; j < 8; j++) {
00393 s = (i == 0) ? sqrt(1.0 / 8.0) : sqrt(1.0 / 4.0);
00394 c8[i][j] = s * cos(M_PI * i * (j + 0.5) / 8.0);
00395 sum += c8[i][j] * c8[i][j];
00396 }
00397 }
00398
00399 for (i = 0; i < 4; i++) {
00400 sum = 0;
00401 for (j = 0; j < 4; j++) {
00402 s = (i == 0) ? sqrt(1.0 / 4.0) : sqrt(1.0 / 2.0);
00403 c4[i][j] = s * cos(M_PI * i * (j + 0.5) / 4.0);
00404 sum += c4[i][j] * c4[i][j];
00405 }
00406 }
00407 }
00408
00409
00410 s = 0.5 * sqrt(2.0);
00411 for (i = 0; i < 4; i++) {
00412 for (j = 0; j < 8; j++) {
00413 block1[8 * (2 * i) + j] =
00414 (block[8 * (2 * i) + j] + block[8 * (2 * i + 1) + j]) * s;
00415 block1[8 * (2 * i + 1) + j] =
00416 (block[8 * (2 * i) + j] - block[8 * (2 * i + 1) + j]) * s;
00417 }
00418 }
00419
00420
00421 for (i = 0; i < 8; i++) {
00422 for (j = 0; j < 8; j++) {
00423 sum = 0;
00424 for (k = 0; k < 8; k++)
00425 sum += c8[k][j] * block1[8 * i + k];
00426 block2[8 * i + j] = sum;
00427 }
00428 }
00429
00430
00431 for (i = 0; i < 8; i++) {
00432 for (j = 0; j < 4; j++) {
00433
00434 sum = 0;
00435 for (k = 0; k < 4; k++)
00436 sum += c4[k][j] * block2[8 * (2 * k) + i];
00437 block3[8 * (2 * j) + i] = sum;
00438
00439
00440 sum = 0;
00441 for (k = 0; k < 4; k++)
00442 sum += c4[k][j] * block2[8 * (2 * k + 1) + i];
00443 block3[8 * (2 * j + 1) + i] = sum;
00444 }
00445 }
00446
00447
00448 for (i = 0; i < 8; i++) {
00449 for (j = 0; j < 8; j++) {
00450 v = block3[8 * i + j];
00451 if (v < 0) v = 0;
00452 else if (v > 255) v = 255;
00453 dest[i * linesize + j] = (int) rint(v);
00454 }
00455 }
00456 }
00457
00458 static void idct248_error(const char *name,
00459 void (*idct248_put)(uint8_t *dest, int line_size,
00460 int16_t *block),
00461 int speed)
00462 {
00463 int it, i, it1, ti, ti1, err_max, v;
00464 AVLFG prng;
00465
00466 av_lfg_init(&prng, 1);
00467
00468
00469
00470 err_max = 0;
00471 for (it = 0; it < NB_ITS; it++) {
00472
00473 for (i = 0; i < 64; i++)
00474 block1[i] = av_lfg_get(&prng) % 256 - 128;
00475 block1[0] += 1024;
00476
00477 for (i = 0; i < 64; i++)
00478 block[i] = block1[i];
00479 idct248_ref(img_dest1, 8, block);
00480
00481 for (i = 0; i < 64; i++)
00482 block[i] = block1[i];
00483 idct248_put(img_dest, 8, block);
00484
00485 for (i = 0; i < 64; i++) {
00486 v = abs((int) img_dest[i] - (int) img_dest1[i]);
00487 if (v == 255)
00488 printf("%d %d\n", img_dest[i], img_dest1[i]);
00489 if (v > err_max)
00490 err_max = v;
00491 }
00492 #if 0
00493 printf("ref=\n");
00494 for(i=0;i<8;i++) {
00495 int j;
00496 for(j=0;j<8;j++) {
00497 printf(" %3d", img_dest1[i*8+j]);
00498 }
00499 printf("\n");
00500 }
00501
00502 printf("out=\n");
00503 for(i=0;i<8;i++) {
00504 int j;
00505 for(j=0;j<8;j++) {
00506 printf(" %3d", img_dest[i*8+j]);
00507 }
00508 printf("\n");
00509 }
00510 #endif
00511 }
00512 printf("%s %s: err_inf=%d\n", 1 ? "IDCT248" : "DCT248", name, err_max);
00513
00514 if (!speed)
00515 return;
00516
00517 ti = gettime();
00518 it1 = 0;
00519 do {
00520 for (it = 0; it < NB_ITS_SPEED; it++) {
00521 for (i = 0; i < 64; i++)
00522 block[i] = block1[i];
00523 idct248_put(img_dest, 8, block);
00524 }
00525 it1 += NB_ITS_SPEED;
00526 ti1 = gettime() - ti;
00527 } while (ti1 < 1000000);
00528 mmx_emms();
00529
00530 printf("%s %s: %0.1f kdct/s\n", 1 ? "IDCT248" : "DCT248", name,
00531 (double) it1 * 1000.0 / (double) ti1);
00532 }
00533
/* Print the command-line summary to standard output. */
static void help(void)
{
    static const char usage[] =
        "dct-test [-i] [<test-number>] [<bits>]\n"
        "test-number 0 -> test with random matrixes\n"
        " 1 -> test with random sparse matrixes\n"
        " 2 -> do 3. test from mpeg4 std\n"
        "bits Number of time domain bits to use, 8 is default\n"
        "-i test IDCT implementations\n"
        "-4 test IDCT248 implementations\n"
        "-t speed test\n";

    printf("%s", usage);
}
00545
00546 int main(int argc, char **argv)
00547 {
00548 int test_idct = 0, test_248_dct = 0;
00549 int c, i;
00550 int test = 1;
00551 int speed = 0;
00552 int err = 0;
00553 int bits=8;
00554
00555 cpu_flags = av_get_cpu_flags();
00556
00557 ff_ref_dct_init();
00558 idct_mmx_init();
00559
00560 for (i = 0; i < 256; i++)
00561 cropTbl[i + MAX_NEG_CROP] = i;
00562 for (i = 0; i < MAX_NEG_CROP; i++) {
00563 cropTbl[i] = 0;
00564 cropTbl[i + MAX_NEG_CROP + 256] = 255;
00565 }
00566
00567 for (;;) {
00568 c = getopt(argc, argv, "ih4t");
00569 if (c == -1)
00570 break;
00571 switch (c) {
00572 case 'i':
00573 test_idct = 1;
00574 break;
00575 case '4':
00576 test_248_dct = 1;
00577 break;
00578 case 't':
00579 speed = 1;
00580 break;
00581 default:
00582 case 'h':
00583 help();
00584 return 0;
00585 }
00586 }
00587
00588 if (optind < argc)
00589 test = atoi(argv[optind]);
00590 if(optind+1 < argc) bits= atoi(argv[optind+1]);
00591
00592 printf("ffmpeg DCT/IDCT test\n");
00593
00594 if (test_248_dct) {
00595 idct248_error("SIMPLE-C", ff_simple_idct248_put, speed);
00596 } else {
00597 const struct algo *algos = test_idct ? idct_tab : fdct_tab;
00598 for (i = 0; algos[i].name; i++)
00599 if (!(~cpu_flags & algos[i].mm_support)) {
00600 err |= dct_error(&algos[i], test, test_idct, speed, bits);
00601 }
00602 }
00603
00604 return err;
00605 }