[Ffmpeg-devel] int vs. float profiler, take 2

Gabriel Gerhardsson gabrielg
Sat May 21 10:59:30 CEST 2005


On Fri, 2005-05-20 at 17:05 -0600, Mike Melanson wrote:
> Hi,
> 	Since the first version of my little profiler generated a reasonable 
> amount of activity, attached is a slightly improved version. This one 
> does the following:
> 
> * runs all 4 of the functions n times as a cache warmup (n=1000 in the 
> code); this actually does help with cycle count consistency
> * fetches an overhead cycle count as a baseline
> * C code can fetch iteration count
> 
> The ASM code has ITERATIONS set to 1 right now. I would be interested to 
> know the results from varying CPUs using 1, 10, and 100 iterations.
> 
> 	Results from my VIA P3-class CPU:
> 
>    warming up with 1000 cycles...
> integer_adder(), 1 adds, 27 cycles used (overhead = 26)
> float_adder(), 1 adds, 27 cycles used (overhead = 26)
> integer_mult(), 1 mults, 34 cycles used (overhead = 28)
> float_mult(), 1 mults, 27 cycles used (overhead = 26)
> 
>    warming up with 1000 cycles...
> integer_adder(), 10 adds, 36 cycles used (overhead = 26)
> float_adder(), 10 adds, 36 cycles used (overhead = 26)
> integer_mult(), 10 mults, 115 cycles used (overhead = 28)
> float_mult(), 10 mults, 36 cycles used (overhead = 26)
> 
>    warming up with 1000 cycles...
> integer_adder(), 100 adds, 156 cycles used (overhead = 26)
> float_adder(), 100 adds, 576 cycles used (overhead = 26)
> integer_mult(), 100 mults, 925 cycles used (overhead = 28)
> float_mult(), 100 mults, 578 cycles used (overhead = 26)
> 
> The benchmark still suffers from the result dependency problem. But the 
> same problem comes up in the multimedia decoding algorithms, right? If 
> the multiplications were parallelizable, wouldn't we be using 
> SSE/SSE2/AltiVec instructions to parallelize them?
> 
> 	Thanks...

Hello

When timing only a few instructions like this, it's important to serialize
execution around the rdtsc instructions (the attached code does this with
cpuid). Otherwise rdtsc can be executed out of order relative to the
instructions being timed on any modern processor. Please try the attached
math.asm. Only the timing code has changed, not the timed instructions.
Please use this timing code in your next version of this benchmark.

/Gabriel
-------------- next part --------------
; NASM source, 32-bit x86. The functions below read their arguments from the
; stack ([ebp+8], [ebp+12]); the ":function" suffix marks each exported
; symbol as a function in ELF output.
global get_iterations:function
global integer_adder:function
global float_adder:function
global integer_mult:function
global float_mult:function

; Number of back-to-back operations placed inside each timed region.
; Defaults to 1; can be overridden without editing this file by defining the
; macro on the assembler command line, e.g. "nasm -DITERATIONS=100 ...".
%ifndef ITERATIONS
%define ITERATIONS 1
%endif

;-----------------------------------------------------------------------
; int get_iterations(void)
;
; Reports the assemble-time ITERATIONS constant, i.e. how many operations
; each profiling function places inside its timed region.
; Out:   eax = ITERATIONS
; Clobb: eax only
;-----------------------------------------------------------------------
get_iterations:
  mov    eax, ITERATIONS   ; result register = operation count
  ret

; int integer_adder(
;  unsigned int *cycle_count,
;  unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent "add reg, 5" instructions with rdtsc.
;   [ebp+8]  = cycle_count: receives the low 32 bits of the timestamp delta
;              measured around the adds
;   [ebp+12] = overhead_cycle_count: receives the same delta measured around
;              a lone push (the cost of the timing harness itself)
;   returns eax = the accumulated sum (5 * ITERATIONS)
;
; Every rdtsc is bracketed by cpuid to serialize execution. cpuid overwrites
; eax, ebx, ecx and edx, so the accumulator lives in esi: a value kept in
; ecx would be destroyed by the cpuid that precedes the timed adds and the
; function would return garbage instead of the sum.
integer_adder:
  push   ebp       ; set up context and save registers
  mov    ebp, esp
  push   ebx       ; clobbered by cpuid
  push   ecx       ; clobbered by cpuid
  push   edx       ; clobbered by cpuid and rdtsc
  push   esi       ; used as the accumulator below

  ; profile overhead cycle count
  xor    eax, eax
  cpuid            ; serialize before reading the counter
  rdtsc
  push   eax       ; save first timestamp (low 32 bits)
  xor    eax, eax
  cpuid            ; serialize after reading the counter

  push   esi       ; operation, just the push here since we're measuring overhead

  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save second timestamp across the cpuid below
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ecx       ; discard the placeholder pushed as the "operation"
  pop    ebx       ; ebx = first timestamp
  sub    eax, ebx  ; overhead cycles
  mov    ebx, [ebp+12]
  mov    [ebx], eax


  xor    esi, esi  ; esi will serve as the accumulator (survives cpuid)

  ; first timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save
  xor    eax, eax
  cpuid

  times ITERATIONS  add  esi, 5  ; perform n additions
  push   esi       ; save result

  ; second timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ecx       ; ecx = sum saved before the second timestamp
  pop    ebx       ; ebx = first timestamp


  sub    eax, ebx  ; calculate the cycles elapsed
  mov    ebx, [ebp+8]  ; load the address of the cycle count parameter
  mov    [ebx], eax  ; save the cycle count

  mov    eax, ecx  ; return the sum through eax

  pop    esi       ; restore the CPU state
  pop    edx
  pop    ecx
  pop    ebx
  pop    ebp
  ret

; double float_adder(unsigned int *cycle_count,
;  unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent x87 "fadd" instructions with rdtsc.
;   [ebp+8]  = cycle_count: receives the low 32 bits of the timestamp delta
;              measured around the fadds
;   [ebp+12] = overhead_cycle_count: receives the same delta measured around
;              an empty region (the cost of the timing harness itself)
;   returns st0 = the accumulated sum (0.0 + 5.0 * ITERATIONS)
;
; Every rdtsc is bracketed by cpuid to serialize execution. cpuid overwrites
; eax, ebx, ecx and edx, all of which are saved on entry; it does not touch
; the x87 register stack, so the operands can be loaded before the first
; timestamp.
float_adder:
  push   ebp       ; set up context and save registers
  mov    ebp, esp
  push   eax
  push   ebx
  push   ecx
  push   edx

  ; profile overhead cycle count
  xor    eax, eax
  cpuid            ; serialize before reading the counter
  rdtsc
  push   eax       ; save first timestamp (low 32 bits)
  xor    eax, eax
  cpuid            ; serialize after reading the counter

  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save second timestamp across the cpuid below
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ebx       ; ebx = first timestamp
  sub    eax, ebx  ; overhead cycles
  mov    ebx, [ebp+12]
  mov    [ebx], eax


  fld1             ; push 1 on the stack
  fld1             ; push 1 on the stack
  times 4  fadd  ST1  ; turn 1 into 5 (st0 = 5, st1 = 1)
  fldz             ; push zero on the stack (st0 = 0, st1 = 5, st2 = 1)

  ; first timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax   ; save
  xor    eax, eax
  cpuid

  times ITERATIONS  fadd  ST1  ; perform n float adds (st0 += st1)

  ; second timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ebx       ; ebx = first timestamp

  sub    eax, ebx  ; calculate the cycles elapsed
  mov    ebx, [ebp+8]  ; load the address of the cycle count parameter
  mov    [ebx], eax

  ; Leave only the return value on the x87 stack. Without this, every call
  ; leaves two extra registers occupied (the 5 and the 1) and the 8-entry
  ; x87 stack overflows after a few calls, turning later loads into NaNs.
  fstp   ST1       ; st0 = sum, st1 = 1   (drops the 5)
  fstp   ST1       ; st0 = sum            (drops the 1)

  pop    edx       ; restore the CPU state
  pop    ecx
  pop    ebx
  pop    eax
  pop    ebp
  ret

; int integer_mult(unsigned int *cycle_count,
;  unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent unsigned "mul" instructions with rdtsc.
;   [ebp+8]  = cycle_count: receives the low 32 bits of the timestamp delta
;              measured around the multiplications
;   [ebp+12] = overhead_cycle_count: receives the same delta measured around
;              a lone push (the cost of the timing harness itself)
;   returns eax = low 32 bits of the product (5 ** ITERATIONS)
;
; Every rdtsc is bracketed by cpuid to serialize execution. cpuid overwrites
; eax, ebx, ecx and edx, so the multiplier lives in esi: a multiplier kept in
; ecx would be destroyed by the cpuid that precedes the timed region and the
; benchmark would multiply by whatever cpuid left there.
integer_mult:
  push   ebp       ; set up context and save registers
  mov    ebp, esp
  push   ebx       ; clobbered by cpuid
  push   ecx       ; clobbered by cpuid
  push   edx       ; clobbered by cpuid, rdtsc and mul
  push   esi       ; used as the multiplier below

  ; profile overhead cycle count
  xor    eax, eax
  cpuid            ; serialize before reading the counter
  rdtsc
  push   eax       ; save first timestamp (low 32 bits)
  xor    eax, eax
  cpuid            ; serialize after reading the counter

  push   ecx       ; operation, just the push here since we're measuring overhead

  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save second timestamp across the cpuid below
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ecx       ; discard the placeholder pushed as the "operation"
  pop    ebx       ; ebx = first timestamp
  sub    eax, ebx  ; overhead cycles
  mov    ebx, [ebp+12]
  mov    [ebx], eax


  mov    esi, 5    ; esi holds the multiplier (survives cpuid)

  ; first timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save
  xor    eax, eax
  cpuid

  mov    eax, 1    ; set up the base for multiplication
  times ITERATIONS  mul  esi  ; perform n int mults (edx:eax = eax * esi)
  push   eax       ; save result (low 32 bits of the product)

  ; second timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ecx       ; ecx = product saved before the second timestamp
  pop    ebx       ; ebx = first timestamp

  sub    eax, ebx  ; calculate the cycles elapsed
  mov    ebx, [ebp+8]  ; load the address of the cycle count parameter
  mov    [ebx], eax

  mov    eax, ecx  ; return the product through eax

  pop    esi       ; restore the CPU state
  pop    edx
  pop    ecx
  pop    ebx
  pop    ebp
  ret

; double float_mult(unsigned int *cycle_count,
;  unsigned int *overhead_cycle_count);
;
; Times ITERATIONS dependent x87 "fmul" instructions with rdtsc.
;   [ebp+8]  = cycle_count: receives the low 32 bits of the timestamp delta
;              measured around the fmuls
;   [ebp+12] = overhead_cycle_count: receives the same delta measured around
;              an empty region (the cost of the timing harness itself)
;   returns st0 = the product (1.0 * 5.0 ** ITERATIONS)
;
; Every rdtsc is bracketed by cpuid to serialize execution. cpuid overwrites
; eax, ebx, ecx and edx, all of which are saved on entry; it does not touch
; the x87 register stack, so the operands can be loaded before the first
; timestamp.
float_mult:
  push   ebp       ; set up context and save registers
  mov    ebp, esp
  push   eax
  push   ebx
  push   ecx
  push   edx

  ; profile overhead cycle count
  xor    eax, eax
  cpuid            ; serialize before reading the counter
  rdtsc
  push   eax       ; save first timestamp (low 32 bits)
  xor    eax, eax
  cpuid            ; serialize after reading the counter

  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save second timestamp across the cpuid below
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ebx       ; ebx = first timestamp
  sub    eax, ebx  ; overhead cycles
  mov    ebx, [ebp+12]
  mov    [ebx], eax


  fld1             ; push 1 on the stack
  fld1             ; push 1 on the stack
  times 4  fadd  ST1  ; turn 1 into 5 (use as the multiplier)
  fld1             ; push 1 again (base for multiplication)
                   ; now st0 = 1 (base), st1 = 5 (multiplier), st2 = 1

  ; first timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax       ; save
  xor    eax, eax
  cpuid

  times ITERATIONS  fmul  ST1  ; perform n float ops (ST0 *= ST1)

  ; second timestamp
  xor    eax, eax
  cpuid
  rdtsc
  push   eax
  xor    eax, eax
  cpuid
  pop    eax       ; eax = second timestamp
  pop    ebx       ; ebx = first timestamp

  sub    eax, ebx  ; calculate the cycles elapsed
  mov    ebx, [ebp+8]  ; load the address of the cycle count parameter
  mov    [ebx], eax

  ; Leave only the return value on the x87 stack. Without this, every call
  ; leaves two extra registers occupied (the 5 and the 1) and the 8-entry
  ; x87 stack overflows after a few calls, turning later loads into NaNs.
  fstp   ST1       ; st0 = product, st1 = 1   (drops the 5)
  fstp   ST1       ; st0 = product            (drops the 1)

  pop    edx       ; restore the CPU state
  pop    ecx
  pop    ebx
  pop    eax
  pop    ebp
  ret



More information about the ffmpeg-devel mailing list