[FFmpeg-devel] [PATCH] VC-1 MMX DSP functions

Zuxy Meng zuxy.meng
Sun Jul 8 17:05:07 CEST 2007


Hi,

2007/7/8, Christophe GISQUET <christophe.gisquet at free.fr>:
> Hello,
>
> Zuxy Meng a écrit :
> > I did a quick test on 64-bit K8 tonight thanks to Stephan's testbed.
>
> And myself on a x86-64 core2 system.
>
> > The result wasn't promising. In short, from fastest to slowest:
> > MMX > SSE2 w/o sw pipelining > SSE2 w/ sw pipelining
>
> I haven't tested the mid-performer, but I can confirm this. Using
> START/STOP_TIMER, the figures are (on a 1080p sequence): ~2800
> dezicycles for MMX, ~3800 for SSE2.

I suspect something is wrong here. IIRC 32-bit SSE2 (w/ sw pipelining)
is faster than MMX on your Conroe. How can it be more than 25% slower
under 64-bit?
>
> > So the conclusion is that I can't draw a conclusion. Any suggestions?
>
> Maybe have a look at the attached opannotate (based on 4 runs) for your
> s/w pipelined SSE2 functions?
>
> The 1/4 and 3/4 seem well pipelined, with only the output that's costly.
> However, if opannotate is to be believed (because some timings are very
> surprising), the 1/2 gets quite a lot of stalls, probably up to the
> point where they account for most of the execution time.
>
> Best regards,
> --
> Christophe GISQUET
>
> /*
>  * Command line: opannotate -a -i vc1_put_shift1_sse2,vc1_put_shift2_sse2,vc1_put_shift3_sse2
>  *
>  * Interpretation of command line:
>  * Output annotated assembly listing with samples
>  *
>  * CPU: Core 2, speed 2167 MHz (estimated)
>  * Counted CPU_CLK_UNHALTED events (Clock cycles when not halted) with a unit mask of 0x00 (Unhalted core cycles) count 100000
>  */
>               :
>               :/home/chris/src/ffmpeg/ffmpeg_g:     file format elf64-x86-64
>               :
>               :Disassembly of section .init:
>               :Disassembly of section .plt:
>               :Disassembly of section .text:
>               :
> 000000000050c970 <vc1_put_shift2_sse2>: /* vc1_put_shift2_sse2 total:  30389 59.9874 */
>   311  0.6139 :  50c970:       mov    %r9d,0xfffffffffffffffc(%rsp)
>    96  0.1895 :  50c975:       mov    $0x8,%eax
>               :  50c97a:       movslq 0x8(%rsp),%r9
>   144  0.2843 :  50c97f:       sub    0xfffffffffffffffc(%rsp),%eax
>   195  0.3849 :  50c983:       movslq %ecx,%rcx
>               :  50c986:       movslq %esi,%rsi
>    83  0.1638 :  50c989:       sub    %r9,%rdx
>     6  0.0118 :  50c98c:       mov    %eax,0xfffffffffffffffc(%rsp)
>    85  0.1678 :  50c990:       lea    (%r9,%r9,2),%rax
>     1  0.0020 :  50c994:       movd   0xfffffffffffffffc(%rsp),%xmm7
>   386  0.7620 :  50c99a:       punpcklwd %xmm7,%xmm7
>   506  0.9988 :  50c99e:       pshufd $0x0,%xmm7,%xmm7
>   580  1.1449 :  50c9a3:       movq   (%rdx),%xmm11
>   306  0.6040 :  50c9a8:       movq   (%rdx,%r9,1),%xmm12
>   202  0.3987 :  50c9ae:       movq   (%rdx,%r9,2),%xmm5
>   201  0.3968 :  50c9b4:       movq   (%rdx,%rax,1),%xmm6
>   356  0.7027 :  50c9b9:       punpcklbw %xmm0,%xmm11
>    37  0.0730 :  50c9be:       punpcklbw %xmm0,%xmm12
>   131  0.2586 :  50c9c3:       punpcklbw %xmm0,%xmm5
>   151  0.2981 :  50c9c7:       punpcklbw %xmm0,%xmm6
>   156  0.3079 :  50c9cb:       nopl   0x0(%rax,%rax,1)
>  1170  2.3096 :  50c9d0:       add    %rcx,%rdx
>   128  0.2527 :  50c9d3:       movdqa %xmm11,%xmm1
>    24  0.0474 :  50c9d8:       movdqa %xmm12,%xmm2
>   152  0.3000 :  50c9dd:       movdqa %xmm5,%xmm3
>  1172  2.3135 :  50c9e1:       movdqa %xmm6,%xmm4
>    18  0.0355 :  50c9e5:       movq   (%rdx),%xmm11
>  4028  7.9512 :  50c9ea:       movq   (%rdx,%r9,1),%xmm12
>  2040  4.0269 :  50c9f0:       movq   (%rdx,%r9,2),%xmm5
>  3116  6.1509 :  50c9f6:       movq   (%rdx,%rax,1),%xmm6
>  2812  5.5508 :  50c9fb:       punpcklbw %xmm0,%xmm11
>  1291  2.5484 :  50ca00:       punpcklbw %xmm0,%xmm12
>   308  0.6080 :  50ca05:       punpcklbw %xmm0,%xmm5
>  1250  2.4675 :  50ca09:       punpcklbw %xmm0,%xmm6
>  1436  2.8346 :  50ca0d:       paddsw %xmm2,%xmm3
>    14  0.0276 :  50ca11:       paddsw %xmm1,%xmm4
>    92  0.1816 :  50ca15:       movdqa %xmm3,%xmm2
>   993  1.9602 :  50ca19:       psllw  $0x3,%xmm3
>   215  0.4244 :  50ca1e:       paddw  %xmm2,%xmm3
>    71  0.1402 :  50ca22:       psubsw %xmm4,%xmm3
>   170  0.3356 :  50ca26:       paddsw %xmm7,%xmm3
>  1113  2.1970 :  50ca2a:       psraw  $0x4,%xmm3
>   344  0.6791 :  50ca2f:       packuswb %xmm3,%xmm3
>  3386  6.6839 :  50ca33:       movq   %xmm3,(%rdi)
>  1034  2.0411 :  50ca37:       add    %rsi,%rdi
>     8  0.0158 :  50ca3a:       dec    %r8d
>    50  0.0987 :  50ca3d:       jne    50c9d0 <vc1_put_shift2_sse2+0x60>
>    21  0.0415 :  50ca3f:       retq
>               :Disassembly of section .fini:
>               :
>               :/home/chris/src/ffmpeg/ffmpeg_g:     file format elf64-x86-64
>               :
>               :Disassembly of section .init:
>               :Disassembly of section .plt:
>               :Disassembly of section .text:
>               :
> 000000000050c7c0 <vc1_put_shift1_sse2>: /* vc1_put_shift1_sse2 total:  10557 20.8393 */
>   169  0.3336 :  50c7c0:       mov    %r9d,0xfffffffffffffffc(%rsp)
>    61  0.1204 :  50c7c5:       mov    $0x20,%eax
>               :  50c7ca:       movslq 0x8(%rsp),%r9
>    62  0.1224 :  50c7cf:       sub    0xfffffffffffffffc(%rsp),%eax
>    74  0.1461 :  50c7d3:       movslq %ecx,%rcx
>               :  50c7d6:       movslq %esi,%rsi
>    26  0.0513 :  50c7d9:       sub    %r9,%rdx
>     2  0.0039 :  50c7dc:       mov    %eax,0xfffffffffffffffc(%rsp)
>    41  0.0809 :  50c7e0:       lea    (%r9,%r9,2),%rax
>               :  50c7e4:       movd   0xfffffffffffffffc(%rsp),%xmm7
>   191  0.3770 :  50c7ea:       punpcklwd %xmm7,%xmm7
>   242  0.4777 :  50c7ee:       pshufd $0x0,%xmm7,%xmm7
>   263  0.5192 :  50c7f3:       movq   (%rdx),%xmm11
>   205  0.4047 :  50c7f8:       movq   (%rdx,%r9,1),%xmm12
>    99  0.1954 :  50c7fe:       movq   (%rdx,%r9,2),%xmm5
>   129  0.2546 :  50c804:       movq   (%rdx,%rax,1),%xmm6
>   167  0.3297 :  50c809:       punpcklbw %xmm0,%xmm11
>    14  0.0276 :  50c80e:       punpcklbw %xmm0,%xmm12
>    78  0.1540 :  50c813:       punpcklbw %xmm0,%xmm5
>    80  0.1579 :  50c817:       punpcklbw %xmm0,%xmm6
>    69  0.1362 :  50c81b:       nopl   0x0(%rax,%rax,1)
>   553  1.0916 :  50c820:       add    %rcx,%rdx
>   112  0.2211 :  50c823:       movdqa %xmm11,%xmm1
>    18  0.0355 :  50c828:       movdqa %xmm12,%xmm2
>    72  0.1421 :  50c82d:       movdqa %xmm5,%xmm3
>   557  1.0995 :  50c831:       movdqa %xmm6,%xmm4
>    13  0.0257 :  50c835:       punpcklbw %xmm0,%xmm11
>   669  1.3206 :  50c83a:       punpcklbw %xmm0,%xmm12
>   100  0.1974 :  50c83f:       punpcklbw %xmm0,%xmm5
>   659  1.3009 :  50c843:       punpcklbw %xmm0,%xmm6
>   700  1.3818 :  50c847:       pmullw %xmm8,%xmm2
>    24  0.0474 :  50c84c:       psllw  $0x2,%xmm1
>    38  0.0750 :  50c851:       pmullw %xmm9,%xmm3
>   586  1.1568 :  50c856:       psubsw %xmm4,%xmm2
>    34  0.0671 :  50c85a:       paddsw %xmm4,%xmm1
>    29  0.0572 :  50c85e:       psubsw %xmm4,%xmm3
>    59  0.1165 :  50c862:       psubsw %xmm1,%xmm2
>   606  1.1962 :  50c866:       paddsw %xmm2,%xmm3
>   183  0.3612 :  50c86a:       paddsw %xmm7,%xmm3
>   325  0.6415 :  50c86e:       psraw  $0x6,%xmm3
>   371  0.7323 :  50c873:       packuswb %xmm3,%xmm3
>  2246  4.4336 :  50c877:       movq   %xmm3,(%rdi)
>   569  1.1232 :  50c87b:       add    %rsi,%rdi
>    62  0.1224 :  50c87e:       dec    %r8d
>               :  50c881:       jne    50c820 <vc1_put_shift1_sse2+0x60>
>               :  50c883:       retq
>               :  50c884:       nopw   0x0(%rax,%rax,1)
>               :  50c88a:       nopw   0x0(%rax,%rax,1)
>               :Disassembly of section .fini:
>               :
>               :/home/chris/src/ffmpeg/ffmpeg_g:     file format elf64-x86-64
>               :
>               :Disassembly of section .init:
>               :Disassembly of section .plt:
>               :Disassembly of section .text:
>               :
> 000000000050c8a0 <vc1_put_shift3_sse2>: /* vc1_put_shift3_sse2 total:   9713 19.1733 */
>   147  0.2902 :  50c8a0:       mov    %r9d,0xfffffffffffffffc(%rsp)
>    57  0.1125 :  50c8a5:       mov    $0x20,%eax
>               :  50c8aa:       movslq 0x8(%rsp),%r9
>    89  0.1757 :  50c8af:       sub    0xfffffffffffffffc(%rsp),%eax
>   178  0.3514 :  50c8b3:       movslq %ecx,%rcx
>               :  50c8b6:       movslq %esi,%rsi
>    46  0.0908 :  50c8b9:       sub    %r9,%rdx
>     7  0.0138 :  50c8bc:       mov    %eax,0xfffffffffffffffc(%rsp)
>    43  0.0849 :  50c8c0:       lea    (%r9,%r9,2),%rax
>     9  0.0178 :  50c8c4:       movd   0xfffffffffffffffc(%rsp),%xmm7
>   159  0.3139 :  50c8ca:       punpcklwd %xmm7,%xmm7
>   200  0.3948 :  50c8ce:       pshufd $0x0,%xmm7,%xmm7
>   242  0.4777 :  50c8d3:       movq   (%rdx,%rax,1),%xmm11
>   229  0.4520 :  50c8d9:       movq   (%rdx,%r9,2),%xmm12
>    90  0.1777 :  50c8df:       movq   (%rdx,%r9,1),%xmm5
>    92  0.1816 :  50c8e5:       movq   (%rdx),%xmm6
>   139  0.2744 :  50c8e9:       punpcklbw %xmm0,%xmm11
>     6  0.0118 :  50c8ee:       punpcklbw %xmm0,%xmm12
>    63  0.1244 :  50c8f3:       punpcklbw %xmm0,%xmm5
>    62  0.1224 :  50c8f7:       punpcklbw %xmm0,%xmm6
>    68  0.1342 :  50c8fb:       nopl   0x0(%rax,%rax,1)
>   541  1.0679 :  50c900:       add    %rcx,%rdx
>    71  0.1402 :  50c903:       movdqa %xmm11,%xmm1
>    13  0.0257 :  50c908:       movdqa %xmm12,%xmm2
>    58  0.1145 :  50c90d:       movdqa %xmm5,%xmm3
>   556  1.0975 :  50c911:       movdqa %xmm6,%xmm4
>    15  0.0296 :  50c915:       punpcklbw %xmm0,%xmm11
>   599  1.1824 :  50c91a:       punpcklbw %xmm0,%xmm12
>    90  0.1777 :  50c91f:       punpcklbw %xmm0,%xmm5
>   585  1.1548 :  50c923:       punpcklbw %xmm0,%xmm6
>   592  1.1686 :  50c927:       pmullw %xmm8,%xmm2
>    15  0.0296 :  50c92c:       psllw  $0x2,%xmm1
>    43  0.0849 :  50c931:       pmullw %xmm9,%xmm3
>   524  1.0344 :  50c936:       psubsw %xmm4,%xmm2
>    36  0.0711 :  50c93a:       paddsw %xmm4,%xmm1
>    19  0.0375 :  50c93e:       psubsw %xmm4,%xmm3
>    40  0.0790 :  50c942:       psubsw %xmm1,%xmm2
>   542  1.0699 :  50c946:       paddsw %xmm2,%xmm3
>   124  0.2448 :  50c94a:       paddsw %xmm7,%xmm3
>   244  0.4817 :  50c94e:       psraw  $0x6,%xmm3
>   296  0.5843 :  50c953:       packuswb %xmm3,%xmm3
>  2162  4.2678 :  50c957:       movq   %xmm3,(%rdi)
>   580  1.1449 :  50c95b:       add    %rsi,%rdi
>    42  0.0829 :  50c95e:       dec    %r8d
>               :  50c961:       jne    50c900 <vc1_put_shift3_sse2+0x60>
>               :  50c963:       retq
>               :  50c964:       nopw   0x0(%rax,%rax,1)
>               :  50c96a:       nopw   0x0(%rax,%rax,1)
>               :Disassembly of section .fini:
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at mplayerhq.hu
> http://lists.mplayerhq.hu/mailman/listinfo/ffmpeg-devel
>


-- 
Zuxy
Beauty is truth,
While truth is beauty.
PGP KeyID: E8555ED6




More information about the ffmpeg-devel mailing list