[FFmpeg-devel] [PATCH 3/7] x86: sbrdsp: implement SSE qmf_deint_bfly
Michael Niedermayer
michaelni at gmx.at
Sat Apr 6 15:11:50 CEST 2013
On Sat, Apr 06, 2013 at 10:52:10AM +0000, Christophe Gisquet wrote:
> From 312 to 89/68 (sse/sse2) cycles on Arrandale and Win64.
> Sandybridge: 68/47 cycles.
>
> Having a loop counter is a 7 cycle gain.
> Unrolling is another 7 cycle gain.
> Working in reverse scan is another 6 cycles.
The following change gives a 1-cycle speedup with SSE2 on Sandy Bridge (SB):
@@ -322,6 +322,7 @@ cglobal sbr_qmf_deint_neg, 2,3,3,v,src,vrev
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
+ sub vrevq, src1q
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
@@ -342,12 +343,11 @@ cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
subps m0, m7
addps m1, m6
subps m4, m3
- mova [vrevq], m1
- mova [vrevq+mmsize], m5
+ mova [vrevq+src1q], m1
+ mova [vrevq+src1q+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
- add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
[...]
--
Michael GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
No snowflake in an avalanche ever feels responsible. -- Voltaire
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 198 bytes
Desc: Digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20130406/a31d5629/attachment.asc>
More information about the ffmpeg-devel mailing list