[FFmpeg-devel] [PATCH 7/7] autocorrelate
Christophe Gisquet
christophe.gisquet at gmail.com
Sat Apr 6 12:52:14 CEST 2013
---
libavcodec/x86/sbrdsp.asm | 71 +++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 71 insertions(+)
diff --git a/libavcodec/x86/sbrdsp.asm b/libavcodec/x86/sbrdsp.asm
index a7998fa..77535a4 100644
--- a/libavcodec/x86/sbrdsp.asm
+++ b/libavcodec/x86/sbrdsp.asm
@@ -25,6 +25,7 @@ SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
+ps_mask3 dd 0, 0, -1, -1 ; lanes 0-1 all zero bits, lanes 2-3 all one bits; NOTE(review): not referenced by any instruction visible in this patch
ps_neg times 4 dd 1<<31
ps_noise0 times 2 dd 1.0, 0.0,
ps_noise2 times 2 dd -1.0, 0.0
@@ -503,3 +504,73 @@ SBR_HF_APPLY_NOISE
INIT_XMM avx
SBR_HF_APPLY_NOISE
+
+INIT_XMM sse
+; void sbr_autocorrelate_c(const float x[40][2], float phi[3][2][2])
+cglobal sbr_autocorrelate, 2,3,8, x, phi, l ; 2 args (x, phi), 3 GPRs (l = loop offset), 8 XMM registers
+ ; Register roles: m6 accumulates the squared terms (real_sum0), m7 accumulates the real_sum*/imag_sum* cross products
+ ; Prologue: the x[0] terms are computed before the loop and parked as follows:
+ ; m5l <- (real_sum0) x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1]
+ ; m5h0 <- (real_sum1) x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1]
+ ; m5h1 <- (imag_sum1) x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0]
+ ; m7h0 <- real_sum2 = x[ 0][0] * x[ 2][0] + x[ 0][1] * x[ 2][1]
+ ; m7h1 <- imag_sum2 = x[ 0][0] * x[ 2][1] - x[ 0][1] * x[ 2][0]
+ movlps m7, [xq] ; m7 low qword = x[0] (re, im); the high qword is not written here
+ add xq, 8 ; xq -> x[1] (one complex float pair = 8 bytes)
+ mova m2, m7
+ mova m5, m7
+ movu m3, [xq] ; m3 = x[1] (low qword), x[2] (high qword)
+ shufps m7, m7, q0000 ; broadcast x[0][0] to all lanes
+ shufps m2, m2, q1111 ; broadcast x[0][1] to all lanes
+ mulps m7, m3 ; x[0][0] * { x[1], x[2] }
+ mulps m2, m3 ; x[0][1] * { x[1], x[2] }
+ mova m4, [ps_mask2] ; sign-bit mask on the odd lanes (0, 1<<31, 0, 1<<31)
+ mulps m5, m5 ; low lanes become x[0][0]^2 and x[0][1]^2 (kept as two separate lanes)
+ shufps m2, m2, q2301 ; swap the two lanes inside each qword (re <-> im)
+ xorps m6, m6 ; real_sum0 = 0
+ xorps m2, m4 ; [ps_mask2] ; flip the sign of the odd lanes
+ movlhps m3, m3 ; x2 x2 (original note); movlhps copies the low qword of m3 over its high qword
+ addps m7, m2 ; low qword: lag-1 terms of x[0]; high qword: init real_sum2/imag_sum2
+ movlhps m5, m7 ; m5 high qword = lag-1 terms of x[0], kept for the final real_sum1/imag_sum1
+ movlhps m7, m6 ; real_sum1/imag_sum1 = 0 -- NOTE(review): movlhps writes the HIGH qword, i.e. the real_sum2/imag_sum2 lanes initialised just above; confirm intent
+ mov lq, -37*8 ; negative byte offset covering 37 iterations of 8 bytes each
+ sub xq, lq ; xq += 37*8, so [xq + lq] starts at the previous xq and lq counts up to 0
+.loop:
+ ;~ real_sum0 += x[i][0] * x[i ][0] + x[i][1] * x[i ][1];
+ ;~ real_sum1 += x[i][0] * x[i+1][0] + x[i][1] * x[i+1][1];
+ ;~ imag_sum1 += x[i][0] * x[i+1][1] - x[i][1] * x[i+1][0];
+ ;~ real_sum2 += x[i][0] * x[i+2][0] + x[i][1] * x[i+2][1];
+ ;~ imag_sum2 += x[i][0] * x[i+2][1] - x[i][1] * x[i+2][0];
+ movhps m3, [xq + lq] ; x3 x2 ; loads one complex sample into the high qword of m3
+ mova m1, m0 ; NOTE(review): no instruction in this patch writes m0 before the first iteration reads it
+ mova m2, m0
+ shufps m0, m0, q0000 ; x0 ; broadcast lane 0 of the current sample
+ shufps m1, m1, q1111 ; x1 ; broadcast lane 1 of the current sample
+ mulps m2, m2 ; x0*x0 ; per-lane squares of the current sample register
+ mulps m0, m3 ; x0*V
+ mulps m1, m3 ; x1*V
+ addps m6, m2 ; real_sum0 += x0*x0
+ addps m7, m0 ; real_sum* += x0*V
+ shufps m1, m1, q2301 ; swap the two lanes inside each qword
+ xorps m1, m4 ; [ps_mask2] ; x1*Vrev ; odd lanes negated
+ movhlps m0, m3 ; x1 -> x0 ; m0 low qword = m3 high qword, used as the sample of the next iteration
+ addps m7, m1 ; real_sum* += x1*Vrev
+ movlhps m3, m3 ; x2 x2 -- NOTE(review): copies the low qword over the high qword loaded by movhps above, so m3 low never changes in the loop; confirm movhlps was not intended
+ add lq, 8 ; advance by one complex sample
+ jl .loop ; repeat while lq is still negative
+ ; phi[2-2][1][0] = real_sum2
+ ; phi[2-2][1][1] = imag_sum2
+ movhps [phiq + 1*8], m7 ; store real_sum2/imag_sum2 (high qword of m7 -> byte offset 8 of phi)
+
+ ; m5l <- (real_sum0) x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1]
+ ; m5h0 <- (real_sum1) x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1]
+ ; m5h1 <- (imag_sum1) x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0]
+ ; m7h0 <- real_sum2 = x[ 0][0] * x[ 2][0] + x[ 0][1] * x[ 2][1]
+ ; m7h1 <- imag_sum2 = x[ 0][0] * x[ 2][1] - x[ 0][1] * x[ 2][0]
+ ; NOTE(review): the stores listed below are not implemented by any instruction in this patch (m5/m6 are never written out) and no RET follows
+ ; phi[2 ][1][0] = real_sum0 + x[ 0][0] * x[ 0][0] + x[ 0][1] * x[ 0][1]
+ ; phi[1 ][0][0] = real_sum0 + x[38][0] * x[38][0] + x[38][1] * x[38][1]
+ ; phi[2-1][1][0] = real_sum1 + x[ 0][0] * x[ 1][0] + x[ 0][1] * x[ 1][1]
+ ; phi[2-1][1][1] = imag_sum1 + x[ 0][0] * x[ 1][1] - x[ 0][1] * x[ 1][0]
+ ; phi[0 ][0][0] = real_sum1 + x[38][0] * x[39][0] + x[38][1] * x[39][1]
+ ; phi[0 ][0][1] = imag_sum1 + x[38][0] * x[39][1] - x[38][1] * x[39][0]
--
1.8.0.msysgit.0
More information about the ffmpeg-devel
mailing list