[FFmpeg-devel] [PATCH] sse4 flac lpc encoder

Fri Feb 7 15:10:33 CET 2014

Patch 1:
The sse4 code and function setup.  It should now work correctly for
everyone.  No more hacks, assumptions of calling convention, or
unconditional selection of the function.  It also handles the copy of
input samples.  One final test really should be done and that is on a
64-bit Linux system.

Patch 2:
Unrolls the loop somewhat by using more xmm registers to process 12
samples per iteration.  More unrolling could be done on 64-bit arches.
That might come later.

Patch 3:
The encoder now needs more space for reading and writing.  Is this way
acceptable?

Patch 4:
Cosmetic alignment of operands in the assembly.  Is there a particular
style I should be following here?

Patch 5:
Add a comment, based on my experience, to explain the original code.
-------------- next part --------------
From 04ccd0a4db34754f09128fcb7cd9a702bfed01a5 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Wed, 5 Feb 2014 23:39:18 +0100
Subject: [PATCH 1/5] lavc/flacenc: add sse4 version of the lpc encoder

From 1.2 to 2.4 times faster.  Runtime is reduced by 4 to 32%.  The
speed-up generally increases greatly with compression_level.

This lpc encoder is not used with levels < 3 so it provides no
speed-up in these cases.
---
 libavcodec/x86/Makefile         |    4 ++
 libavcodec/x86/flac_dsp_gpl.asm |   75 +++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/flacdsp_init.c   |    4 ++
 3 files changed, 83 insertions(+), 0 deletions(-)
 create mode 100644 libavcodec/x86/flac_dsp_gpl.asm

diff --git a/libavcodec/x86/Makefile b/libavcodec/x86/Makefile
index 6d5d008..2f63475 100644
--- a/libavcodec/x86/Makefile
+++ b/libavcodec/x86/Makefile
@@ -13,6 +13,7 @@ OBJS-$(CONFIG_ENCODERS)                += x86/dsputilenc_mmx.o          \
                                           x86/motion_est.o
 OBJS-$(CONFIG_FFT)                     += x86/fft_init.o
 OBJS-$(CONFIG_FLAC_DECODER)            += x86/flacdsp_init.o
+OBJS-$(CONFIG_FLAC_ENCODER)            += x86/flacdsp_init.o
 OBJS-$(CONFIG_H263DSP)                 += x86/h263dsp_init.o
 OBJS-$(CONFIG_H264CHROMA)              += x86/h264chroma_init.o
 OBJS-$(CONFIG_H264DSP)                 += x86/h264dsp_init.o
@@ -72,6 +73,9 @@ YASM-OBJS-$(CONFIG_DSPUTIL)            += x86/dsputil.o                 \
 YASM-OBJS-$(CONFIG_ENCODERS)           += x86/dsputilenc.o
 YASM-OBJS-$(CONFIG_FFT)                += x86/fft.o
 YASM-OBJS-$(CONFIG_FLAC_DECODER)       += x86/flacdsp.o
+ifdef CONFIG_GPL
+YASM-OBJS-$(CONFIG_FLAC_ENCODER)       += x86/flac_dsp_gpl.o
+endif
 YASM-OBJS-$(CONFIG_H263DSP)            += x86/h263_loopfilter.o
 YASM-OBJS-$(CONFIG_H264CHROMA)         += x86/h264_chromamc.o           \
                                           x86/h264_chromamc_10bit.o
diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
new file mode 100644
index 0000000..c6e71c3
--- /dev/null
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -0,0 +1,75 @@
+;*****************************************************************************
+;* FLAC DSP functions
+;*
+;* Copyright (c) 2014 James Darnley <james.darnley at gmail.com>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License along
+;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
+;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+INIT_XMM sse4
+%if ARCH_X86_64
+    cglobal flac_enc_lpc_16, 6, 8, 4, 0, res, smp, len, order, coefs, shift
+    %define posj r6
+    %define negj r7
+%else
+    cglobal flac_enc_lpc_16, 6, 6, 4, 0, res, smp, len, order, coefs, shift
+    %define posj r2
+    %define negj r5
+%endif
+
+; Is it worth looping correctly over the first samples?  The most that ever need
+; to be copied is 32 so we might as well just unroll the loop and do all 32.
+%assign iter 0
+%rep 32/(mmsize/4)
+    movu m0, [smpq+iter]
+    movu [resq+iter], m0
+    %assign iter iter+mmsize
+%endrep
+
+lea resq, [resq+orderq*4]
+lea smpq, [smpq+orderq*4]
+sub lenmp, orderq
+movd m3, shiftmp
+
+loop_len:
+    pxor m0,  m0
+    xor posj, posj
+    xor negj, negj
+    loop_order:
+        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        SPLATD m2
+        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        pmulld m1,  m2
+        paddd  m0,  m1             ; p += c * s
+
+        add posj, 1
+        sub negj, 1
+        cmp posj, ordermp
+    jne loop_order
+
+    psrad m0, m3                   ; p >>= shift
+    movu  m1, [smpq]
+    psubd m1, m0                   ; smp[i] - p
+    movu  [resq], m1               ; res[i] = smp[i] - (p >> shift)
+
+    add resq, mmsize
+    add smpq, mmsize
+    sub lenmp, mmsize/4
+jg loop_len
+RET
diff --git a/libavcodec/x86/flacdsp_init.c b/libavcodec/x86/flacdsp_init.c
index d30a41e..3df6b0a 100644
--- a/libavcodec/x86/flacdsp_init.c
+++ b/libavcodec/x86/flacdsp_init.c
@@ -25,6 +25,8 @@
 void ff_flac_lpc_32_sse4(int32_t *samples, const int coeffs[32], int order,
                          int qlevel, int len);
 
+void ff_flac_enc_lpc_16_sse4(int32_t *, const int32_t *, int, int,  const int32_t *,int);
+
 av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
                                  int bps)
 {
@@ -34,6 +36,8 @@ av_cold void ff_flacdsp_init_x86(FLACDSPContext *c, enum AVSampleFormat fmt,
     if (EXTERNAL_SSE4(cpu_flags)) {
         if (bps > 16)
             c->lpc = ff_flac_lpc_32_sse4;
+        if (CONFIG_GPL && bps == 16)
+            c->lpc_encode = ff_flac_enc_lpc_16_sse4;
     }
 #endif
 }
-- 
1.7.9

-------------- next part --------------
From 968dedf50e4cda4d3e258c16ff0d53fc3431ab0c Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Wed, 5 Feb 2014 23:40:50 +0100
Subject: [PATCH 2/5] lavc/flacenc: partially unroll loop in
 flac_enc_lpc_16_sse4

It now does 12 samples per iteration, up from 4.

From 1.1 to 2.1 times faster again.  1.3 to 5.0 times faster overall.
Runtime is reduced by a further 4 to 20%.  Overall runtime reduced by
8 to 45%.

Same conditions as before apply.
---
 libavcodec/x86/flac_dsp_gpl.asm |   26 +++++++++++++++++++++-----
 1 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index c6e71c3..ab3cde2 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -24,11 +24,11 @@
 
 INIT_XMM sse4
 %if ARCH_X86_64
-    cglobal flac_enc_lpc_16, 6, 8, 4, 0, res, smp, len, order, coefs, shift
+    cglobal flac_enc_lpc_16, 6, 8, 8, 0, res, smp, len, order, coefs, shift
     %define posj r6
     %define negj r7
 %else
-    cglobal flac_enc_lpc_16, 6, 6, 4, 0, res, smp, len, order, coefs, shift
+    cglobal flac_enc_lpc_16, 6, 6, 8, 0, res, smp, len, order, coefs, shift
     %define posj r2
     %define negj r5
 %endif
@@ -49,14 +49,22 @@ movd m3, shiftmp
 
 loop_len:
     pxor m0,  m0
+    pxor m4,  m4
+    pxor m6,  m6
     xor posj, posj
     xor negj, negj
     loop_order:
         movd   m2, [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
         movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5, [smpq+negj*4-4+mmsize]
+        movu   m7, [smpq+negj*4-4+mmsize*2]
         pmulld m1,  m2
+        pmulld m5,  m2
+        pmulld m7,  m2
         paddd  m0,  m1             ; p += c * s
+        paddd  m4,  m5
+        paddd  m6,  m7
 
         add posj, 1
         sub negj, 1
@@ -64,12 +72,20 @@ loop_len:
     jne loop_order
 
     psrad m0, m3                   ; p >>= shift
+    psrad m4, m3
+    psrad m6, m3
     movu  m1, [smpq]
+    movu  m5, [smpq+mmsize]
+    movu  m7, [smpq+mmsize*2]
     psubd m1, m0                   ; smp[i] - p
+    psubd m5, m4
+    psubd m7, m6
     movu  [resq], m1               ; res[i] = smp[i] - (p >> shift)
+    movu  [resq+mmsize], m5
+    movu  [resq+mmsize*2], m7
 
-    add resq, mmsize
-    add smpq, mmsize
-    sub lenmp, mmsize/4
+    add resq, mmsize*3
+    add smpq, mmsize*3
+    sub lenmp, (3*mmsize)/4
 jg loop_len
 RET
-- 
1.7.9

-------------- next part --------------
From 77de810594768b5935550711c1a1e61720bb8504 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Wed, 5 Feb 2014 22:23:35 +0100
Subject: [PATCH 3/5] lavc/flacenc: increase buffer sizes in FlacSubFrame
 struct

New functions read and write 12 samples per iteration.  Therefore they
may need up to 11 samples more in the buffer.
---
 libavcodec/flacenc.c |    4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/libavcodec/flacenc.c b/libavcodec/flacenc.c
index 1fc8c4c..554d062 100644
--- a/libavcodec/flacenc.c
+++ b/libavcodec/flacenc.c
@@ -79,8 +79,8 @@ typedef struct FlacSubframe {
     int32_t coefs[MAX_LPC_ORDER];
     int shift;
     RiceContext rc;
-    int32_t samples[FLAC_MAX_BLOCKSIZE];
-    int32_t residual[FLAC_MAX_BLOCKSIZE+1];
+    int32_t samples[FLAC_MAX_BLOCKSIZE+11];
+    int32_t residual[FLAC_MAX_BLOCKSIZE+11];
 } FlacSubframe;
 
 typedef struct FlacFrame {
-- 
1.7.9

-------------- next part --------------
From b4a90268040fd75b5ba502187d5c15519c028c61 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Wed, 5 Feb 2014 22:48:09 +0100
Subject: [PATCH 4/5] cosmetic alignment

---
 libavcodec/x86/flac_dsp_gpl.asm |   66 +++++++++++++++++++-------------------
 1 files changed, 33 insertions(+), 33 deletions(-)

diff --git a/libavcodec/x86/flac_dsp_gpl.asm b/libavcodec/x86/flac_dsp_gpl.asm
index ab3cde2..1528cb8 100644
--- a/libavcodec/x86/flac_dsp_gpl.asm
+++ b/libavcodec/x86/flac_dsp_gpl.asm
@@ -37,8 +37,8 @@ INIT_XMM sse4
 ; to be copied is 32 so we might as well just unroll the loop and do all 32.
 %assign iter 0
 %rep 32/(mmsize/4)
-    movu m0, [smpq+iter]
-    movu [resq+iter], m0
+    movu  m0,         [smpq+iter]
+    movu [resq+iter],  m0
     %assign iter iter+mmsize
 %endrep
 
@@ -48,44 +48,44 @@ sub lenmp, orderq
 movd m3, shiftmp
 
 loop_len:
-    pxor m0,  m0
-    pxor m4,  m4
-    pxor m6,  m6
-    xor posj, posj
-    xor negj, negj
+    pxor m0,   m0
+    pxor m4,   m4
+    pxor m6,   m6
+    xor  posj, posj
+    xor  negj, negj
     loop_order:
-        movd   m2, [coefsq+posj*4] ; c = coefs[j]
+        movd   m2,  [coefsq+posj*4] ; c = coefs[j]
         SPLATD m2
-        movu   m1, [smpq+negj*4-4] ; s = smp[i-j-1]
-        movu   m5, [smpq+negj*4-4+mmsize]
-        movu   m7, [smpq+negj*4-4+mmsize*2]
-        pmulld m1,  m2
-        pmulld m5,  m2
-        pmulld m7,  m2
-        paddd  m0,  m1             ; p += c * s
-        paddd  m4,  m5
-        paddd  m6,  m7
+        movu   m1,  [smpq+negj*4-4] ; s = smp[i-j-1]
+        movu   m5,  [smpq+negj*4-4+mmsize]
+        movu   m7,  [smpq+negj*4-4+mmsize*2]
+        pmulld m1,   m2
+        pmulld m5,   m2
+        pmulld m7,   m2
+        paddd  m0,   m1             ; p += c * s
+        paddd  m4,   m5
+        paddd  m6,   m7
 
-        add posj, 1
-        sub negj, 1
-        cmp posj, ordermp
+        add    posj, 1
+        sub    negj, 1
+        cmp    posj, ordermp
     jne loop_order
 
-    psrad m0, m3                   ; p >>= shift
-    psrad m4, m3
-    psrad m6, m3
-    movu  m1, [smpq]
-    movu  m5, [smpq+mmsize]
-    movu  m7, [smpq+mmsize*2]
-    psubd m1, m0                   ; smp[i] - p
-    psubd m5, m4
-    psubd m7, m6
-    movu  [resq], m1               ; res[i] = smp[i] - (p >> shift)
+    psrad  m0,    m3                ; p >>= shift
+    psrad  m4,    m3
+    psrad  m6,    m3
+    movu   m1,   [smpq]
+    movu   m5,   [smpq+mmsize]
+    movu   m7,   [smpq+mmsize*2]
+    psubd  m1,    m0                ; smp[i] - p
+    psubd  m5,    m4
+    psubd  m7,    m6
+    movu  [resq], m1                ; res[i] = smp[i] - (p >> shift)
     movu  [resq+mmsize], m5
     movu  [resq+mmsize*2], m7
 
-    add resq, mmsize*3
-    add smpq, mmsize*3
-    sub lenmp, (3*mmsize)/4
+    add    resq,  mmsize*3
+    add    smpq,  mmsize*3
+    sub    lenmp, (3*mmsize)/4
 jg loop_len
 RET
-- 
1.7.9

-------------- next part --------------
From c2194678427b6ab469fc563243bee7a4fc489e24 Mon Sep 17 00:00:00 2001
From: James Darnley <james.darnley at gmail.com>
Date: Wed, 5 Feb 2014 01:41:46 +0100
Subject: [PATCH 5/5] flacdsp_lpc_template: add comment to explain the
 CONFIG_SMALL code

I found the optimisation of 2 samples per iteration obscured the
underlying algorithm.  I had to write it out on paper and translate into
a mathematical sum to see that the two samples are unconnected.  I hope
that if anyone else is struggling to understand the code that this will
be useful.
---
 libavcodec/flacdsp_lpc_template.c |   18 ++++++++++++++++++
 1 files changed, 18 insertions(+), 0 deletions(-)

diff --git a/libavcodec/flacdsp_lpc_template.c b/libavcodec/flacdsp_lpc_template.c
index 0c453ae..22a0a4b 100644
--- a/libavcodec/flacdsp_lpc_template.c
+++ b/libavcodec/flacdsp_lpc_template.c
@@ -139,3 +139,21 @@ static void FUNC(flac_lpc_encode_c)(int32_t *res, const int32_t *smp, int len,
     }
 #endif
 }
+
+/* Comment for clarity/de-obfuscation.
+ *
+ * for (int i = order; i < len; i++) {
+ *     int32_t p = 0;
+ *     for (int j = 0; j < order; j++) {
+ *         int c = coefs[j];
+ *         int s = smp[(i-1)-j];
+ *         p    += c*s;
+ *     }
+ *     res[i] = smp[i] - (p >> shift);
+ * }
+ *
+ * The CONFIG_SMALL code above simplifies to this, in the case of SAMPLE_SIZE
+ * not being equal to 32 (at the present time that means for 16-bit audio). The
+ * code above does 2 samples per iteration.  Commit bfdd5bc (made all the way
+ * back in 2007) says that way is faster.
+ */
-- 
1.7.9

-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 683 bytes
Desc: OpenPGP digital signature
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140207/56eaf33d/attachment.asc>