[FFmpeg-devel] [PATCH v2] avfilter/vf_bwdif: add x86 SIMD

Thomas Mundt loudmax at yahoo.de
Sat Mar 5 21:20:48 CET 2016


This new version of the patch adds x86 SIMD implementations (MMXEXT, SSE2, SSSE3) of the bwdif line filter for bit depths up to 12 bits.
Please comment.

Signed-off-by: Thomas Mundt <loudmax at yahoo.de>
---
 libavfilter/bwdif.h             |  72 +++++++++++
 libavfilter/vf_bwdif.c          |  69 +++--------
 libavfilter/x86/Makefile        |   2 +
 libavfilter/x86/vf_bwdif.asm    | 266 ++++++++++++++++++++++++++++++++++++++++
 libavfilter/x86/vf_bwdif_init.c |  78 ++++++++++++
 5 files changed, 432 insertions(+), 55 deletions(-)
 create mode 100644 libavfilter/bwdif.h
 create mode 100644 libavfilter/x86/vf_bwdif.asm
 create mode 100644 libavfilter/x86/vf_bwdif_init.c

diff --git a/libavfilter/bwdif.h b/libavfilter/bwdif.h
new file mode 100644
index 0000000..8b42c76
--- /dev/null
+++ b/libavfilter/bwdif.h
@@ -0,0 +1,72 @@
+/*
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#ifndef AVFILTER_BWDIF_H
+#define AVFILTER_BWDIF_H
+
+#include "libavutil/pixdesc.h"
+#include "avfilter.h"
+
+enum BWDIFMode { /* values stored in BWDIFContext.mode */
+    BWDIF_MODE_SEND_FRAME = 0, ///< send 1 frame for each frame
+    BWDIF_MODE_SEND_FIELD = 1, ///< send 1 frame for each field
+};
+
+enum BWDIFParity { /* values stored in BWDIFContext.parity */
+    BWDIF_PARITY_TFF  =  0, ///< top field first
+    BWDIF_PARITY_BFF  =  1, ///< bottom field first
+    BWDIF_PARITY_AUTO = -1, ///< auto detection
+};
+
+enum BWDIFDeint { /* values stored in BWDIFContext.deint */
+    BWDIF_DEINT_ALL        = 0, ///< deinterlace all frames
+    BWDIF_DEINT_INTERLACED = 1, ///< only deinterlace frames marked as interlaced
+};
+
+typedef struct BWDIFContext { /* bwdif filter state; in a header so vf_bwdif.c and x86/vf_bwdif_init.c can both see it */
+    const AVClass *class;
+
+    int mode;           ///< BWDIFMode
+    int parity;         ///< BWDIFParity
+    int deint;          ///< BWDIFDeint
+
+    int frame_pending;
+
+    AVFrame *cur;
+    AVFrame *next;
+    AVFrame *prev;
+    AVFrame *out;
+
+    void (*filter_intra)(void *dst1, void *cur1, int w, int prefs, int mrefs,
+                         int prefs3, int mrefs3, int parity, int clip_max); ///< set in config_props(): 8-bit or 16-bit C version
+    void (*filter_line)(void *dst, void *prev, void *cur, void *next,
+                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                        int prefs3, int mrefs3, int prefs4, int mrefs4,
+                        int parity, int clip_max); ///< C version set in config_props(); may be overridden by ff_bwdif_init_x86()
+    void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
+                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                        int parity, int clip_max, int spat); ///< set in config_props(): 8-bit or 16-bit C version
+
+    const AVPixFmtDescriptor *csp; ///< from av_pix_fmt_desc_get(link->format) in config_props(); comp[0].depth selects the filter variants
+    int inter_field;
+    int eof;
+} BWDIFContext;
+
+void ff_bwdif_init_x86(BWDIFContext *bwdif); /* replaces bwdif->filter_line with an x86 SIMD version when CPU flags and bit depth allow */
+
+#endif /* AVFILTER_BWDIF_H */
diff --git a/libavfilter/vf_bwdif.c b/libavfilter/vf_bwdif.c
index 7985054..d402aa4 100644
--- a/libavfilter/vf_bwdif.c
+++ b/libavfilter/vf_bwdif.c
@@ -37,6 +37,7 @@
 #include "formats.h"
 #include "internal.h"
 #include "video.h"
+#include "bwdif.h"
 
 /*
  * Filter coefficients coef_lf and coef_hf taken from BBC PH-2071 (Weston 3 Field Deinterlacer).
@@ -48,51 +49,6 @@ static const uint16_t coef_lf[2] = { 4309, 213 };
 static const uint16_t coef_hf[3] = { 5570, 3801, 1016 };
 static const uint16_t coef_sp[2] = { 5077, 981 };
 
-enum BWDIFMode {
-    BWDIF_MODE_SEND_FRAME = 0, ///< send 1 frame for each frame
-    BWDIF_MODE_SEND_FIELD = 1, ///< send 1 frame for each field
-};
-
-enum BWDIFParity {
-    BWDIF_PARITY_TFF  =  0, ///< top field first
-    BWDIF_PARITY_BFF  =  1, ///< bottom field first
-    BWDIF_PARITY_AUTO = -1, ///< auto detection
-};
-
-enum BWDIFDeint {
-    BWDIF_DEINT_ALL        = 0, ///< deinterlace all frames
-    BWDIF_DEINT_INTERLACED = 1, ///< only deinterlace frames marked as interlaced
-};
-
-typedef struct BWDIFContext {
-    const AVClass *class;
-
-    int mode;           ///< BWDIFMode
-    int parity;         ///< BWDIFParity
-    int deint;          ///< BWDIFDeint
-
-    int frame_pending;
-
-    AVFrame *cur;
-    AVFrame *next;
-    AVFrame *prev;
-    AVFrame *out;
-
-    void (*filter_intra)(void *dst1, void *cur1, int w, int prefs, int mrefs,
-                         int prefs3, int mrefs3, int parity, int clip_max);
-    void (*filter_line)(void *dst, void *prev, void *cur, void *next,
-                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                        int prefs3, int mrefs3, int prefs4, int mrefs4,
-                        int parity, int clip_max);
-    void (*filter_edge)(void *dst, void *prev, void *cur, void *next,
-                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                        int parity, int clip_max, int spat);
-
-    const AVPixFmtDescriptor *csp;
-    int inter_field;
-    int eof;
-} BWDIFContext;
-
 typedef struct ThreadData {
     AVFrame *frame;
     int plane;
@@ -177,10 +133,10 @@ static void filter_intra(void *dst1, void *cur1, int w, int prefs, int mrefs,
     FILTER_INTRA()
 }
 
-static void filter_line(void *dst1, void *prev1, void *cur1, void *next1,
-                        int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                        int prefs3, int mrefs3, int prefs4, int mrefs4,
-                        int parity, int clip_max)
+static void filter_line_c(void *dst1, void *prev1, void *cur1, void *next1,
+                          int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                          int prefs3, int mrefs3, int prefs4, int mrefs4,
+                          int parity, int clip_max)
 {
     uint8_t *dst   = dst1;
     uint8_t *prev  = prev1;
@@ -222,10 +178,10 @@ static void filter_intra_16bit(void *dst1, void *cur1, int w, int prefs, int mre
     FILTER_INTRA()
 }
 
-static void filter_line_16bit(void *dst1, void *prev1, void *cur1, void *next1,
-                              int w, int prefs, int mrefs, int prefs2, int mrefs2,
-                              int prefs3, int mrefs3, int prefs4, int mrefs4,
-                              int parity, int clip_max)
+static void filter_line_c_16bit(void *dst1, void *prev1, void *cur1, void *next1,
+                                int w, int prefs, int mrefs, int prefs2, int mrefs2,
+                                int prefs3, int mrefs3, int prefs4, int mrefs4,
+                                int parity, int clip_max)
 {
     uint16_t *dst   = dst1;
     uint16_t *prev  = prev1;
@@ -557,14 +513,17 @@ static int config_props(AVFilterLink *link)
     s->csp = av_pix_fmt_desc_get(link->format);
     if (s->csp->comp[0].depth > 8) {
         s->filter_intra = filter_intra_16bit;
-        s->filter_line  = filter_line_16bit;
+        s->filter_line  = filter_line_c_16bit;
         s->filter_edge  = filter_edge_16bit;
     } else {
         s->filter_intra = filter_intra;
-        s->filter_line  = filter_line;
+        s->filter_line  = filter_line_c;
         s->filter_edge  = filter_edge;
     }
 
+    if (ARCH_X86)
+        ff_bwdif_init_x86(s);
+
     return 0;
 }
 
diff --git a/libavfilter/x86/Makefile b/libavfilter/x86/Makefile
index 33de380..ed294e0 100644
--- a/libavfilter/x86/Makefile
+++ b/libavfilter/x86/Makefile
@@ -1,4 +1,5 @@
 OBJS-$(CONFIG_BLEND_FILTER)                  += x86/vf_blend_init.o
+OBJS-$(CONFIG_BWDIF_FILTER)                  += x86/vf_bwdif_init.o
 OBJS-$(CONFIG_EQ_FILTER)                     += x86/vf_eq.o
 OBJS-$(CONFIG_FSPP_FILTER)                   += x86/vf_fspp_init.o
 OBJS-$(CONFIG_GRADFUN_FILTER)                += x86/vf_gradfun_init.o
@@ -21,6 +22,7 @@ OBJS-$(CONFIG_W3FDIF_FILTER)                 += x86/vf_w3fdif_init.o
 OBJS-$(CONFIG_YADIF_FILTER)                  += x86/vf_yadif_init.o
 
 YASM-OBJS-$(CONFIG_BLEND_FILTER)             += x86/vf_blend.o
+YASM-OBJS-$(CONFIG_BWDIF_FILTER)             += x86/vf_bwdif.o
 YASM-OBJS-$(CONFIG_FSPP_FILTER)              += x86/vf_fspp.o
 YASM-OBJS-$(CONFIG_GRADFUN_FILTER)           += x86/vf_gradfun.o
 YASM-OBJS-$(CONFIG_HQDN3D_FILTER)            += x86/vf_hqdn3d.o
diff --git a/libavfilter/x86/vf_bwdif.asm b/libavfilter/x86/vf_bwdif.asm
new file mode 100644
index 0000000..11aa025
--- /dev/null
+++ b/libavfilter/x86/vf_bwdif.asm
@@ -0,0 +1,266 @@
+;*****************************************************************************
+;* x86-optimized functions for bwdif filter
+;*
+;* Copyright (C) 2016 Thomas Mundt <loudmax at yahoo.de>
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+pw_coefhf:  times 4 dw  1016, 5570 ; (coef_hf[2], coef_hf[0]) word pairs, used as pmaddwd weights
+pw_coefhf1: times 8 dw -3801 ; -coef_hf[1]
+pw_coefsp:  times 4 dw  5077, -981 ; (coef_sp[0], -coef_sp[1]) word pairs, used as pmaddwd weights
+pw_splfdif: times 4 dw  -768,  768 ; added (under a mask) to pw_coefsp: 5077-768 = 4309, -981+768 = -213, i.e. (coef_lf[0], -coef_lf[1])
+
+SECTION .text
+
+%macro LOAD8 2 ; %1 = destination register, %2 = memory operand holding 8-bit samples
+    movh         %1, %2 ; load the samples for one iteration (the 8-bit path advances STEP = mmsize/2 bytes)
+    punpcklbw    %1, m7 ; interleave with m7 (zeroed by FILTER) -> zero-extend bytes to words
+%endmacro
+
+%macro LOAD12 2 ; %1 = destination register, %2 = memory operand; samples are 2 bytes each here (PROC 12, 2)
+    movu         %1, %2 ; unaligned full-register load, no widening needed
+%endmacro
+
+%macro DISP8 0 ; store the 8-bit result held as words in m2
+    packuswb     m2, m2 ; words -> bytes with unsigned saturation
+    movh     [dstq], m2 ; write the packed half register to dst
+%endmacro
+
+%macro DISP12 0 ; store the high-bit-depth result held as words in m2
+    CLIPW        m2, m7, m12 ; CLIPW is an x86util.asm helper, presumably a clamp: here to [m7 = 0, m12 = broadcast clip_max]
+    movu     [dstq], m2 ; unaligned full-register store to dst
+%endmacro
+
+%macro FILTER 5 ; %1 = loop label suffix, %2/%3 = pointer pair chosen by PROC (prevq,curq or curq,nextq), %4 = 8 or 12 (selects LOAD%4/DISP%4), %5 = bytes per sample
+    pxor         m7, m7 ; m7 = 0 (zero source for LOAD8 unpacking and lower bound in DISP12)
+.loop%1: ; one iteration produces mmsize/2 output samples
+    LOAD%4       m0, [curq+t0*%5] ; m0 = cur[prefs]
+    LOAD%4       m1, [curq+t1*%5] ; m1 = cur[mrefs]
+    LOAD%4       m2, [%2] ; m2 = %2[0]
+    LOAD%4       m3, [%3] ; m3 = %3[0]
+    mova         m4, m3
+    paddw        m3, m2 ; m3 = %2[0] + %3[0]
+    psubw        m2, m4
+    ABS1         m2, m4 ; m2 = |%2[0] - %3[0]| (ABS1/ABS2 are x86util.asm helpers, presumably absolute value with scratch registers)
+    mova         m8, m3 ; m8 = %2[0] + %3[0], kept for later
+    mova         m9, m2 ; m9 = |%2[0] - %3[0]|, kept for the later compare
+    LOAD%4       m3, [prevq+t0*%5] ; m3 = prev[prefs]
+    LOAD%4       m4, [prevq+t1*%5] ; m4 = prev[mrefs]
+    psubw        m3, m0
+    psubw        m4, m1
+    ABS2         m3, m4, m5, m6
+    paddw        m3, m4 ; m3 = |prev[prefs]-cur[prefs]| + |prev[mrefs]-cur[mrefs]|
+    psrlw        m2, 1
+    psrlw        m3, 1
+    pmaxsw       m2, m3 ; m2 = max of the two halved differences so far
+    LOAD%4       m3, [nextq+t0*%5] ; m3 = next[prefs]
+    LOAD%4       m4, [nextq+t1*%5] ; m4 = next[mrefs]
+    psubw        m3, m0
+    psubw        m4, m1
+    ABS2         m3, m4, m5, m6
+    paddw        m3, m4 ; m3 = |next[prefs]-cur[prefs]| + |next[mrefs]-cur[mrefs]|
+    psrlw        m3, 1
+    pmaxsw       m2, m3 ; m2 = max of all three halved differences
+
+    LOAD%4       m3, [%2+t0*2*%5] ; offsets 2*prefs / 2*mrefs are derived from t0/t1; the prefs2/mrefs2 arguments are never read
+    LOAD%4       m4, [%3+t0*2*%5]
+    LOAD%4       m5, [%2+t1*2*%5]
+    LOAD%4       m6, [%3+t1*2*%5]
+    paddw        m3, m4 ; m3 = %2[2*prefs] + %3[2*prefs]
+    paddw        m5, m6 ; m5 = %2[2*mrefs] + %3[2*mrefs]
+    mova         m6, m3
+    paddw        m6, m5
+    mova        m10, m6 ; m10 = sum of the four samples at +-2 lines, kept for the -3801 term
+    psrlw        m3, 1
+    psrlw        m5, 1
+    psubw        m3, m0 ; m3 = (m3 >> 1) - cur[prefs]
+    psubw        m5, m1 ; m5 = (m5 >> 1) - cur[mrefs]
+    mova         m6, m3
+    pminsw       m3, m5 ; m3 = min of the two
+    pmaxsw       m5, m6 ; m5 = max of the two
+    mova         m4, m8
+    psraw        m4, 1 ; m4 = (%2[0] + %3[0]) >> 1
+    mova         m6, m4
+    psubw        m6, m0 ; m6 = m4 - cur[prefs]
+    psubw        m4, m1 ; m4 = m4 - cur[mrefs]
+    pmaxsw       m3, m6
+    pminsw       m5, m6
+    pmaxsw       m3, m4 ; m3 = max(min-of-pair, m6, m4)
+    pminsw       m5, m4 ; m5 = min(max-of-pair, m6, m4)
+    mova         m6, m7
+    psubw        m6, m3 ; m6 = -m3 (m7 is still 0 here)
+    pmaxsw       m6, m5 ; m6 = max(-m3, m5)
+    mova         m3, m2
+    pcmpgtw      m3, m7 ; mask: lanes where the difference in m2 is > 0
+    pand         m6, m3 ; only apply m6 in those lanes
+    pmaxsw       m2, m6
+    mova        m11, m2 ; m11 = final difference, later used as the half-width of the clip range
+
+    LOAD%4       m2, [%2+t0*4*%5] ; offsets 4*prefs / 4*mrefs are derived from t0/t1; the prefs4/mrefs4 arguments are never read
+    LOAD%4       m3, [%3+t0*4*%5]
+    LOAD%4       m4, [%2+t1*4*%5]
+    LOAD%4       m5, [%3+t1*4*%5]
+    paddw        m2, m3
+    paddw        m4, m5
+    paddw        m2, m4 ; m2 = sum of the four samples at +-4 lines
+    mova         m3, m2
+    punpcklwd    m2, m8 ; interleave (sum at +-4, m8) word pairs, low half
+    punpckhwd    m3, m8 ; same, high half
+    pmaddwd      m2, [pw_coefhf] ; dwords: 1016 * sum(+-4) + 5570 * m8
+    pmaddwd      m3, [pw_coefhf]
+    mova         m4, m10
+    mova         m6, m4
+    pmullw       m4, [pw_coefhf1] ; low 16 bits of m10 * -3801
+    pmulhw       m6, [pw_coefhf1] ; high 16 bits of m10 * -3801
+    mova         m5, m4
+    punpcklwd    m4, m6 ; rebuild the 32-bit products, low half
+    punpckhwd    m5, m6 ; rebuild the 32-bit products, high half
+    paddd        m2, m4
+    paddd        m3, m5 ; m2/m3 = 1016*sum(+-4) + 5570*m8 - 3801*m10
+    psrad        m2, 2
+    psrad        m3, 2
+
+    mova         m4, m0 ; m4 = cur[prefs]
+    paddw        m0, m1 ; m0 = cur[prefs] + cur[mrefs]
+%if ARCH_X86_64
+    LOAD%4       m5, [curq+t2*%5] ; m5 = cur[prefs3]
+    LOAD%4       m6, [curq+t3*%5] ; m6 = cur[mrefs3]
+%else ; x86_32 has only t0/t1: load prefs3/mrefs3 into them temporarily
+    mov          r4, prefs3mp
+    mov          r5, mrefs3mp
+    LOAD%4       m5, [curq+t0*%5] ; m5 = cur[prefs3]
+    LOAD%4       m6, [curq+t1*%5] ; m6 = cur[mrefs3]
+    mov          r4, prefsmp ; restore t0 = prefs for the next iteration
+    mov          r5, mrefsmp ; restore t1 = mrefs for the next iteration
+%endif
+    paddw        m6, m5 ; m6 = cur[prefs3] + cur[mrefs3]
+    psubw        m1, m4
+    ABS1         m1, m4 ; m1 = |cur[mrefs] - cur[prefs]|
+    pcmpgtw      m1, m9 ; mask: lanes where that exceeds |%2[0] - %3[0]|
+    mova         m4, m1
+    punpcklwd    m1, m4 ; widen the word mask to dwords, low half
+    punpckhwd    m4, m4 ; widen the word mask to dwords, high half
+    pand         m2, m1 ; keep the coef_hf term only in masked lanes
+    pand         m3, m4
+    mova         m5, [pw_splfdif]
+    mova         m7, m5 ; m7 is used as a temporary from here on (re-zeroed before DISP)
+    pand         m5, m1
+    pand         m7, m4
+    paddw        m5, [pw_coefsp] ; weights: (4309, -213) in masked lanes, (5077, -981) elsewhere
+    paddw        m7, [pw_coefsp]
+    mova         m4, m0
+    punpcklwd    m0, m6 ; interleave (cur[prefs]+cur[mrefs], cur[prefs3]+cur[mrefs3]) pairs, low half
+    punpckhwd    m4, m6 ; same, high half
+    pmaddwd      m0, m5
+    pmaddwd      m4, m7
+    paddd        m2, m0 ; add the masked coef_hf term
+    paddd        m3, m4
+    psrad        m2, 13
+    psrad        m3, 13
+    packssdw     m2, m3 ; dwords -> words with signed saturation; m2 = interpolated value
+
+    mova         m4, m8
+    psraw        m4, 1 ; m4 = (%2[0] + %3[0]) >> 1
+    mova         m0, m11
+    mova         m3, m4
+    psubw        m4, m0 ; m4 = average - difference
+    paddw        m3, m0 ; m3 = average + difference
+    CLIPW        m2, m4, m3 ; limit the interpolated value to [m4, m3]
+    pxor         m7, m7 ; restore m7 = 0 for DISP12 and the next iteration's loads
+    DISP%4
+
+    add        dstq, STEP ; advance all row pointers by STEP bytes
+    add       prevq, STEP
+    add        curq, STEP
+    add       nextq, STEP
+    sub    DWORD wm, mmsize/2 ; mmsize/2 samples were processed
+    jg .loop%1 ; loop while the remaining width is > 0
+%endmacro
+
+%macro PROC 2 ; function body shared by both bit depths: %1 = 8 or 12 (LOAD/DISP variant), %2 = bytes per sample
+%if ARCH_X86_64
+    movsxd       r5, DWORD prefsm ; sign-extend the int line offsets so they can be used in 64-bit addressing
+    movsxd       r6, DWORD mrefsm
+    movsxd       r7, DWORD prefs3m
+    movsxd       r8, DWORD mrefs3m
+    DECLARE_REG_TMP 5, 6, 7, 8 ; t0..t3 = prefs, mrefs, prefs3, mrefs3
+%else ; x86_32: cglobal declares only 8 SIMD registers, so m8-m11 are mapped to the stack area it reserves
+    %define m8  [rsp+ 0]
+    %define m9  [rsp+16]
+    %define m10 [rsp+32]
+    %define m11 [rsp+48]
+    mov          r4, prefsmp
+    mov          r5, mrefsmp
+    DECLARE_REG_TMP 4, 5 ; t0/t1 = prefs/mrefs; FILTER reloads them for prefs3/mrefs3
+%endif
+    cmp DWORD paritym, 0
+    je .parity0
+    FILTER 1, prevq, curq, %1, %2 ; parity != 0: filter with the (prev, cur) pair
+    jmp .ret
+.parity0:
+    FILTER 0, curq, nextq, %1, %2 ; parity == 0: filter with the (cur, next) pair
+.ret:
+    RET
+%endmacro
+
+%macro BWDIF 0 ; emits bwdif_filter_line and bwdif_filter_line_12bit for the instruction set selected by the preceding INIT_*
+%if ARCH_X86_64
+cglobal bwdif_filter_line, 4, 9, 12, 0, dst, prev, cur, next, w, prefs, \
+                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
+                                        prefs4, mrefs4, parity, clip_max
+%else ; x86_32: 64 bytes of stack back the m8-m11 slots defined in PROC
+cglobal bwdif_filter_line, 4, 6, 8, 64, dst, prev, cur, next, w, prefs, \
+                                        mrefs, prefs2, mrefs2, prefs3, mrefs3, \
+                                        prefs4, mrefs4, parity, clip_max
+%endif
+    %define STEP mmsize/2
+    PROC 8, 1 ; 8-bit path: 1 byte per sample, pointers advance STEP = mmsize/2 bytes per iteration
+
+%if ARCH_X86_64
+cglobal bwdif_filter_line_12bit, 4, 9, 13, 0, dst, prev, cur, next, w, \
+                                              prefs, mrefs, prefs2, mrefs2, \
+                                              prefs3, mrefs3, prefs4, \
+                                              mrefs4, parity, clip_max
+    movd        m12, DWORD clip_maxm ; load the clip_max argument
+    SPLATW      m12, m12, 0 ; SPLATW is an x86util.asm helper, presumably broadcasting word 0; m12 is the upper bound in DISP12
+%else ; x86_32: 80 bytes of stack = m8-m11 slots plus one slot for m12
+cglobal bwdif_filter_line_12bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
+                                              prefs, mrefs, prefs2, mrefs2, \
+                                              prefs3, mrefs3, prefs4, \
+                                              mrefs4, parity, clip_max
+    %define m12 [rsp+64]
+    movd         m0, DWORD clip_maxm ; load the clip_max argument
+    SPLATW       m0, m0, 0
+    mova        m12, m0 ; store the broadcast clip_max in its stack slot
+%endif
+    %define STEP mmsize
+    PROC 12, 2 ; high-bit-depth path: 2 bytes per sample, pointers advance STEP = mmsize bytes per iteration
+%endmacro
+
+INIT_XMM ssse3
+BWDIF ; provides ff_bwdif_filter_line_ssse3 and ff_bwdif_filter_line_12bit_ssse3 declared in vf_bwdif_init.c
+INIT_XMM sse2
+BWDIF ; provides ff_bwdif_filter_line_sse2 and ff_bwdif_filter_line_12bit_sse2
+%if ARCH_X86_32
+INIT_MMX mmxext
+BWDIF ; MMXEXT versions are built for 32-bit x86 only; vf_bwdif_init.c guards their use the same way
+%endif
diff --git a/libavfilter/x86/vf_bwdif_init.c b/libavfilter/x86/vf_bwdif_init.c
new file mode 100644
index 0000000..1cb8438
--- /dev/null
+++ b/libavfilter/x86/vf_bwdif_init.c
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2016 Thomas Mundt <loudmax at yahoo.de>
+ *
+ * This file is part of FFmpeg.
+ *
+ * FFmpeg is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * FFmpeg is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with FFmpeg; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/attributes.h"
+#include "libavutil/cpu.h"
+#include "libavutil/mem.h"
+#include "libavutil/x86/asm.h"
+#include "libavutil/x86/cpu.h"
+#include "libavfilter/bwdif.h"
+
+void ff_bwdif_filter_line_mmxext(void *dst, void *prev, void *cur, void *next,
+                                 int w, int prefs, int mrefs, int prefs2,
+                                 int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                 int mrefs4, int parity, int clip_max); /* 8-bit, MMXEXT; vf_bwdif.asm only builds it when ARCH_X86_32 */
+void ff_bwdif_filter_line_sse2(void *dst, void *prev, void *cur, void *next,
+                               int w, int prefs, int mrefs, int prefs2,
+                               int mrefs2, int prefs3, int mrefs3, int prefs4,
+                               int mrefs4, int parity, int clip_max); /* 8-bit, SSE2 */
+void ff_bwdif_filter_line_ssse3(void *dst, void *prev, void *cur, void *next,
+                                int w, int prefs, int mrefs, int prefs2,
+                                int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                int mrefs4, int parity, int clip_max); /* 8-bit, SSSE3 */
+
+void ff_bwdif_filter_line_12bit_mmxext(void *dst, void *prev, void *cur, void *next,
+                                       int w, int prefs, int mrefs, int prefs2,
+                                       int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                       int mrefs4, int parity, int clip_max); /* 9..12 bit, MMXEXT; vf_bwdif.asm only builds it when ARCH_X86_32 */
+void ff_bwdif_filter_line_12bit_sse2(void *dst, void *prev, void *cur, void *next,
+                                     int w, int prefs, int mrefs, int prefs2,
+                                     int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                     int mrefs4, int parity, int clip_max); /* 9..12 bit, SSE2 */
+void ff_bwdif_filter_line_12bit_ssse3(void *dst, void *prev, void *cur, void *next,
+                                      int w, int prefs, int mrefs, int prefs2,
+                                      int mrefs2, int prefs3, int mrefs3, int prefs4,
+                                      int mrefs4, int parity, int clip_max); /* 9..12 bit, SSSE3 */
+
+av_cold void ff_bwdif_init_x86(BWDIFContext *bwdif) /* replace bwdif->filter_line with an x86 SIMD version where possible */
+{
+    int cpu_flags = av_get_cpu_flags();
+    int bit_depth = (!bwdif->csp) ? 8 : bwdif->csp->comp[0].depth; /* treated as 8-bit when no pixel format descriptor is set */
+
+    if (bit_depth <= 8) { /* 8-bit samples */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            bwdif->filter_line = ff_bwdif_filter_line_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags)) /* each later match overwrites the earlier choice */
+            bwdif->filter_line = ff_bwdif_filter_line_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            bwdif->filter_line = ff_bwdif_filter_line_ssse3;
+    } else if (bit_depth <= 12) { /* 9..12 bit; deeper formats keep the filter_line the caller already set */
+#if ARCH_X86_32
+        if (EXTERNAL_MMXEXT(cpu_flags))
+            bwdif->filter_line = ff_bwdif_filter_line_12bit_mmxext;
+#endif /* ARCH_X86_32 */
+        if (EXTERNAL_SSE2(cpu_flags)) /* each later match overwrites the earlier choice */
+            bwdif->filter_line = ff_bwdif_filter_line_12bit_sse2;
+        if (EXTERNAL_SSSE3(cpu_flags))
+            bwdif->filter_line = ff_bwdif_filter_line_12bit_ssse3;
+    }
+}
-- 
1.9.2




More information about the ffmpeg-devel mailing list