[FFmpeg-devel] [FFMpeg-Devel][GSoC][PATCH 1/2] postproc: Updated postprocess_template to use new sse/avx deinterlace functions

Tucker DiNapoli t.dinapoli42 at gmail.com
Thu Apr 23 06:20:37 CEST 2015


A few notes on changes from the last version of this patch.
The main issue with the previous code was in the sse2/avx2
implementation of the blockCopy function, so for the time being the MMX2
version is used instead. I tried to wrap the MMX2 version in a function,
but that did not work; my best guess as to why is alignment, but I am not
certain. The way it's done now is a bit ugly, but it works, and I haven't
had time to track down the underlying issue yet.
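
For context, the workaround just tiles the existing 8-byte-wide MMX2 copy
across the wider strip that the sse2/avx2 deinterlacers consume. Roughly
(a sketch of the avx2 case; the patch itself unrolls this by hand):

    /* cover a 32-byte-wide strip with four 8-byte MMX2 block copies */
    int i;
    for (i = 0; i < 32; i += 8)
        blockCopy_MMX2(dstBlock + dstStride*copyAhead + i, dstStride,
                       srcBlock + srcStride*copyAhead + i, srcStride,
                       mode & LEVEL_FIX, &c.packedYOffset);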

This commit adds several new files containing yasm assembly code (a short
usage sketch follows the list):
PPContext.asm: defines the PPContext struct using the yasm struc command
PPUtil.asm: various utility macros used in the other asm code
block_copy.asm: implements the block copy function; the sse2 and avx2
versions copy multiple blocks at once.
deinterlace.asm: contains implementations of the deinterlacing filters
with support for sse2 and avx2.
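
The struc definitions exist so that the asm can address PPContext fields
by name instead of by hard-coded offsets; the offsets still have to be
kept in sync with the C struct by hand. Usage looks roughly like this
(a sketch; the field names are the ones defined in PPContext.asm):

    %include "PPContext.asm"
    ;; r0 holds the PPContext pointer
    mova m0, [r0 + PPContext.QP_block]
    mova [r0 + PPContext.pQPb_block], m0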

Adding these new functions to postprocess_template entailed adding a new
template for AVX2 and modifying the current SSE2 template to use the
sse2 functions. A new deInterlace function was added to move the logic
of choosing which deinterlace filter to run out of the postProcess
function and to make adding the new functions easier. The inline code
for packing QP into pQPb was moved into a separate asm file and updated
for sse2/avx2.
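
Packing QP just broadcasts the byte-sized quantizer into every byte of
the packed value the filters test against; in C terms the effect is
roughly this (a sketch, not the actual implementation):

    /* replicate the low byte of QP across pQPb_block[qp_index] */
    uint64_t qp = c->QP_block[qp_index] & 0xFF;
    c->pQPb_block[qp_index] = qp * 0x0101010101010101ULL;

The sse2 version extends the same broadcast across the full 16-byte
register.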

Currently the sse2/avx2 deinterlace filters don't produce results that
are bit-exact with the C results, so I've modified one of the
postprocessing tests so that only the C functions are tested by fate.
Ultimately either the sse2/avx2 code will need to be fixed or different
tests will need to be added. I'm not sure if this is a problem with my
code, a problem inherent in using sse2/avx2, or a problem that results
from the deinterlacing being done blockwise.
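
One generic source of such mismatches, offered purely as an illustration
and not as a confirmed diagnosis: packed-average instructions like pavgb
round up at every step, so a nested average need not equal a single
exact weighted average computed in C:

    /* pavgb(a,b) = (a + b + 1) >> 1; for a = 0, b = 0, c = 1: */
    unsigned avg_ac = (0 + 1 + 1) >> 1;        /* = 1 */
    unsigned nested = (avg_ac + 0 + 1) >> 1;   /* = 1 */
    unsigned exact  = (0 + 2*0 + 1 + 2) >> 2;  /* = 0 */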
---
 libpostproc/postprocess.c          |  16 +-
 libpostproc/postprocess_internal.h |  11 +-
 libpostproc/postprocess_template.c | 285 +++++++++++++++++++++++------
 libpostproc/x86/Makefile           |   2 +
 libpostproc/x86/PPContext.asm      |  77 ++++++++
 libpostproc/x86/PPUtil.asm         | 224 +++++++++++++++++++++++
 libpostproc/x86/block_copy.asm     | 132 ++++++++++++++
 libpostproc/x86/deinterlace.asm    | 359 +++++++++++++++++++++++++++++++++++++
 tests/fate/filter-video.mak        |   2 +-
 9 files changed, 1051 insertions(+), 57 deletions(-)
 create mode 100644 libpostproc/x86/Makefile
 create mode 100644 libpostproc/x86/PPContext.asm
 create mode 100644 libpostproc/x86/PPUtil.asm
 create mode 100644 libpostproc/x86/block_copy.asm
 create mode 100644 libpostproc/x86/deinterlace.asm

diff --git a/libpostproc/postprocess.c b/libpostproc/postprocess.c
index af70bb3..20df267 100644
--- a/libpostproc/postprocess.c
+++ b/libpostproc/postprocess.c
@@ -542,8 +542,13 @@ static av_always_inline void do_a_deblock_C(uint8_t *src, int step,
 #        include "postprocess_template.c"
 #        define TEMPLATE_PP_SSE2 1
 #        include "postprocess_template.c"
+#        define TEMPLATE_PP_AVX2 1
+#        include "postprocess_template.c"
 #    else
-#        if HAVE_SSE2_INLINE
+#        if HAVE_AVX2_INLINE
+#            define TEMPLATE_PP_AVX2 1
+#            include "postprocess_template.c"
+#        elif HAVE_SSE2_INLINE
 #            define TEMPLATE_PP_SSE2 1
 #            include "postprocess_template.c"
 #        elif HAVE_MMXEXT_INLINE
@@ -574,7 +579,8 @@ static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[]
 #if CONFIG_RUNTIME_CPUDETECT
 #if ARCH_X86 && HAVE_INLINE_ASM
         // ordered per speed fastest first
-        if      (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_SSE2;
+        if      (c->cpuCaps & AV_CPU_FLAG_AVX2)     pp = postProcess_avx2;
+        else if (c->cpuCaps & AV_CPU_FLAG_SSE2)     pp = postProcess_sse2;
         else if (c->cpuCaps & AV_CPU_FLAG_MMXEXT)   pp = postProcess_MMX2;
         else if (c->cpuCaps & AV_CPU_FLAG_3DNOW)    pp = postProcess_3DNow;
         else if (c->cpuCaps & AV_CPU_FLAG_MMX)      pp = postProcess_MMX;
@@ -582,8 +588,10 @@ static inline void postProcess(const uint8_t src[], int srcStride, uint8_t dst[]
         if      (c->cpuCaps & AV_CPU_FLAG_ALTIVEC)  pp = postProcess_altivec;
 #endif
 #else /* CONFIG_RUNTIME_CPUDETECT */
-#if     HAVE_SSE2_INLINE
-        pp = postProcess_SSE2;
+#if     HAVE_AVX2_INLINE
+        pp = postProcess_avx2;
+#elif     HAVE_SSE2_INLINE
+        pp = postProcess_sse2;
 #elif   HAVE_MMXEXT_INLINE
         pp = postProcess_MMX2;
 #elif HAVE_AMD3DNOW_INLINE
diff --git a/libpostproc/postprocess_internal.h b/libpostproc/postprocess_internal.h
index c1a306d..022f87e 100644
--- a/libpostproc/postprocess_internal.h
+++ b/libpostproc/postprocess_internal.h
@@ -180,5 +180,14 @@ static inline void linecpy(void *dest, const void *src, int lines, int stride) {
         memcpy((uint8_t*)dest+(lines-1)*stride, (const uint8_t*)src+(lines-1)*stride, -lines*stride);
     }
 }
-
+extern void ff_deInterlaceInterpolateLinear_mmx2(uint8_t *, int);
+extern void ff_deInterlaceInterpolateCubic_mmx2(uint8_t *, int);
+extern void ff_deInterlaceFF_mmx2(uint8_t *, int, uint8_t *);
+extern void ff_deInterlaceL5_mmx2(uint8_t *, int,
+                                  uint8_t *, uint8_t *);
+extern void ff_deInterlaceBlendLinear_mmx2(uint8_t *, int, uint8_t *);
+extern void ff_deInterlaceMedian_mmx2(uint8_t *, int);
+extern void ff_blockCopy_mmx2(uint8_t *, int, const uint8_t *, int, int, int64_t *);
+extern void ff_packQP_mmx2(PPContext *);
+extern void ff_packQP_sse2(PPContext *);
 #endif /* POSTPROC_POSTPROCESS_INTERNAL_H */
diff --git a/libpostproc/postprocess_template.c b/libpostproc/postprocess_template.c
index b7296c4..fd94255 100644
--- a/libpostproc/postprocess_template.c
+++ b/libpostproc/postprocess_template.c
@@ -69,10 +69,21 @@
 #   define TEMPLATE_PP_MMX 1
 #   undef  TEMPLATE_PP_MMXEXT
 #   define TEMPLATE_PP_MMXEXT 1
-#   define RENAME(a) a ## _SSE2
+#   define RENAME(a) a ## _sse2
 #else
 #   define TEMPLATE_PP_SSE2 0
 #endif
+#ifdef TEMPLATE_PP_AVX2
+#   undef  TEMPLATE_PP_MMX
+#   define TEMPLATE_PP_MMX 1
+#   undef  TEMPLATE_PP_MMXEXT
+#   define TEMPLATE_PP_MMXEXT 1
+#   undef  TEMPLATE_PP_SSE2
+#   define TEMPLATE_PP_SSE2 1
+#   define RENAME(a) a ## _avx2
+#else
+#   define TEMPLATE_PP_AVX2 0
+#endif
 
 #undef REAL_PAVGB
 #undef PAVGB
@@ -102,7 +113,64 @@
     "psubusb " #a ", " #b " \n\t"\
     "paddb " #a ", " #b " \n\t"
 #endif
+// inline wrapper functions around the yasm SIMD code
 
+#if TEMPLATE_PP_SSE2
+extern void RENAME(ff_deInterlaceInterpolateLinear)(uint8_t *, int);
+extern void RENAME(ff_deInterlaceInterpolateCubic)(uint8_t *, int);
+extern void RENAME(ff_deInterlaceFF)(uint8_t *, int, uint8_t *);
+extern void RENAME(ff_deInterlaceL5)(uint8_t *, int, uint8_t *, uint8_t*);
+extern void RENAME(ff_deInterlaceBlendLinear)(uint8_t *, int, uint8_t *);
+extern void RENAME(ff_deInterlaceMedian)(uint8_t *, int);
+extern void RENAME(ff_blockCopy)(uint8_t *, int, const uint8_t *,
+                                 int, int, int64_t *);
+extern void RENAME(ff_duplicate)(uint8_t*, int);
+static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[],
+                                                        int stride)
+{
+    RENAME(ff_deInterlaceInterpolateLinear)(src, stride);
+}
+static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[],
+                                                        int stride)
+{
+    RENAME(ff_deInterlaceInterpolateCubic)(src, stride);
+}
+static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride,
+                                         uint8_t *tmp)
+{
+    RENAME(ff_deInterlaceFF)(src, stride, tmp);
+}
+static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride,
+                                         uint8_t *tmp, uint8_t *tmp2)
+{
+    RENAME(ff_deInterlaceL5)(src, stride, tmp, tmp2);
+}
+static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride,
+                                                  uint8_t *tmp)
+{
+    RENAME(ff_deInterlaceBlendLinear)(src, stride, tmp);
+}
+static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride)
+{
+    RENAME(ff_deInterlaceMedian)(src, stride);
+}
+/*
+static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride,
+                                     const uint8_t src[], int srcStride,
+                                     int levelFix, int64_t *packedOffsetAndScale)
+{
+    RENAME(ff_blockCopy)(dst,dstStride,src,srcStride,
+                         levelFix,packedOffsetAndScale);
+}*/
+static inline void RENAME(duplicate)(uint8_t *src, int stride)
+{
+/*    RENAME(ff_duplicate)(src, stride);*/
+    int i;
+    for(i=0;i<32;i+=8){
+        duplicate_MMX2(src+i, stride);
+    }
+}
+#else
 //FIXME? |255-0| = 1 (should not be a problem ...)
 #if TEMPLATE_PP_MMX
 /**
@@ -3241,6 +3309,7 @@ static inline void RENAME(duplicate)(uint8_t src[], int stride)
     }
 #endif
 }
+#endif //initial TEMPLATE_PP_SSE2
 
 #if ARCH_X86 && TEMPLATE_PP_MMXEXT
 static inline void RENAME(prefetchnta)(const void *p)
@@ -3305,6 +3374,44 @@ static inline void RENAME(prefetcht2)(const void *p)
     return;
 }
 #endif
+/*
+  This calls a rather trivial assembly function. There is some performance
+  overhead in the function call compared to inline asm, but (at least in my
+  view) having less inline asm is worth it.
+*/
+#if TEMPLATE_PP_MMX
+static inline void RENAME(packQP)(PPContext *c)
+{
+#if TEMPLATE_PP_SSE2
+    ff_packQP_sse2(c);
+#else //despite the name, ff_packQP_mmx2 only uses MMX instructions
+    ff_packQP_mmx2(c);
+#endif
+}
+#endif
+
+static inline void RENAME(deInterlace)(uint8_t *dstBlock, int dstStride,
+                                       uint8_t *tmp, uint8_t *tmp2,
+                                       int mode, const int duplicate)
+{
+    if(duplicate){
+        RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
+    }
+    if(mode & LINEAR_IPOL_DEINT_FILTER){
+        RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
+    } else if(mode & LINEAR_BLEND_DEINT_FILTER){
+        RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, tmp);
+    } else if(mode & MEDIAN_DEINT_FILTER){
+        RENAME(deInterlaceMedian)(dstBlock, dstStride);
+    } else if(mode & CUBIC_IPOL_DEINT_FILTER){
+        RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
+    } else if(mode & FFMPEG_DEINT_FILTER){
+        RENAME(deInterlaceFF)(dstBlock, dstStride, tmp);
+    } else if(mode & LOWPASS5_DEINT_FILTER){
+        RENAME(deInterlaceL5)(dstBlock, dstStride, tmp, tmp2);
+    }
+}
+
 /**
  * Filter array of bytes (Y or U or V values)
  */
@@ -3421,6 +3528,10 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
         QPCorrecture= 256*256;
     }
 
+#if TEMPLATE_PP_SSE2
+#undef RENAME
+#define RENAME(x) x ## _MMX2
+#endif
     /* copy & deinterlace first row of blocks */
     y=-BLOCK_SIZE;
     {
@@ -3436,26 +3547,60 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
             RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
             RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
 
+#if TEMPLATE_PP_AVX2
+            if(x + BLOCK_SIZE*4 <= width){
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead, dstStride,
+                               srcBlock + srcStride*copyAhead, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 8, dstStride,
+                               srcBlock + srcStride*copyAhead + 8, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 16, dstStride,
+                               srcBlock + srcStride*copyAhead + 16, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 24, dstStride,
+                               srcBlock + srcStride*copyAhead + 24, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+
+                RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                    c.deintTemp + width + x, mode, 1);
+                dstBlock+=24;
+                srcBlock+=24;
+                //slightly hacky: the loop increment covers the 4th block
+                x+=3*BLOCK_SIZE;
+            } else {
+                blockCopy_MMX2(dstBlock + dstStride*8, dstStride,
+                               srcBlock + srcStride*8, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                deInterlace_MMX2(dstBlock, dstStride, c.deintTemp + x,
+                                 c.deintTemp + width + x, mode, 1);
+            }
+#elif TEMPLATE_PP_SSE2
+            if(x + BLOCK_SIZE*2 <= width){
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead, dstStride,
+                               srcBlock + srcStride*copyAhead, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 8, dstStride,
+                               srcBlock + srcStride*copyAhead + 8, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                    c.deintTemp + width + x, mode, 1);
+                dstBlock+=8;
+                srcBlock+=8;
+                x+=BLOCK_SIZE;
+            } else {
+                blockCopy_MMX2(dstBlock + dstStride*8, dstStride,
+                               srcBlock + srcStride*8, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                deInterlace_MMX2(dstBlock, dstStride, c.deintTemp + x,
+                                 c.deintTemp + width + x, mode, 1);
+            }
+#else
             RENAME(blockCopy)(dstBlock + dstStride*8, dstStride,
                               srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
-
-            RENAME(duplicate)(dstBlock + dstStride*8, dstStride);
-
-            if(mode & LINEAR_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
-            else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & MEDIAN_DEINT_FILTER)
-                RENAME(deInterlaceMedian)(dstBlock, dstStride);
-            else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
-            else if(mode & FFMPEG_DEINT_FILTER)
-                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & LOWPASS5_DEINT_FILTER)
-                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
-/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
-*/
+            RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                c.deintTemp + width + x, mode, 1);
+#endif
             dstBlock+=8;
             srcBlock+=8;
         }
@@ -3524,47 +3669,84 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
             c.QP_block[qp_index] = QP;
             c.nonBQP_block[qp_index] = nonBQP;
 #if TEMPLATE_PP_MMX
-            __asm__ volatile(
-                "movd %1, %%mm7         \n\t"
-                "packuswb %%mm7, %%mm7  \n\t" // 0, 0, 0, QP, 0, 0, 0, QP
-                "packuswb %%mm7, %%mm7  \n\t" // 0,QP, 0, QP, 0,QP, 0, QP
-                "packuswb %%mm7, %%mm7  \n\t" // QP,..., QP
-                "movq %%mm7, %0         \n\t"
-                : "=m" (c.pQPb_block[qp_index])
-                : "r" (QP)
-            );
+            RENAME(packQP)(&c);
 #endif
-            }
-          for(; x < endx; x+=BLOCK_SIZE){
-            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
-            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
-            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
-            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
-
+            }
+          for(; x < endx; x+=BLOCK_SIZE){
+            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead)*srcStride + 32);
+            RENAME(prefetchnta)(srcBlock + (((x>>2)&6) + copyAhead+1)*srcStride + 32);
+            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead)*dstStride + 32);
+            RENAME(prefetcht0)(dstBlock + (((x>>2)&6) + copyAhead+1)*dstStride + 32);
+#if TEMPLATE_PP_AVX2
+            if(x + BLOCK_SIZE*4 <= endx){
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead, dstStride,
+                               srcBlock + srcStride*copyAhead, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 8, dstStride,
+                               srcBlock + srcStride*copyAhead + 8, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 16, dstStride,
+                               srcBlock + srcStride*copyAhead + 16, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 24, dstStride,
+                               srcBlock + srcStride*copyAhead + 24, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+
+                RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                    c.deintTemp + width + x, mode, 0);
+                dstBlock+=24;
+                srcBlock+=24;
+                //slightly hacky: the loop increment covers the 4th block
+                x+=3*BLOCK_SIZE;
+            } else {
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead, dstStride,
+                               srcBlock + srcStride*copyAhead, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                deInterlace_MMX2(dstBlock, dstStride, c.deintTemp + x,
+                                 c.deintTemp + width + x, mode, 0);
+            }
+#elif TEMPLATE_PP_SSE2
+            if(x + BLOCK_SIZE*2 <= endx){
+/*              RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
+                                  srcBlock + srcStride*copyAhead, srcStride,
+                                  mode & LEVEL_FIX, &c.packedYOffset);
+                RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                    c.deintTemp + width + x, mode, 0);*/
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead, dstStride,
+                               srcBlock + srcStride*copyAhead, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead + 8, dstStride,
+                               srcBlock + srcStride*copyAhead + 8, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                    c.deintTemp + width + x, mode, 0);
+                dstBlock+=8;
+                srcBlock+=8;
+                x+=BLOCK_SIZE;
+            } else {
+                blockCopy_MMX2(dstBlock + dstStride*copyAhead, dstStride,
+                               srcBlock + srcStride*copyAhead, srcStride,
+                               mode & LEVEL_FIX, &c.packedYOffset);
+                deInterlace_MMX2(dstBlock, dstStride, c.deintTemp + x,
+                                 c.deintTemp + width + x, mode, 0);
+            }
+#else
             RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride,
                               srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset);
-
-            if(mode & LINEAR_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride);
-            else if(mode & LINEAR_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & MEDIAN_DEINT_FILTER)
-                RENAME(deInterlaceMedian)(dstBlock, dstStride);
-            else if(mode & CUBIC_IPOL_DEINT_FILTER)
-                RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride);
-            else if(mode & FFMPEG_DEINT_FILTER)
-                RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x);
-            else if(mode & LOWPASS5_DEINT_FILTER)
-                RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x);
-/*          else if(mode & CUBIC_BLEND_DEINT_FILTER)
-                RENAME(deInterlaceBlendCubic)(dstBlock, dstStride);
-*/
+            RENAME(deInterlace)(dstBlock, dstStride, c.deintTemp + x,
+                                c.deintTemp + width + x, mode, 0);
+#endif
             dstBlock+=8;
             srcBlock+=8;
           }
 
           dstBlock = dstBlockStart;
           srcBlock = srcBlockStart;
+//switch RENAME back to the MMX2 versions when using sse2 or avx2
+#if TEMPLATE_PP_SSE2
+#undef RENAME
+#define RENAME(a) a ## _MMX2
+#endif
 
           for(x = startx, qp_index = 0; x < endx; x+=BLOCK_SIZE, qp_index++){
             const int stride= dstStride;
@@ -3744,3 +3926,4 @@ static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[
 #undef TEMPLATE_PP_MMXEXT
 #undef TEMPLATE_PP_3DNOW
 #undef TEMPLATE_PP_SSE2
+#undef TEMPLATE_PP_AVX2
diff --git a/libpostproc/x86/Makefile b/libpostproc/x86/Makefile
new file mode 100644
index 0000000..8a7503b
--- /dev/null
+++ b/libpostproc/x86/Makefile
@@ -0,0 +1,2 @@
+YASM-OBJS-$(CONFIG_POSTPROC) += x86/deinterlace.o
+YASM-OBJS-$(CONFIG_POSTPROC) += x86/block_copy.o
diff --git a/libpostproc/x86/PPContext.asm b/libpostproc/x86/PPContext.asm
new file mode 100644
index 0000000..d691db0
--- /dev/null
+++ b/libpostproc/x86/PPContext.asm
@@ -0,0 +1,77 @@
+;*
+;* Definition of the PPContext and PPMode structs in assembly
+;* Copyright (C) 2015 Tucker DiNapoli (T.Dinapoli at gmail.com)
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*
+%if ARCH_X86_64
+%define pointer resq
+%else
+%define pointer resd
+%endif
+struc PPMode
+    .lum_mode: resd 1
+    .chrom_mode: resd 1
+    .error: resd 1
+    .min_allowed_y: resd 1
+    .max_allowed_y: resd 1
+    .max_clipped_threshold: resd 1
+    .max_tmp_noise: resd 3
+    .base_dc_diff: resd 1
+    .flatness_threshold: resd 1
+    .forced_quant: resd 1
+endstruc
+
+struc PPContext
+    .av_class pointer 1
+    .temp_blocks pointer 1
+    .y_histogram pointer 1
+    alignb 8
+    .packed_yoffset resq 1
+    .packed_yscale resq 1; 8 byte aligned by default
+    .temp_blurred pointer 3
+    .temp_blurred_past pointer 3
+    .temp_dst pointer 1
+    .temp_src pointer 1
+    .deint_temp pointer 1
+    alignb 8
+    .pQPb resq 1
+    .pQPb2 resq 1
+    alignb 32
+    .pQPb_block resq 4
+    alignb 32
+    .pQPb2_block resq 4
+;; These next fields & next alignment may need to be changed for 128/256 bit registers
+    alignb 32
+    .mmx_dc_offset resq 64
+    .mmx_dc_threshold resq 64
+    .std_QP_table pointer 1
+    .non_BQP_table pointer 1
+    .forced_QP_table pointer 1
+    .QP resd 1
+    .nonBQP resd 1
+;; check header to see if these are bytes or ints
+    .QP_block resb 4
+    .nonBQP_block resb 4
+    .frame_num resd 1
+    .cpu_caps resd 1
+    .qp_stride resd 1
+    .stride resd 1
+    .h_chroma_subsample resd 1
+    .v_chroma_subsample resd 1
+    .ppMode resb PPMode_size
+endstruc
diff --git a/libpostproc/x86/PPUtil.asm b/libpostproc/x86/PPUtil.asm
new file mode 100644
index 0000000..0288ce5
--- /dev/null
+++ b/libpostproc/x86/PPUtil.asm
@@ -0,0 +1,224 @@
+;******************************************************************************
+;*
+;* Copyright (c) 2015 Tucker DiNapoli
+;*
+;* Utility code/macros used in asm files for libpostproc
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*
+%include "libavutil/x86/x86util.asm"
+%include "PPContext.asm"
+;; Macros to simplify moving packed data
+
+;; copy low quadword to upper quadword(s)
+;; no-op for mmx
+%macro dup_low_quadword 1
+%if cpuflag(avx2)
+    vpermq %1, %1, 0x00
+%elif cpuflag(sse2)
+    pshufd %1, %1, 01000100b ;; {d0,d1,d0,d1}, i.e. q1 = q0
+%endif
+%endmacro
+;; copy low byte into all other bytes
+;; Optional argument is a temporary register to allow using pshufb
+%macro dup_low_byte 1-2
+%if cpuflag(avx2)
+;; copy the lower 128 bits to the upper 128 bits, so avx shuffles work correctly
+    vpermq %1,%1,0x00
+%endif
+;;would it be faster to use pshufb if the second argument were in memory?
+;;because we could store a zero vector in memory for use in pshufb with one argument
+%if cpuflag(ssse3) && %0 == 2
+    pxor %2,%2
+    pshufb %1,%2
+%else
+    punpcklbw %1, %1
+%if cpuflag(sse2)
+    pshuflw %1,%1, 0x00
+    pshufd %1,%1, 0x00
+%else ;; mmx
+    pshufw %1,%1, 0x00
+%endif
+%endif
+%endmacro
+
+;; It may be useful to have macros to do the following:
+;; fill each quadword with its low byte
+;; unpack the low 2-4 bytes into the low byte of each quadword
+
+
+;; move the low half of the mmx/xmm/ymm register in %2 into %1
+;; %1 should be a memory location
+%macro mov_vector_low_half 2
+%if mmsize == 32
+vextractf128 %1, %2, 0x00
+%elif mmsize == 16
+movlpd %1, %2
+%elif mmsize == 8
+movd %1, %2
+%else
+%error "mmsize defined to unsupported value"
+%endif
+%endmacro
+
+;; move the high half of the mmx/xmm/ymm register in %2 into %1
+;; %1 should be a memory location
+%macro mov_vector_high_half 2-3 m0
+%if mmsize == 32
+vextractf128 %1, %2, 0x01
+%elif mmsize == 16
+movhpd %1, %2
+%elif mmsize == 8
+;; there's no instruction, pre sse4.1, to move the high 32 bits of an mmx
+;; register, so use the optional third argument as a temporary register
+;; shift it right by 32 bits and extract the low doubleword
+movq %3, %2
+psrlq %3, 32
+movd %1, %3
+%else
+%error "mmsize defined to unsupported value"
+%endif
+%endmacro
+
+;; Macros for defining simd constants,
+;; Always defines a 256 bit, 32 byte aligned constant, which is more
+;; size/alignment than is necessary for sse/mmx, but ensures the same
+;; constant will work for all simd instruction sets
+%macro define_vector_constant 5
+%xdefine %%section __SECT__
+ SECTION .rodata
+alignb 32
+%1:
+    dq %2
+    dq %3
+    dq %4
+    dq %5
+%%section
+%endmacro
+;; convenience macro to define a simd constant where each quadword is the same
+%macro define_vector_constant 2
+    define_vector_constant %1,%2,%2,%2,%2
+%endmacro
+
+;; Macros to emulate the ptest instruction for pre-sse41 cpus
+;; Used to allow branching based on the values of simd registers
+;; set zf if dst & src == 0
+%macro ptest_neq 2-4 ;; dst, src, tmp1, tmp2 ; tmp1,2 are general purpose registers
+%if cpuflag(sse4)
+    ptest %1, %2
+%elif cpuflag(sse)
+    pcmpeqb %1, %2
+    pmovmskb %3, %1
+    test %3, %3
+%else ;;mmx
+    pand %1, %2
+    movd %3, %1
+    psrlq %1, 32
+    movd %4, %1
+    or %3, %4
+    test %3, %3
+%endif
+%endmacro
+;; set cf if dst & ~src == 0 (i.e. dst == src)
+%macro ptest_eq 2-4 ;;dst, src, tmp1, tmp2 ;tmp1,2 are general purpose registers
+%if cpuflag(sse4)
+    ptest %1, %2
+%elif cpuflag(sse)
+    pcmpeqb %1, %2
+    pmovmskb %3, %1
+    neg %3 ;;sets cf if operand is non-zero
+%else ;;mmx
+    pand %1, %2
+    movd %3, %1
+    psrlq %1, 32
+    movd %4, %1
+    or %3, %4
+    neg %3
+%endif
+%endmacro
+
+;;make pshufw work with xmm/ymm registers, via shuffling
+;;the low and high words separately
+%macro pshufw 3
+%if cpuflag(sse2) | cpuflag(avx2)
+    pshuflw %1,%2,%3
+    pshufhw %1,%2,%3
+%else
+    pshufw %1,%2,%3
+%endif
+%endmacro
+;;find the minimum/maximum byte in a simd register
+;;the pshufw's can/should probably be changed for
+;;sse/avx since each expands to two instructions there
+%macro horiz_min_max_ub 3 ;;src, tmp, op
+    mova %2, %1
+    psrlq %1, 8
+    %3 %1, %2
+    pshufw %2, %1, 0b11111001
+    %3 %1,%2
+    pshufw %2, %1, 0b11111110
+    %3 %1, %2
+%endmacro
+%macro phminub 2
+    horiz_min_max_ub %1,%2,pminub
+%endmacro
+%macro phmaxub 2
+    horiz_min_max_ub %1,%2,pmaxub
+%endmacro
+;; define packed conditional moves, of the form:
+;; pcmovXXS dst, src, arg1, arg2, tmp
+;; where XX is a comparison (eq,ne,gt,...) and S is a size (b,w,d,q)
+;; compare arg1 with arg2 (using tmp as scratch), copy src to dst,
+;; then AND dst with the resulting comparison mask
+%macro do_simd_sizes 2
+%1 %2b
+%1 %2w
+%1 %2d
+%1 %2q
+%endmacro
+;; macro generating macro
+%macro gen_pcmovxx 1
+%macro pcmov%1 4-6 ,%1 ;;dst, src, cmp1, cmp2, [tmp = cmp2]
+%if %0 == 5
+%ifnidn %5,%3
+    mova %5,%3
+%endif
+%endif
+    pcmp%6 %5,%4
+    mova %1, %2
+    pand %1, %5
+%endmacro
+%endmacro
+do_simd_sizes gen_pcmovxx,eq
+do_simd_sizes gen_pcmovxx,ne
+do_simd_sizes gen_pcmovxx,lt
+do_simd_sizes gen_pcmovxx,le
+do_simd_sizes gen_pcmovxx,gt
+do_simd_sizes gen_pcmovxx,ge
+
+
+
+define_vector_constant b01, 0x0101010101010101
+define_vector_constant b02, 0x0202020202020202
+define_vector_constant b08, 0x0808080808080808
+define_vector_constant b80, 0x8080808080808080
+define_vector_constant w04, 0x0004000400040004
+define_vector_constant w05, 0x0005000500050005
+define_vector_constant w20, 0x0020002000200020
+define_vector_constant q20, 0x0000000000000020 ;;dering threshold
+%xdefine packed_dering_threshold q20
+%assign dering_threshold 0x20
diff --git a/libpostproc/x86/block_copy.asm b/libpostproc/x86/block_copy.asm
new file mode 100644
index 0000000..febb51b
--- /dev/null
+++ b/libpostproc/x86/block_copy.asm
@@ -0,0 +1,132 @@
+;******************************************************************************
+;*
+;* Copyright (c) 2015 Tucker DiNapoli
+;*
+;* Code to copy, and optionally scale blocks
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*
+%include "PPUtil.asm"
+;; void duplicate(uint8_t *src, int stride)
+;; duplicate block_size pixels 5 times upwards
+;; this should probably ultimately be inlined
+%macro gen_duplicate 0
+cglobal duplicate, 2, 2, 1
+    neg r1
+    mova m0, [r0]
+    mova [r0 + r1 * 4], m0
+    add r0, r1
+    mova [r0], m0
+    mova [r0 + r1], m0
+    mova [r0 + r1 * 2], m0
+    mova [r0 + r1 * 4], m0
+    neg r1
+    RET
+%endmacro
+;; void blockcopy(uint8_t *dst, int dst_stride, uint8_t *src, int src_stride,
+;;                int level_fix, int64_t *packed_offset_and_scale)
+;; Copy src to dst, and possibly fix the brightness
+%macro gen_block_copy 0
+cglobal blockCopy, 6, 6, 8
+    test r4, r4
+    jz .simple
+    mova m5, [r5] ;;offset
+    mova m6, [r5 + 32] ;;scale
+    lea r5, [r2 + r3] ;;dst + dst_stride
+    lea r4, [r0 + r1] ;;src + src_stride
+;; I don't know a ton about how to properly order instructions
+;; to maximize pipelining, so this might not be optimal
+%ifnmacro scaled_copy
+%macro scaled_copy 4
+    mova m0, %1
+    mova m1, m0
+    mova m2, %2
+    mova m3, m2
+%assign i 0
+%rep 4
+    punpcklbw m %+ i,m %+ i
+    pmulhuw m %+ i, m6
+    psubw m %+ i, m5
+%assign i i+1
+%endrep
+    packuswb m0, m1
+    packuswb m2, m3
+    mova %3, m0
+    mova %4, m2
+%endmacro
+%endif
+    scaled_copy [r0], [r0 + r1], [r2], [r2 + r3]
+    scaled_copy [r0 + r1*2], [r4 + r1*2], [r2 + r3*2], [r5 + r3*2]
+    scaled_copy [r0 + r1*4], [r4 + r1*4], [r2 + r3*4], [r5 + r3*4]
+    lea r4, [r4 + r1*4]
+    lea r5, [r5 + r3*4] ;;advance by the dst stride (r3), not the r2 pointer
+    scaled_copy [r4 + r1], [r4 + r1*2], [r5 + r3], [r5 + r3*2]
+    jmp .end
+.simple: ;;just a simple memcpy
+    ;;if there's a better way to do this feel free to change it
+    ;;Any necessary prefetching is done by the caller
+    lea r4, [r0 + r1] ;;src + src_stride
+    lea r5, [r4 + r1*4] ;;src + src_stride*5
+    mova m0, [r0]
+    mova m1, [r0 + r1]
+    mova m2, [r0 + r1*2]
+    mova m3, [r4 + r1*2]
+    mova m4, [r0 + r1*4]
+    mova m5, [r4 + r1*4]
+    mova m6, [r5 + r1]
+    mova m7, [r5 + r1*2]
+    lea r4, [r2 + r3]
+    lea r5, [r4 + r3*4]
+    mova [r2], m0
+    mova [r2 + r3], m1
+    mova [r2 + r3*2], m2
+    mova [r4 + r3*2], m3
+    mova [r2 + r3*4], m4
+    mova [r4 + r3*4], m5
+    mova [r5 + r3], m6
+    mova [r5 + r3*2], m7
+.end:
+    REP_RET
+%endmacro
+
+;;this is too small to warrant its own file
+%macro gen_packQP 0
+cglobal packQP, 1,1,1
+    mova m0, [r0 + PPContext.QP_block]
+    punpcklbw m0, m0
+    punpcklwd m0, m0
+    punpckldq m0, m0
+%if cpuflag(sse2)
+    punpcklqdq m0, m0
+%endif
+    mova [r0 + PPContext.pQPb_block], m0
+    RET
+%endmacro
+
+INIT_MMX mmx2
+gen_duplicate
+gen_block_copy
+gen_packQP
+
+INIT_XMM sse2
+gen_duplicate
+gen_block_copy
+gen_packQP
+
+INIT_YMM avx2
+gen_duplicate
+gen_block_copy
diff --git a/libpostproc/x86/deinterlace.asm b/libpostproc/x86/deinterlace.asm
new file mode 100644
index 0000000..b3e0fe8
--- /dev/null
+++ b/libpostproc/x86/deinterlace.asm
@@ -0,0 +1,359 @@
+;*
+;* DeInterlacing filters written using SIMD extensions
+;* Copyright (C) 2015 Tucker DiNapoli (T.Dinapoli at gmail.com)
+;*
+;* Adapted from inline assembly:
+;* Copyright (C) 2001-2002 Michael Niedermayer (michaelni at gmx.at)
+;*
+;* This file is part of FFmpeg.
+;*
+;* FFmpeg is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* FFmpeg is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with FFmpeg; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;*
+
+%include "PPUtil.asm"
+;; All deinterlace functions operate on N 8x8 blocks at a time, where N
+;; is the size (in bytes) of the simd registers being used divided
+;; by 8: 1 for mmx, 2 for xmm, and 4 for ymm.
+
+;; Deinterlace blocks using linear interpolation
+;; Set each line 2n+1 to (line 2n + line 2n+2)/2
+%macro gen_deinterlace_interpolate_linear 0
+cglobal deInterlaceInterpolateLinear, 2, 4, 5 ;; src, stride
+    lea r0, [r0 + r1 * 4]
+    lea r2, [r0 + r1]
+    lea r3, [r2 + r1 * 4]
+
+    mova m0, [r0] ;0
+    mova m1, [r2 + r1] ;2
+    mova m2, [r0 + 4*r1] ;4
+    mova m3, [r3 + r1] ;6
+    mova m4, [r0 + 8*r1] ;8
+
+    pavgb m0, m1 ;1
+    pavgb m1, m2 ;3
+    pavgb m2, m3 ;5
+    pavgb m3, m4 ;7
+
+    mova [r2], m0
+    mova [r2 + r1 * 2], m1
+    mova [r3], m2
+    mova [r3 + r1 * 2], m3
+    RET
+%endmacro
+;; Deinterlace blocks using cubic interpolation
+;; Line 2n+1 = (9(2n) + 9(2n+2) - (2n-2) - (2n+4))/16
+%macro gen_deinterlace_interpolate_cubic 0
+cglobal deInterlaceInterpolateCubic, 2, 5, 5 ;; src, stride
+    lea r0, [r0 + r1 * 2]
+    add r0, r1
+    lea r2, [r0 + r1]
+    lea r3, [r2 + r1 * 4]
+    lea r4, [r3 + r1 * 4]
+    add r4, r1
+    pxor m4, m4
+
+;; TODO: See if there is speed to be gained by interleaving invocations of
+;; deint_cubic when we have more registers (i.e. doing the computations
+;; of two lines at once).
+%ifnmacro deint_cubic
+;; given 5 lines a,b,c,d,e: a = c-3, b = c-1, d = c+1, e = c+3
+;; set c = (9b + 9d - a - e)/16
+%macro deint_cubic 5;;L1,L2,L3,L4,L5
+    mova m0,%1
+    mova m1,%2
+    mova m2,%4
+    mova m3,%5
+    pavgb m1,m2 ;(L2+L4)/2
+    pavgb m0,m3 ;(L1+L5)/2
+
+    mova m3, m0
+    punpcklbw m0, m4
+    punpckhbw m3, m4 ;;(L1+L5)/2
+
+    mova m2, m1
+    punpcklbw m1, m4
+    punpckhbw m2, m4 ;;(L2+L4)/2
+
+    psubw m0, m1
+    psubw m3, m2 ;;(L1+L5 - (L2+L4))/2
+    psraw m0, 3
+    psraw m3, 3
+    psubw m1, m0
+    psubw m2, m3 ;(9(L2+L4) - (L1+L5))/16
+    ;; convert the words back into bytes using unsigned saturation
+    packuswb m1, m2
+    mova %3, m1
+%endmacro
+%endif
+    deint_cubic [r0], [r2 + r1], [r2 + r1 *2],\
+                [r0 + r1 *4], [r3 + r1]
+    deint_cubic [r2 + r1], [r0 + r1 * 4], [r3],\
+                [r3 + r1], [r0 + r1 * 8]
+    deint_cubic [r0 + r1 * 4], [r3 + r1], [r3 + r1 * 2],\
+                [r0 + r1 * 8], [r4]
+    deint_cubic [r3 + r1], [r0 + r1 * 8], [r3 + r1 * 4],\
+                [r4], [r4 + r1 * 2]
+    RET
+%endmacro
+
+;; deinterlace blocks by setting every line Ln to (Ln-1 + 2Ln + Ln+1)/4
+%macro gen_deinterlace_blend_linear 0
+cglobal deInterlaceBlendLinear, 3, 5, 3 ;src, stride, tmp
+    lea r0, [r0 + r1 * 4]
+    lea r3, [r0 + r1]
+    lea r4, [r3 + r1 * 4]
+
+    mova m0, [r2] ;L0 (tmp)
+    mova m1, [r3] ;L2
+    mova m2, [r0] ;L1
+    pavgb m0, m1 ;L0+L2
+    pavgb m0, m2 ;L0 + 2L1 + L2 / 4
+    mova [r0], m0
+
+    mova m0, [r3 + r1] ;L3
+    pavgb m2, m0 ;;L1+L3/2
+    pavgb m2, m1 ;;L1+L3+2L2/4
+    mova [r3], m2
+
+    mova m2, [r3 + r1 * 2]  ;L4
+    pavgb m1, m2 ;;L2+L4/2
+    pavgb m1, m0 ;;2L3+L2+L4/4
+    mova [r3+r1], m1
+
+    mova m1, [r0 + r1 * 4]  ;L5
+    pavgb m0, m1 ;;L3 + L5/2
+    pavgb m0, m2 ;;L3 + 2L4 + L5/4
+    mova [r3 + r1 * 2], m0
+
+    mova m0, [r4]  ;L6
+    pavgb m2, m0 ;;L4+L6/2
+    pavgb m2, m1 ;;L4 + 2L5 + L6/4
+    mova [r0 + r1 * 4], m2
+
+    mova m2, [r4 + r1]  ;L7
+    pavgb m1, m2 ;;L5 + L7/2
+    pavgb m1, m0 ;;L5 + 2L6 + L7/4
+    mova [r4], m1
+
+    mova m1, [r4 + r1 * 2] ;L8
+    pavgb m0, m1 ;;L6 + L8/2
+    pavgb m0, m2 ;;L6 + 2L7 + L8/4
+    mova [r4 + r1], m0
+
+    mova m0, [r0 + r1 * 8] ;L9
+    pavgb m2, m0 ;;L7+L9
+    pavgb m2, m1 ;;L7+2L8+L9
+    mova [r4 + r1 * 2], m2
+
+    mova [r2], m1 ;tmp
+    RET
+%endmacro
+;;set every other line Ln to (-(Ln-2) + 4(Ln-1) + 2Ln + 4(Ln+1) - (Ln+2))/8
+%macro gen_deinterlace_FF 0
+cglobal deInterlaceFF, 3, 5, 8 ;;src, stride, tmp
+    lea r0, [r0 + 4*r1]
+    lea r3, [r0 + r1]
+    lea r4, [r3 + r1 * 4]
+    pxor m7, m7
+    mova m0, [r2] ;;L0 (tmp)
+
+%ifnmacro deint_ff
+%macro deint_ff 4
+;;  Ln-2 is in m0 on entry
+    mova m1, %1 ;;Ln-1
+    mova m2, %2 ;;Ln
+    mova m3, %3 ;;Ln+1
+    mova m4, %4 ;;Ln+2
+
+    pavgb m1, m3 ;;(Ln-1 + Ln+1)/2
+    pavgb m0, m4 ;;(Ln-2 + Ln+2)/2
+    mova m3, m0
+    punpcklbw m0, m7
+    punpckhbw m3, m7
+
+    mova m4, m1
+    punpcklbw m1, m7
+    punpckhbw m4, m7
+
+    psllw m1, 2
+    psllw m4, 2 ;;(Ln-1 + Ln+1)*2
+
+    psubw m1, m0
+    psubw m4, m3 ;;(Ln-1 + Ln+1)*2 - (Ln-2 + Ln+2)/2
+
+    mova m5, m2
+    mova m0, m2 ;;Ln-2 for next loop
+    punpcklbw m2, m7
+    punpckhbw m5, m7
+
+    paddw m1, m2
+    paddw m4, m5 ;;(Ln-1 + Ln+1)*2 + Ln - (Ln-2 + Ln+2)/2
+
+    psraw m1, 2
+    psraw m4, 2 ;;(4(Ln-1 + Ln+1) + 2Ln - (Ln-2 + Ln+2))/8
+
+    packuswb m1, m4
+    mova %2, m1
+%endmacro
+%endif
+    deint_ff [r0], [r3], [r3 + r1], [r3 + r1 * 2]
+    deint_ff [r3 + r1], [r3 + r1 *2], [r0 + r1 * 4], [r4]
+    deint_ff [r0 + r1 * 4], [r4], [r4 + r1], [r4 + r1 * 2]
+    deint_ff [r4 + r1], [r4 + r1 * 2], [r0 + r1 * 8], [r4 + r1 * 4]
+    mova [r2], m0
+    RET
+%endmacro
+
+%macro gen_deinterlace_L5 0
+;; set each line Ln to (-(Ln-2) + 2(Ln-1) + 6Ln + 2(Ln+1) -(Ln+2))/8
+cglobal deInterlaceL5, 4, 6, 8 ;;src, stride, tmp1, tmp2
+    lea r0, [r0 + r1 * 4]
+    lea r4, [r0 + r1]
+    lea r5, [r4 + r1 * 4]
+    pxor m7, m7
+    mova m0, [r2] ;;Ln-2 (tmp1)
+    mova m1, [r3] ;;Ln-1 (tmp2)
+
+%ifnmacro deint_L5
+%macro deint_L5 5 ;;Ln-2, Ln-1, Ln, Ln+1, Ln+2
+    mova m2, %3
+    mova m3, %4
+    mova m4, %5
+
+    pavgb m4, %1 ;;(Ln-2 + Ln+2)/2
+    pavgb m3, %2 ;;(Ln-1 + Ln+1)/2
+
+    mova %1, m2 ;;Ln-2 for next n
+    mova m5, m2
+    mova m6, m3
+
+    punpcklbw m2, m7
+    punpckhbw m5, m7
+
+    mova m6, m2
+    paddw m2, m2
+    paddw m2, m6
+    mova m6, m5
+    paddw m5, m5
+    paddw m5, m6 ;;3*Ln
+
+    mova m6, m3
+    punpcklbw m3, m7
+    punpckhbw m6, m7
+
+    paddw m3, m3
+    paddw m6, m6
+    paddw m2, m3
+    paddw m5, m6 ;;Ln-1 + 3Ln + Ln+1
+
+    mova m6, m4
+    punpcklbw m4, m7
+    punpckhbw m6, m7
+
+    psubsw m2, m4
+    psubsw m5, m6 ;;(-Ln-2 + 2Ln-1 + 6Ln + 2Ln+1 - Ln+2)/2
+    psraw m2, 2
+    psraw m5, 2 ;;(...)/8 (same as above)
+
+    packuswb m2, m5
+    mova %3, m2
+%endmacro
+%endif
+    deint_L5 m0, m1, [r0], [r4], [r4 + r1]
+    deint_L5 m1, m0, [r4], [r4 + r1], [r4 + r1 * 2]
+    deint_L5 m0, m1, [r4 + r1], [r4 + r1 * 2], [r0 + r1 * 4]
+    deint_L5 m1, m0, [r4 + r1 * 2], [r0 + r1 * 4], [r5]
+    deint_L5 m0, m1, [r0 + r1 * 4], [r5], [r5 + r1]
+    deint_L5 m1, m0, [r5], [r5 + r1], [r5 + r1 * 2]
+    deint_L5 m0, m1, [r5 + r1], [r5 + r1 * 2], [r0 + r1 * 8]
+    deint_L5 m1, m0, [r5 + r1 * 2], [r0 + r1 * 8], [r5 + r1 * 4]
+
+    mova [r2], m0 ;;Ln-2 (tmp1)
+    mova [r3], m1 ;;Ln-1 (tmp2)
+    RET
+%endmacro
+
+%macro gen_deinterlace_median 0
+;; Apply a median filter to every second line,
+;; i.e. for each set of bytes a,b,c in Ln-1,Ln,Ln+1
+;; let d,e,f be a,b,c reordered so that d <= e <= f, and set the byte in Ln to e
+cglobal deInterlaceMedian, 2, 4, 4 ;;src, stride
+    lea r0, [r0 + r1 * 4]
+    lea r2, [r0 + r1]
+    lea r3, [r2 + r1 * 4]
+%ifnmacro deint_median
+%macro deint_median 4
+    mova %4, %1
+    pmaxub %1, %2
+    pminub %2, %4
+    pmaxub %2, %3
+    pminub %1, %2
+%endmacro
+%endif
+    mova m0, [r0] ;0
+    mova m1, [r2] ;1
+    mova m2, [r2 + r1] ;2
+    deint_median m0, m1, m2, m3
+    mova [r2], m0 ;1
+
+    ;; m2 = 2
+    mova m1, [r2 + r1 * 2] ;3
+    mova m0, [r0 + r1 * 4] ;4
+    deint_median m2, m1, m0, m3
+    mova [r2 + r1 * 2], m2 ;3
+
+
+    ;; m0 = 4
+    mova m2, [r3] ;5
+    mova m1, [r3 + r1] ;6
+;;the argument order differs from above, but a 3-input median is order-independent
+    deint_median m2, m0, m1, m3
+    mova [r3], m2 ;5
+
+    mova m2, [r3 + r1 * 2] ;7
+    mova m0, [r0 + r1 * 8] ;8
+;; only %1 (the result) and %3 (left unclobbered for reuse) actually matter
+    deint_median m2, m1, m0, m3
+    mova [r3 + r1 * 2], m2 ;7
+    RET
+%endmacro
+;; I'm not exactly sure how to ensure the following only get built if
+;; the specified instruction set is available.
+;; If the INIT_XXX macros do that then great, otherwise I'll correct it
+SECTION_TEXT
+
+INIT_MMX mmx2
+gen_deinterlace_interpolate_linear
+gen_deinterlace_interpolate_cubic
+gen_deinterlace_blend_linear
+gen_deinterlace_FF
+gen_deinterlace_L5
+gen_deinterlace_median
+
+INIT_XMM sse2
+gen_deinterlace_interpolate_linear
+gen_deinterlace_interpolate_cubic
+gen_deinterlace_blend_linear
+gen_deinterlace_FF
+gen_deinterlace_L5
+gen_deinterlace_median
+
+INIT_YMM avx2
+gen_deinterlace_interpolate_linear
+gen_deinterlace_interpolate_cubic
+gen_deinterlace_blend_linear
+gen_deinterlace_FF
+gen_deinterlace_L5
+gen_deinterlace_median
diff --git a/tests/fate/filter-video.mak b/tests/fate/filter-video.mak
index 7153f4e..0de4f34 100644
--- a/tests/fate/filter-video.mak
+++ b/tests/fate/filter-video.mak
@@ -268,7 +268,7 @@ fate-filter-pp1: CMD = video_filter "pp=fq|4/be/hb/vb/tn/l5/al"
 fate-filter-pp2: CMD = video_filter "qp=x+y,pp=be/h1/v1/lb"
 fate-filter-pp3: CMD = video_filter "qp=x+y,pp=be/ha|128|7/va/li"
 fate-filter-pp4: CMD = video_filter "pp=be/ci"
-fate-filter-pp5: CMD = video_filter "pp=md"
+fate-filter-pp5: CMD = video_filter "pp=be/md"
 fate-filter-pp6: CMD = video_filter "pp=be/fd"
 
 FATE_FILTER_VSYNTH-$(call ALLYES, QP_FILTER PP_FILTER) += fate-filter-qp
-- 
2.3.5