[FFmpeg-cvslog] avfilter/x86/vf_hflip : add avx2 version for hflip_byte and hflip_short

Martin Vignali git at videolan.org
Tue Dec 19 22:12:02 EET 2017


ffmpeg | branch: master | Martin Vignali <martin.vignali at gmail.com> | Tue Dec 19 21:06:01 2017 +0100| [f181648176c0d93851d4a89410bbdd9c85e1fa7c] | committer: Martin Vignali

avfilter/x86/vf_hflip : add avx2 version for hflip_byte and hflip_short

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f181648176c0d93851d4a89410bbdd9c85e1fa7c
---

 libavfilter/x86/vf_hflip.asm    | 12 +++++++++++-
 libavfilter/x86/vf_hflip_init.c | 20 ++++++++++++++++----
 2 files changed, 27 insertions(+), 5 deletions(-)

diff --git a/libavfilter/x86/vf_hflip.asm b/libavfilter/x86/vf_hflip.asm
index 82e1154d21..6bd1782da4 100644
--- a/libavfilter/x86/vf_hflip.asm
+++ b/libavfilter/x86/vf_hflip.asm
@@ -32,7 +32,7 @@ SECTION .text
 ;%1 byte or short, %2 b or w, %3 size in byte (1 for byte, 2 for short)
 %macro HFLIP 3
 cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
-    mova    m0, [pb_flip_%1]
+    VBROADCASTI128    m0, [pb_flip_%1]
     xor     xq, xq
 %if %3 == 1
     movsxdifnidn wq, wd
@@ -47,8 +47,13 @@ cglobal hflip_%1, 3, 5, 3, src, dst, w, r, x
 
     .loop0:
         neg     xq
+%if mmsize == 32
+        vpermq  m1, [srcq + xq -     mmsize + %3], 0x4e; swap the two 128-bit lanes at load
+        vpermq  m2, [srcq + xq - 2 * mmsize + %3], 0x4e; swap the two 128-bit lanes at load
+%else
         movu    m1, [srcq + xq -     mmsize + %3]
         movu    m2, [srcq + xq - 2 * mmsize + %3]
+%endif
         pshufb  m1, m0
         pshufb  m2, m0
         neg     xq
@@ -78,3 +83,8 @@ INIT_XMM ssse3
 HFLIP byte, b, 1
 HFLIP short, w, 2
 
+%if HAVE_AVX2_EXTERNAL
+INIT_YMM avx2
+HFLIP byte, b, 1
+HFLIP short, w, 2
+%endif
diff --git a/libavfilter/x86/vf_hflip_init.c b/libavfilter/x86/vf_hflip_init.c
index 2b5c9d3bf3..0ac399b0d4 100644
--- a/libavfilter/x86/vf_hflip_init.c
+++ b/libavfilter/x86/vf_hflip_init.c
@@ -24,7 +24,9 @@
 #include "libavfilter/hflip.h"
 
 void ff_hflip_byte_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_byte_avx2(const uint8_t *src, uint8_t *dst, int w);
 void ff_hflip_short_ssse3(const uint8_t *src, uint8_t *dst, int w);
+void ff_hflip_short_avx2(const uint8_t *src, uint8_t *dst, int w);
 
 av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
 {
@@ -32,10 +34,20 @@ av_cold void ff_hflip_init_x86(FlipContext *s, int step[4], int nb_planes)
     int i;
 
     for (i = 0; i < nb_planes; i++) {
-        if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 1) {
-            s->flip_line[i] = ff_hflip_byte_ssse3;
-        } else if (EXTERNAL_SSSE3(cpu_flags) && step[i] == 2) {
-            s->flip_line[i] = ff_hflip_short_ssse3;
+        if (step[i] == 1) {
+            if (EXTERNAL_SSSE3(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_ssse3;
+            }
+            if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_byte_avx2;
+            }
+        } else if (step[i] == 2) {
+            if (EXTERNAL_SSSE3(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_ssse3;
+            }
+            if (EXTERNAL_AVX2_FAST(cpu_flags)) {
+                s->flip_line[i] = ff_hflip_short_avx2;
+            }
         }
     }
 }



More information about the ffmpeg-cvslog mailing list