[FFmpeg-cvslog] avfilter/x86/vf_blend : add SIMD for 16 bit version of grainextract, grainmerge, average, extremity, negation

Martin Vignali git at videolan.org
Thu Apr 5 22:47:53 EEST 2018


ffmpeg | branch: master | Martin Vignali <martin.vignali at gmail.com> | Sat Mar 17 19:37:06 2018 +0100| [f3df42e81d367547756e7955e36c8af7c9c18db2] | committer: Martin Vignali

avfilter/x86/vf_blend : add SIMD for 16 bit version of

grainextract
grainmerge
average
extremity
negation

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=f3df42e81d367547756e7955e36c8af7c9c18db2
---

 libavfilter/x86/vf_blend.asm    | 168 ++++++++++++++++++++++++++--------------
 libavfilter/x86/vf_blend_init.c |  20 +++++
 2 files changed, 128 insertions(+), 60 deletions(-)

diff --git a/libavfilter/x86/vf_blend.asm b/libavfilter/x86/vf_blend.asm
index 9cd5ee7acb..251bbb5a12 100644
--- a/libavfilter/x86/vf_blend.asm
+++ b/libavfilter/x86/vf_blend.asm
@@ -27,6 +27,8 @@
 SECTION_RODATA
 
 ps_255: times 4 dd 255.0
+pd_32768 : times 4 dd 32768
+pd_65535 : times 4 dd 65535
 pw_1:   times 8 dw 1
 pw_128: times 8 dw 128
 pw_255: times 8 dw 255
@@ -79,26 +81,33 @@ BLEND_INIT %1, 2, %3
 BLEND_END
 %endmacro
 
-%macro GRAINEXTRACT 0
-BLEND_INIT grainextract, 6
+; %1 name, %2 source element suffix (b or w), %3 widened intermediate element suffix (w or d), %4 (1 if 16 bit, omitted if 8 bit)
+%macro GRAINEXTRACT 3-4
+BLEND_INIT %1, 6, %4
     pxor           m4, m4
+%if %0 == 4 ; 16 bit
+    VBROADCASTI128 m5, [pd_32768]
+%else
     VBROADCASTI128 m5, [pw_128]
+%endif
 .nextrow:
     mov        xq, widthq
     .loop:
         movu           m1, [topq + xq]
         movu           m3, [bottomq + xq]
-        punpcklbw      m0, m1, m4
-        punpckhbw      m1, m4
-        punpcklbw      m2, m3, m4
-        punpckhbw      m3, m4
 
-        paddw          m0, m5
-        paddw          m1, m5
-        psubw          m0, m2
-        psubw          m1, m3
+        punpckl%2%3      m0, m1, m4
+        punpckh%2%3      m1, m4
+        punpckl%2%3      m2, m3, m4
+        punpckh%2%3      m3, m4
+
+        padd%3          m0, m5
+        padd%3          m1, m5
+        psub%3          m0, m2
+        psub%3          m1, m3
+
+        packus%3%2       m0, m1
 
-        packuswb       m0, m1
         mova  [dstq + xq], m0
         add            xq, mmsize
     jl .loop
@@ -172,8 +181,9 @@ BLEND_INIT screen, 7
 BLEND_END
 %endmacro
 
-%macro AVERAGE 0
-BLEND_INIT average, 3
+; %1 name, %2 element suffix for pavg (b or w), %3 (set if 16 bit, omitted if 8 bit)
+%macro AVERAGE 2-3
+BLEND_INIT %1, 3, %3
     pcmpeqb        m2, m2
 
 .nextrow:
@@ -184,7 +194,7 @@ BLEND_INIT average, 3
     movu           m1, [bottomq + xq]
     pxor           m0, m2
     pxor           m1, m2
-    pavgb          m0, m1
+    pavg%2         m0, m1
     pxor           m0, m2
     mova  [dstq + xq], m0
     add            xq, mmsize
@@ -192,29 +202,34 @@ BLEND_INIT average, 3
 BLEND_END
 %endmacro
 
-
-%macro GRAINMERGE 0
-BLEND_INIT grainmerge, 6
+; %1 name, %2 source element suffix (b or w), %3 widened intermediate element suffix (w or d), %4 (1 if 16 bit, omitted if 8 bit)
+%macro GRAINMERGE 3-4
+BLEND_INIT %1, 6, %4
     pxor       m4, m4
-
+%if %0 == 4 ; 16 bit
+    VBROADCASTI128       m5, [pd_32768]
+%else
     VBROADCASTI128       m5, [pw_128]
+%endif
 .nextrow:
     mov        xq, widthq
 
     .loop:
         movu           m1, [topq + xq]
         movu           m3, [bottomq + xq]
-        punpcklbw      m0, m1, m4
-        punpckhbw      m1, m4
-        punpcklbw      m2, m3, m4
-        punpckhbw      m3, m4
 
-        paddw           m0, m2
-        paddw           m1, m3
-        psubw           m0, m5
-        psubw           m1, m5
+        punpckl%2%3    m0, m1, m4
+        punpckh%2%3    m1, m4
+        punpckl%2%3    m2, m3, m4
+        punpckh%2%3    m3, m4
+
+        padd%3         m0, m2
+        padd%3         m1, m3
+        psub%3         m0, m5
+        psub%3         m1, m5
+
+        packus%3%2     m0, m1
 
-        packuswb       m0, m1
         mova  [dstq + xq], m0
         add            xq, mmsize
     jl .loop
@@ -324,52 +339,73 @@ BLEND_INIT %1, 5, %4
 BLEND_END
 %endmacro
 
-%macro BLEND_ABS 0
-BLEND_INIT extremity, 8
+; %1 name, %2 source element suffix (b or w), %3 widened intermediate element suffix (w or d), %4 (1 if 16 bit, omitted if 8 bit)
+%macro EXTREMITY 3-4
+BLEND_INIT %1, 8, %4
     pxor       m2, m2
+%if %0 == 4; 16 bit
+    VBROADCASTI128       m4, [pd_65535]
+%else
     VBROADCASTI128       m4, [pw_255]
+%endif
 .nextrow:
     mov        xq, widthq
 
     .loop:
         movu            m0, [topq + xq]
         movu            m1, [bottomq + xq]
-        punpckhbw       m5, m0, m2
-        punpcklbw       m0, m2
-        punpckhbw       m6, m1, m2
-        punpcklbw       m1, m2
-        psubw           m3, m4, m0
-        psubw           m7, m4, m5
-        psubw           m3, m1
-        psubw           m7, m6
+        punpckh%2%3     m5, m0, m2
+        punpckl%2%3     m0, m2
+        punpckh%2%3     m6, m1, m2
+        punpckl%2%3     m1, m2
+        psub%3          m3, m4, m0
+        psub%3          m7, m4, m5
+        psub%3          m3, m1
+        psub%3          m7, m6
+%if %0 == 4; 16 bit
+        pabsd           m3, m3
+        pabsd           m7, m7
+%else
         ABS2            m3, m7, m1, m6
-        packuswb        m3, m7
+%endif
+        packus%3%2      m3, m7
         mova   [dstq + xq], m3
         add             xq, mmsize
     jl .loop
 BLEND_END
+%endmacro
 
-BLEND_INIT negation, 8
+%macro NEGATION 3-4
+BLEND_INIT %1, 8, %4
     pxor       m2, m2
+%if %0 == 4; 16 bit
+    VBROADCASTI128       m4, [pd_65535]
+%else
     VBROADCASTI128       m4, [pw_255]
+%endif
 .nextrow:
     mov        xq, widthq
 
     .loop:
         movu            m0, [topq + xq]
         movu            m1, [bottomq + xq]
-        punpckhbw       m5, m0, m2
-        punpcklbw       m0, m2
-        punpckhbw       m6, m1, m2
-        punpcklbw       m1, m2
-        psubw           m3, m4, m0
-        psubw           m7, m4, m5
-        psubw           m3, m1
-        psubw           m7, m6
+        punpckh%2%3     m5, m0, m2
+        punpckl%2%3     m0, m2
+        punpckh%2%3     m6, m1, m2
+        punpckl%2%3     m1, m2
+        psub%3          m3, m4, m0
+        psub%3          m7, m4, m5
+        psub%3          m3, m1
+        psub%3          m7, m6
+%if %0 == 4; 16 bit
+        pabsd           m3, m3
+        pabsd           m7, m7
+%else
         ABS2            m3, m7, m1, m6
-        psubw           m0, m4, m3
-        psubw           m1, m4, m7
-        packuswb        m0, m1
+%endif
+        psub%3          m0, m4, m3
+        psub%3          m1, m4, m7
+        packus%3%2      m0, m1
         mova   [dstq + xq], m0
         add             xq, mmsize
     jl .loop
@@ -384,17 +420,17 @@ BLEND_SIMPLE addition, addusb
 BLEND_SIMPLE subtract, subusb
 BLEND_SIMPLE darken,   minub
 BLEND_SIMPLE lighten,  maxub
-GRAINEXTRACT
+GRAINEXTRACT grainextract, b, w
 BLEND_MULTIPLY
 BLEND_SCREEN
-AVERAGE
-GRAINMERGE
+AVERAGE       average,    b
+GRAINMERGE    grainmerge, b, w
 HARDMIX
 PHOENIX phoenix, b
 DIFFERENCE difference, b, w
 DIVIDE
-
-BLEND_ABS
+EXTREMITY extremity, b, w
+NEGATION negation, b, w
 
 %if ARCH_X86_64
 BLEND_SIMPLE addition_16, addusw, 1
@@ -402,18 +438,24 @@ BLEND_SIMPLE and_16,      and,    1
 BLEND_SIMPLE or_16,       or,     1
 BLEND_SIMPLE subtract_16, subusw, 1
 BLEND_SIMPLE xor_16,      xor,    1
+AVERAGE      average_16,  w,      1
 %endif
 
 INIT_XMM ssse3
 DIFFERENCE difference, b, w
-BLEND_ABS
+EXTREMITY extremity, b, w
+NEGATION negation, b, w
 
 INIT_XMM sse4
 %if ARCH_X86_64
 BLEND_SIMPLE darken_16,   minuw, 1
 BLEND_SIMPLE lighten_16,  maxuw, 1
+GRAINEXTRACT grainextract_16, w, d, 1
+GRAINMERGE   grainmerge_16, w, d, 1
 PHOENIX      phoenix_16,      w, 1
 DIFFERENCE   difference_16, w, d, 1
+EXTREMITY    extremity_16, w, d, 1
+NEGATION     negation_16, w, d, 1
 %endif
 
 %if HAVE_AVX2_EXTERNAL
@@ -425,16 +467,17 @@ BLEND_SIMPLE addition, addusb
 BLEND_SIMPLE subtract, subusb
 BLEND_SIMPLE darken,   minub
 BLEND_SIMPLE lighten,  maxub
-GRAINEXTRACT
+GRAINEXTRACT grainextract, b, w
 BLEND_MULTIPLY
 BLEND_SCREEN
-AVERAGE
-GRAINMERGE
+AVERAGE    average,    b
+GRAINMERGE grainmerge, b, w
 HARDMIX
 PHOENIX phoenix, b
 
 DIFFERENCE difference, b, w
-BLEND_ABS
+EXTREMITY extremity, b, w
+NEGATION negation, b, w
 
 %if ARCH_X86_64
 BLEND_SIMPLE addition_16, addusw, 1
@@ -444,7 +487,12 @@ BLEND_SIMPLE lighten_16,  maxuw,  1
 BLEND_SIMPLE or_16,       or,     1
 BLEND_SIMPLE subtract_16, subusw, 1
 BLEND_SIMPLE xor_16,      xor,    1
+GRAINEXTRACT grainextract_16, w, d, 1
+AVERAGE      average_16,  w,      1
+GRAINMERGE   grainmerge_16, w, d, 1
 PHOENIX      phoenix_16,       w, 1
 DIFFERENCE   difference_16, w, d, 1
+EXTREMITY    extremity_16, w, d, 1
+NEGATION     negation_16, w, d, 1
 %endif
 %endif
diff --git a/libavfilter/x86/vf_blend_init.c b/libavfilter/x86/vf_blend_init.c
index 0962f6d7fd..acf28559ff 100644
--- a/libavfilter/x86/vf_blend_init.c
+++ b/libavfilter/x86/vf_blend_init.c
@@ -72,12 +72,22 @@ BLEND_FUNC(negation, avx2)
 #if ARCH_X86_64
 BLEND_FUNC(addition_16, sse2)
 BLEND_FUNC(addition_16, avx2)
+BLEND_FUNC(grainmerge_16, sse4)
+BLEND_FUNC(grainmerge_16, avx2)
+BLEND_FUNC(average_16, sse2)
+BLEND_FUNC(average_16, avx2)
 BLEND_FUNC(and_16, sse2)
 BLEND_FUNC(and_16, avx2)
 BLEND_FUNC(darken_16, sse4)
 BLEND_FUNC(darken_16, avx2)
+BLEND_FUNC(grainextract_16, sse4)
+BLEND_FUNC(grainextract_16, avx2)
 BLEND_FUNC(difference_16, sse4)
 BLEND_FUNC(difference_16, avx2)
+BLEND_FUNC(extremity_16, sse4)
+BLEND_FUNC(extremity_16, avx2)
+BLEND_FUNC(negation_16, sse4)
+BLEND_FUNC(negation_16, avx2)
 BLEND_FUNC(lighten_16, sse4)
 BLEND_FUNC(lighten_16, avx2)
 BLEND_FUNC(or_16, sse2)
@@ -152,6 +162,7 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
             switch (param->mode) {
             case BLEND_ADDITION: param->blend = ff_blend_addition_16_sse2; break;
             case BLEND_AND:      param->blend = ff_blend_and_16_sse2;      break;
+            case BLEND_AVERAGE:  param->blend = ff_blend_average_16_sse2;  break;
             case BLEND_OR:       param->blend = ff_blend_or_16_sse2;       break;
             case BLEND_SUBTRACT: param->blend = ff_blend_subtract_16_sse2; break;
             case BLEND_XOR:      param->blend = ff_blend_xor_16_sse2;      break;
@@ -159,8 +170,12 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         }
         if (EXTERNAL_SSE4(cpu_flags) && param->opacity == 1) {
             switch (param->mode) {
+            case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_16_sse4; break;
             case BLEND_DARKEN:   param->blend = ff_blend_darken_16_sse4;     break;
+            case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_16_sse4; break;
             case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_sse4; break;
+            case BLEND_EXTREMITY:  param->blend = ff_blend_extremity_16_sse4;    break;
+            case BLEND_NEGATION:  param->blend = ff_blend_negation_16_sse4;     break;
             case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_16_sse4;    break;
             case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_16_sse4;    break;
             }
@@ -168,9 +183,14 @@ av_cold void ff_blend_init_x86(FilterParams *param, int is_16bit)
         if (EXTERNAL_AVX2_FAST(cpu_flags) && param->opacity == 1) {
             switch (param->mode) {
             case BLEND_ADDITION: param->blend = ff_blend_addition_16_avx2; break;
+            case BLEND_GRAINMERGE: param->blend = ff_blend_grainmerge_16_avx2;   break;
             case BLEND_AND:      param->blend = ff_blend_and_16_avx2;      break;
+            case BLEND_AVERAGE:  param->blend = ff_blend_average_16_avx2;  break;
             case BLEND_DARKEN:   param->blend = ff_blend_darken_16_avx2;   break;
+            case BLEND_GRAINEXTRACT: param->blend = ff_blend_grainextract_16_avx2; break;
             case BLEND_DIFFERENCE: param->blend = ff_blend_difference_16_avx2; break;
+            case BLEND_EXTREMITY:  param->blend = ff_blend_extremity_16_avx2;    break;
+            case BLEND_NEGATION:  param->blend = ff_blend_negation_16_avx2;     break;
             case BLEND_LIGHTEN:  param->blend = ff_blend_lighten_16_avx2;  break;
             case BLEND_OR:       param->blend = ff_blend_or_16_avx2;       break;
             case BLEND_PHOENIX:  param->blend = ff_blend_phoenix_16_avx2;  break;



More information about the ffmpeg-cvslog mailing list