[FFmpeg-devel] avfilter/x86/vf_threshold : add SSE4 and AVX2 for threshold 16

James Almer jamrial at gmail.com
Thu Dec 7 22:52:55 EET 2017


On 12/7/2017 5:10 PM, Martin Vignali wrote:
> 2017-12-03 21:28 GMT+01:00 Martin Vignali <martin.vignali at gmail.com>:
> 
>>
>>
>> 2017-12-03 21:15 GMT+01:00 James Darnley <james.darnley at gmail.com>:
>>
>>> On 2017-12-03 19:30, Martin Vignali wrote:
>>>>  libavfilter/x86/vf_threshold.asm    | 19 ++++++++++++++-----
>>>>  libavfilter/x86/vf_threshold_init.c | 34
>>> ++++++++++++++++++++--------------
>>>>  2 files changed, 34 insertions(+), 19 deletions(-)
>>>>
>>>> diff --git a/libavfilter/x86/vf_threshold.asm
>>> b/libavfilter/x86/vf_threshold.asm
>>>> index fb008c376a..7b929c6bd2 100644
>>>> --- a/libavfilter/x86/vf_threshold.asm
>>>> +++ b/libavfilter/x86/vf_threshold.asm
>>>> @@ -27,14 +27,21 @@
>>>>  SECTION_RODATA
>>>>
>>>>  pb_128: times 16 db 128
>>>> +pb_128_0 : times 16 dw 32768
>>>
>>> No.  Please use db and the values you want.
>>>
>>> I assume this is supposed to be "times 8 db 0, 128".
>>
>>
>>
> Hello,
> 
> New patch attached (you're right, it's "times 8 db 0, 128").
> 
> 
> Martin
> From ac91cb26724b6e8fe294e0bf9ad2dd17fe0eada9 Mon Sep 17 00:00:00 2001
> From: Martin Vignali <martin.vignali at gmail.com>
> Date: Thu, 7 Dec 2017 21:06:43 +0100
> Subject: [PATCH 1/2] avfilter/x86/vf_threshold : add threshold16 SIMD (SSE4 
>  and AVX2)
> 
> ---
>  libavfilter/x86/vf_threshold.asm    | 19 +++++++++++++------
>  libavfilter/x86/vf_threshold_init.c | 34 ++++++++++++++++++++--------------
>  2 files changed, 33 insertions(+), 20 deletions(-)
> 
> diff --git a/libavfilter/x86/vf_threshold.asm b/libavfilter/x86/vf_threshold.asm
> index 56a6c242d8..dc42cd4971 100644
> --- a/libavfilter/x86/vf_threshold.asm
> +++ b/libavfilter/x86/vf_threshold.asm
> @@ -25,12 +25,14 @@
>  SECTION_RODATA
>  
>  pb_128: times 16 db 128
> +pb_128_0 : times 8 db 0, 128
>  
>  SECTION .text
>  
> -%macro THRESHOLD_8 0
> +;%1 depth (8 or 16) ; %2 b or w ; %3 constant
> +%macro THRESHOLD 3
>  %if ARCH_X86_64
> -cglobal threshold8, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x
> +cglobal threshold%1, 10, 13, 5, in, threshold, min, max, out, ilinesize, tlinesize, flinesize, slinesize, olinesize, w, h, x
>      mov             wd, dword wm
>      mov             hd, dword hm
>  %else

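You should also change the cglobal line for x86_32, right below this %else.
It still declares "threshold8", so on 32-bit builds both the 8 and the 16
instantiation of the THRESHOLD macro would emit the same symbol name; it needs
to use threshold%1 as well.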

> @@ -43,7 +45,10 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
>  %define     olinesizeq  r9mp
>  %define             hd  r11mp
>  %endif
> -    VBROADCASTI128  m4, [pb_128]
> +    VBROADCASTI128  m4, [%3]
> +%if %1 == 16
> +    add             wq, wq ; w *= 2 (16 bits instead of 8)
> +%endif
>      add            inq, wq
>      add     thresholdq, wq
>      add           minq, wq
> @@ -60,7 +65,7 @@ cglobal threshold8, 5, 7, 5, in, threshold, min, max, out, w, x
>          movu            m3, [maxq + xq]
>          pxor            m0, m4
>          pxor            m1, m4
> -        pcmpgtb         m0, m1
> +        pcmpgt%2        m0, m1
>          PBLENDVB        m3, m2, m0
>          movu   [outq + xq], m3
>          add             xq, mmsize
> @@ -77,9 +82,11 @@ RET
>  %endmacro
>  
>  INIT_XMM sse4
> -THRESHOLD_8
> +THRESHOLD 8, b, pb_128
> +THRESHOLD 16, w, pb_128_0
>  
>  %if HAVE_AVX2_EXTERNAL
>  INIT_YMM avx2
> -THRESHOLD_8
> +THRESHOLD 8, b, pb_128
> +THRESHOLD 16, w, pb_128_0
>  %endif
> diff --git a/libavfilter/x86/vf_threshold_init.c b/libavfilter/x86/vf_threshold_init.c
> index db0559533d..8e42296791 100644
> --- a/libavfilter/x86/vf_threshold_init.c
> +++ b/libavfilter/x86/vf_threshold_init.c
> @@ -23,20 +23,19 @@
>  #include "libavutil/x86/cpu.h"
>  #include "libavfilter/threshold.h"
>  
> -void ff_threshold8_sse4(const uint8_t *in, const uint8_t *threshold,
> -                        const uint8_t *min, const uint8_t *max,
> -                        uint8_t *out,
> -                        ptrdiff_t ilinesize, ptrdiff_t tlinesize,
> -                        ptrdiff_t flinesize, ptrdiff_t slinesize,
> -                        ptrdiff_t olinesize,
> -                        int w, int h);
> -void ff_threshold8_avx2(const uint8_t *in, const uint8_t *threshold,
> -                        const uint8_t *min, const uint8_t *max,
> -                        uint8_t *out,
> -                        ptrdiff_t ilinesize, ptrdiff_t tlinesize,
> -                        ptrdiff_t flinesize, ptrdiff_t slinesize,
> -                        ptrdiff_t olinesize,
> -                        int w, int h);
> +#define THRESHOLD_FUNC(depth, opt) \
> +void ff_threshold##depth##_##opt(const uint8_t *in, const uint8_t *threshold,\
> +                                const uint8_t *min, const uint8_t *max,     \
> +                                uint8_t *out,                               \
> +                                ptrdiff_t ilinesize, ptrdiff_t tlinesize,   \
> +                                ptrdiff_t flinesize, ptrdiff_t slinesize,   \
> +                                ptrdiff_t olinesize,                        \
> +                                int w, int h);
> +
> +THRESHOLD_FUNC(8, sse4)
> +THRESHOLD_FUNC(8, avx2)
> +THRESHOLD_FUNC(16, sse4)
> +THRESHOLD_FUNC(16, avx2)
>  
>  av_cold void ff_threshold_init_x86(ThresholdContext *s)
>  {
> @@ -49,5 +48,12 @@ av_cold void ff_threshold_init_x86(ThresholdContext *s)
>          if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>              s->threshold = ff_threshold8_avx2;
>          }
> +    } else if (s->depth == 16) {
> +        if (EXTERNAL_SSE4(cpu_flags)) {
> +            s->threshold = ff_threshold16_sse4;
> +        }
> +        if (EXTERNAL_AVX2_FAST(cpu_flags)) {
> +            s->threshold = ff_threshold16_avx2;
> +        }
>      }
>  }
> -- 
> 2.11.0 (Apple Git-81)
> 




More information about the ffmpeg-devel mailing list