[FFmpeg-devel] [PATCH] Higher bit-depth x86 SIMD assembly for yadif

James Darnley james.darnley at gmail.com
Sun Jan 22 17:53:44 CET 2012


On 2012-01-19 22:44, Michael Niedermayer wrote:
> CC-ing to dark shikari & loren as they might want to review too?

likewise

>> Something else to think about.  The source code clarity could be
>> greatly improved by using yasm and its preprocessor.  I wonder how
>> much abstraction it would need to roll the source to all three
>> functions together and whether it would save source code size.
> 
> if you want to convert it to yasm, that's fine, if not it's fine too.
> whichever way you prefer

After all this tinkering with inline asm it sounds better than ever.
For a later time anyway.

>> +            "paddd     "MM"6, "MM"3 \n\t" /* d+diff */\
>> +            PMAXSD(MM"2",MM"1",MM"7")\
>> +            PMINSD(MM"3",MM"1",MM"7")\
>> +            PACK(MM"1")\
>> +\
>> +            :\
>> +            :[tmpA] "r"(tmpA),\
>> +             [prev] "r"(prev),\
>> +             [cur]  "r"(cur),\
>> +             [next] "r"(next),\
>> +             [prefs]"r"(prefs),\
>> +             [mrefs]"r"(mrefs),\
>> +             [mode] "g"(mode)\
> 
> this should list the SIMD registers written to on the clobber list
> otherwise with SSE* there may be issues on win64 and in theory also
> elsewhere
> 
> 
>> +        );\
>> +        __asm__ volatile(MOVH" "MM"1, %0" :"=m"(*dst));\
> 
> I guess it should be ok in reality but it's not guaranteed that
> SIMD registers don't change between blocks

I've made a solution for these two points and pasted the relevant
changes below.  Please give your comments and, when everyone is satisfied,
I will update the patches and send them.

> diff --git a/libavfilter/x86/yadif_template_16bit.c b/libavfilter/x86/yadif_template_16bit.c
> index a3a7394..35fc085 100644
> --- a/libavfilter/x86/yadif_template_16bit.c
> +++ b/libavfilter/x86/yadif_template_16bit.c
> @@ -25,12 +25,14 @@
>      #define MOVA "movdqa"
>      #define MOVU "movdqu"
>      #define MMSIZE 16
> +    #define CLOBBER_LIST "%xmm0", "%xmm1", "%xmm2", "%xmm3","%xmm4", "%xmm5", "%xmm6", "%xmm7"
>  #else
>      #define MM "%%mm"
>      #define MOVH "movd"
>      #define MOVA "movq"
>      #define MOVU "movq"
>      #define MMSIZE 8
> +    #define CLOBBER_LIST "%mm0", "%mm1", "%mm2", "%mm3","%mm4", "%mm5", "%mm6", "%mm7"
>  #endif
>  
>  #define LOAD(mem,dst)\
> @@ -261,6 +263,7 @@ void RENAME(ff_yadif_filter_line_16bit)(uint8_t *dst,
>              PMAXSD(MM"2",MM"1",MM"7")\
>              PMINSD(MM"3",MM"1",MM"7")\
>              PACK(MM"1")\
> +            MOVH" "MM"1, (%[dst])"\
>  \
>              :\
>              :[tmpA] "r"(tmpA),\
> @@ -269,9 +272,10 @@ void RENAME(ff_yadif_filter_line_16bit)(uint8_t *dst,
>               [next] "r"(next),\
>               [prefs]"r"(prefs),\
>               [mrefs]"r"(mrefs),\
> -             [mode] "g"(mode)\
> +             [mode] "g"(mode),\
> +             [dst]  "r"(dst)\
> +            :CLOBBER_LIST\
>          );\
> -        __asm__ volatile(MOVH" "MM"1, %0" :"=m"(*dst));\
>          dst += MMSIZE/2;\
>          prev+= MMSIZE/2;\
>          cur += MMSIZE/2;\
> @@ -309,3 +313,4 @@ void RENAME(ff_yadif_filter_line_16bit)(uint8_t *dst,
>  #undef CHECK1
>  #undef CHECK2
>  #undef FILTER
> +#undef CLOBBER_LIST
> diff --git a/libavfilter/x86/yadif_template_9_14bit.c b/libavfilter/x86/yadif_template_9_14bit.c
> index 8eeddaa..26b1e93 100644
> --- a/libavfilter/x86/yadif_template_9_14bit.c
> +++ b/libavfilter/x86/yadif_template_9_14bit.c
> @@ -25,12 +25,14 @@
>      #define MOVA "movdqa"
>      #define MOVU "movdqu"
>      #define MMSIZE 16
> +    #define CLOBBER_LIST "%xmm0", "%xmm1", "%xmm2", "%xmm3","%xmm4", "%xmm5", "%xmm6", "%xmm7"
>  #else
>      #define MM "%%mm"
>      #define MOVH "movd"
>      #define MOVA "movq"
>      #define MOVU "movq"
>      #define MMSIZE 8
> +    #define CLOBBER_LIST "%mm0", "%mm1", "%mm2", "%mm3","%mm4", "%mm5", "%mm6", "%mm7"
>  #endif
>  
>  #define LOAD(mem,dst)\
> @@ -233,6 +235,7 @@ void RENAME(ff_yadif_filter_line_9_14bit)(uint8_t *dst,
>              "paddw     "MM"6, "MM"3 \n\t" /* d+diff */\
>              "pmaxsw    "MM"2, "MM"1 \n\t"\
>              "pminsw    "MM"3, "MM"1 \n\t" /* d = clip(spatial_pred, d-diff, d+diff); */\
> +            MOVU" "MM"1, (%[dst])"\
>  \
>              :\
>              :[tmpA] "r"(tmpA),\
> @@ -241,9 +244,10 @@ void RENAME(ff_yadif_filter_line_9_14bit)(uint8_t *dst,
>               [next] "r"(next),\
>               [prefs]"r"((x86_reg)prefs),\
>               [mrefs]"r"((x86_reg)mrefs),\
> -             [mode] "g"(mode)\
> +             [mode] "g"(mode),\
> +             [dst]  "r"(dst)\
> +            :CLOBBER_LIST\
>          );\
> -        __asm__ volatile(MOVU" "MM"1, %0" :"=m"(*dst));\
>          dst += MMSIZE-4;\
>          prev+= MMSIZE-4;\
>          cur += MMSIZE-4;\
> @@ -281,3 +285,4 @@ void RENAME(ff_yadif_filter_line_9_14bit)(uint8_t *dst,
>  #undef CHECK1
>  #undef CHECK2
>  #undef FILTER
> +#undef CLOBBER_LIST

P.S.  The same issues exist in the 8-bit code.


More information about the ffmpeg-devel mailing list