[FFmpeg-devel] [PATCH v2] lavc/x86/videodsp: Drop MMX usage

Zhao Zhili quinkblack at foxmail.com
Sun Dec 1 07:33:28 EET 2024


> -----Original Message-----
> From: ffmpeg-devel <ffmpeg-devel-bounces at ffmpeg.org> On Behalf Of Frank Plowman
> Sent: 29 November 2024 6:21
> To: ffmpeg-devel at ffmpeg.org
> Subject: Re: [FFmpeg-devel] [PATCH v2] lavc/x86/videodsp: Drop MMX usage
> 
> Ping

Pushed, thanks.
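
For anyone reading this in the archive: the substance of the change below is that the fixed-width edge-extension helpers now do all of their sub-16-byte tail loads and stores through XMM registers, so they no longer touch MMX state and never need an emms. Below is a rough C sketch of that tail pattern using SSE2 intrinsics, purely for illustration (copy_w_bytes_sse2 is a hypothetical name, and the real code is generated per fixed width by the READ_NUM_BYTES / WRITE_NUM_BYTES macros, which also use overlapping loads for some widths):

#include <emmintrin.h>
#include <stdint.h>

/* Illustrative only: copy w bytes (1 <= w <= 22) with the same kind of
 * tail handling as the patched asm, using XMM registers instead of MMX. */
static void copy_w_bytes_sse2(uint8_t *dst, const uint8_t *src, int w)
{
    int off = 0;

    /* whole 16-byte chunks: movdqu through an XMM register */
    for (; w - off >= 16; off += 16)
        _mm_storeu_si128((__m128i *)(dst + off),
                         _mm_loadu_si128((const __m128i *)(src + off)));

    /* 8-byte tail: movq, now on an XMM register rather than an MMX one */
    if (w - off >= 8) {
        _mm_storel_epi64((__m128i *)(dst + off),
                         _mm_loadl_epi64((const __m128i *)(src + off)));
        off += 8;
    }

    /* the asm finishes the last 1-7 bytes with movd (also on XMM now) and
     * plain GPR moves; a byte loop stands in for that here */
    for (; off < w; off++)
        dst[off] = src[off];
}

Roughly speaking, the per-width ff_emu_edge_vfix*_sse2 functions that the init file now registers are specializations of this shape for each width from 1 to 22.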

> 
> On 16/11/2024 17:59, Frank Plowman wrote:
> > Remove the MMX versions of these functions and modify the SSE
> > implementations to avoid using MMX registers.
> >
> > Signed-off-by: Frank Plowman <post at frankplowman.com>
> > ---
> >  libavcodec/x86/videodsp.asm    | 61 ++++++-----------------
> >  libavcodec/x86/videodsp_init.c | 88 +++++++++++++++++-----------------
> >  2 files changed, 59 insertions(+), 90 deletions(-)
> >
> > diff --git a/libavcodec/x86/videodsp.asm b/libavcodec/x86/videodsp.asm
> > index 3cc07878d3..81ae2ec10c 100644
> > --- a/libavcodec/x86/videodsp.asm
> > +++ b/libavcodec/x86/videodsp.asm
> > @@ -123,54 +123,43 @@ hvar_fn
> >  ;         - if (%2 & 8)  fills 8 bytes into xmm$next
> >  ;         - if (%2 & 4)  fills 4 bytes into xmm$next
> >  ;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
> > -; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
> > -;         - if (%2 & 4)  fills 4 bytes into mm$next
> > -;         - if (%2 & 3)  fills 1, 2 or 4 bytes in eax
> >  ; writing data out is in the same way
> >  %macro READ_NUM_BYTES 2
> >  %assign %%off 0     ; offset in source buffer
> > -%assign %%mmx_idx 0 ; mmx register index
> >  %assign %%xmm_idx 0 ; xmm register index
> >
> >  %rep %2/mmsize
> > -%if mmsize == 16
> >      movu   xmm %+ %%xmm_idx, [srcq+%%off]
> >  %assign %%xmm_idx %%xmm_idx+1
> > -%else ; mmx
> > -    movu    mm %+ %%mmx_idx, [srcq+%%off]
> > -%assign %%mmx_idx %%mmx_idx+1
> > -%endif
> >  %assign %%off %%off+mmsize
> >  %endrep ; %2/mmsize
> >
> > -%if mmsize == 16
> >  %if (%2-%%off) >= 8
> >  %if %2 > 16 && (%2-%%off) > 8
> >      movu   xmm %+ %%xmm_idx, [srcq+%2-16]
> >  %assign %%xmm_idx %%xmm_idx+1
> >  %assign %%off %2
> >  %else
> > -    movq    mm %+ %%mmx_idx, [srcq+%%off]
> > -%assign %%mmx_idx %%mmx_idx+1
> > +    movq   xmm %+ %%xmm_idx, [srcq+%%off]
> > +%assign %%xmm_idx %%xmm_idx+1
> >  %assign %%off %%off+8
> >  %endif
> >  %endif ; (%2-%%off) >= 8
> > -%endif
> >
> >  %if (%2-%%off) >= 4
> >  %if %2 > 8 && (%2-%%off) > 4
> > -    movq    mm %+ %%mmx_idx, [srcq+%2-8]
> > +    movq   xmm %+ %%xmm_idx, [srcq+%2-8]
> >  %assign %%off %2
> >  %else
> > -    movd    mm %+ %%mmx_idx, [srcq+%%off]
> > +    movd   xmm %+ %%xmm_idx, [srcq+%%off]
> >  %assign %%off %%off+4
> >  %endif
> > -%assign %%mmx_idx %%mmx_idx+1
> > +%assign %%xmm_idx %%xmm_idx+1
> >  %endif ; (%2-%%off) >= 4
> >
> >  %if (%2-%%off) >= 1
> >  %if %2 >= 4
> > -    movd mm %+ %%mmx_idx, [srcq+%2-4]
> > +    movd xmm %+ %%xmm_idx, [srcq+%2-4]
> >  %elif (%2-%%off) == 1
> >      mov            valb, [srcq+%2-1]
> >  %elif (%2-%%off) == 2
> > @@ -185,48 +174,40 @@ hvar_fn
> >
> >  %macro WRITE_NUM_BYTES 2
> >  %assign %%off 0     ; offset in destination buffer
> > -%assign %%mmx_idx 0 ; mmx register index
> >  %assign %%xmm_idx 0 ; xmm register index
> >
> >  %rep %2/mmsize
> > -%if mmsize == 16
> >      movu   [dstq+%%off], xmm %+ %%xmm_idx
> >  %assign %%xmm_idx %%xmm_idx+1
> > -%else ; mmx
> > -    movu   [dstq+%%off], mm %+ %%mmx_idx
> > -%assign %%mmx_idx %%mmx_idx+1
> > -%endif
> >  %assign %%off %%off+mmsize
> >  %endrep ; %2/mmsize
> >
> > -%if mmsize == 16
> >  %if (%2-%%off) >= 8
> >  %if %2 > 16 && (%2-%%off) > 8
> >      movu   [dstq+%2-16], xmm %+ %%xmm_idx
> >  %assign %%xmm_idx %%xmm_idx+1
> >  %assign %%off %2
> >  %else
> > -    movq   [dstq+%%off], mm %+ %%mmx_idx
> > -%assign %%mmx_idx %%mmx_idx+1
> > +    movq   [dstq+%%off], xmm %+ %%xmm_idx
> > +%assign %%xmm_idx %%xmm_idx+1
> >  %assign %%off %%off+8
> >  %endif
> >  %endif ; (%2-%%off) >= 8
> > -%endif
> >
> >  %if (%2-%%off) >= 4
> >  %if %2 > 8 && (%2-%%off) > 4
> > -    movq    [dstq+%2-8], mm %+ %%mmx_idx
> > +    movq    [dstq+%2-8], xmm %+ %%xmm_idx
> >  %assign %%off %2
> >  %else
> > -    movd   [dstq+%%off], mm %+ %%mmx_idx
> > +    movd   [dstq+%%off], xmm %+ %%xmm_idx
> >  %assign %%off %%off+4
> >  %endif
> > -%assign %%mmx_idx %%mmx_idx+1
> > +%assign %%xmm_idx %%xmm_idx+1
> >  %endif ; (%2-%%off) >= 4
> >
> >  %if (%2-%%off) >= 1
> >  %if %2 >= 4
> > -    movd    [dstq+%2-4], mm %+ %%mmx_idx
> > +    movd    [dstq+%2-4], xmm %+ %%xmm_idx
> >  %elif (%2-%%off) == 1
> >      mov     [dstq+%2-1], valb
> >  %elif (%2-%%off) == 2
> > @@ -318,11 +299,8 @@ cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
> >  %endrep ; 1+%2-%1
> >  %endmacro ; VERTICAL_EXTEND
> >
> > -INIT_MMX mmx
> > -VERTICAL_EXTEND 1, 15
> > -
> > -INIT_XMM sse
> > -VERTICAL_EXTEND 16, 22
> > +INIT_XMM sse2
> > +VERTICAL_EXTEND 1, 22
> >
> >  ; left/right (horizontal) fast extend functions
> >  ; these are essentially identical to the vertical extend ones above,
> > @@ -337,11 +315,7 @@ VERTICAL_EXTEND 16, 22
> >      imul           vald, 0x01010101
> >  %if %1 >= 8
> >      movd             m0, vald
> > -%if mmsize == 16
> >      pshufd           m0, m0, q0000
> > -%else
> > -    punpckldq        m0, m0
> > -%endif ; mmsize == 16
> >  %endif ; %1 > 16
> >  %endif ; avx2
> >  %endmacro ; READ_V_PIXEL
> > @@ -356,7 +330,6 @@ VERTICAL_EXTEND 16, 22
> >  %assign %%off %%off+mmsize
> >  %endrep ; %1/mmsize
> >
> > -%if mmsize == 16
> >  %if %1-%%off >= 8
> >  %if %1 > 16 && %1-%%off > 8
> >      movu     [%2+%1-16], m0
> > @@ -366,7 +339,6 @@ VERTICAL_EXTEND 16, 22
> >  %assign %%off %%off+8
> >  %endif
> >  %endif ; %1-%%off >= 8
> > -%endif ; mmsize == 16
> >
> >  %if %1-%%off >= 4
> >  %if %1 > 8 && %1-%%off > 4
> > @@ -415,11 +387,8 @@ cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
> >  %endrep ; 1+(%2-%1)/2
> >  %endmacro ; H_EXTEND
> >
> > -INIT_MMX mmx
> > -H_EXTEND 2, 14
> > -
> >  INIT_XMM sse2
> > -H_EXTEND 16, 22
> > +H_EXTEND 2, 22
> >
> >  %if HAVE_AVX2_EXTERNAL
> >  INIT_XMM avx2
> > diff --git a/libavcodec/x86/videodsp_init.c b/libavcodec/x86/videodsp_init.c
> > index ae9db95624..602856de1e 100644
> > --- a/libavcodec/x86/videodsp_init.c
> > +++ b/libavcodec/x86/videodsp_init.c
> > @@ -37,37 +37,37 @@ typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
> >                                  x86_reg start_y, x86_reg end_y, x86_reg bh,
> >                                  x86_reg w);
> >
> > -extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
> > -extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
> > -static emu_edge_vfix_func * const vfixtbl_sse[22] = {
> > -    ff_emu_edge_vfix1_mmx,  ff_emu_edge_vfix2_mmx,  ff_emu_edge_vfix3_mmx,
> > -    ff_emu_edge_vfix4_mmx,  ff_emu_edge_vfix5_mmx,  ff_emu_edge_vfix6_mmx,
> > -    ff_emu_edge_vfix7_mmx,  ff_emu_edge_vfix8_mmx,  ff_emu_edge_vfix9_mmx,
> > -    ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
> > -    ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
> > -    ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
> > -    ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
> > -    ff_emu_edge_vfix22_sse
> > +extern emu_edge_vfix_func ff_emu_edge_vfix1_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix2_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix3_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix4_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix5_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix6_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix7_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix8_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix9_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix10_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix11_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix12_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix13_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix14_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix15_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix16_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix17_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix18_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix19_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix20_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix21_sse2;
> > +extern emu_edge_vfix_func ff_emu_edge_vfix22_sse2;
> > +static emu_edge_vfix_func * const vfixtbl_sse2[22] = {
> > +    ff_emu_edge_vfix1_sse2,  ff_emu_edge_vfix2_sse2,  ff_emu_edge_vfix3_sse2,
> > +    ff_emu_edge_vfix4_sse2,  ff_emu_edge_vfix5_sse2,  ff_emu_edge_vfix6_sse2,
> > +    ff_emu_edge_vfix7_sse2,  ff_emu_edge_vfix8_sse2,  ff_emu_edge_vfix9_sse2,
> > +    ff_emu_edge_vfix10_sse2, ff_emu_edge_vfix11_sse2, ff_emu_edge_vfix12_sse2,
> > +    ff_emu_edge_vfix13_sse2, ff_emu_edge_vfix14_sse2, ff_emu_edge_vfix15_sse2,
> > +    ff_emu_edge_vfix16_sse2, ff_emu_edge_vfix17_sse2, ff_emu_edge_vfix18_sse2,
> > +    ff_emu_edge_vfix19_sse2, ff_emu_edge_vfix20_sse2, ff_emu_edge_vfix21_sse2,
> > +    ff_emu_edge_vfix22_sse2
> >  };
> >  extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
> >
> > @@ -76,21 +76,21 @@ typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
> >  typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
> >                                  x86_reg start_x, x86_reg n_words, x86_reg bh);
> >
> > -extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
> > -extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
> > -extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
> > -extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
> > -extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
> > -extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
> > -extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix2_sse2;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix4_sse2;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix6_sse2;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix8_sse2;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix10_sse2;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix12_sse2;
> > +extern emu_edge_hfix_func ff_emu_edge_hfix14_sse2;
> >  extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
> >  extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
> >  extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
> >  extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
> >  static emu_edge_hfix_func * const hfixtbl_sse2[11] = {
> > -    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
> > -    ff_emu_edge_hfix8_mmx,  ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
> > -    ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
> > +    ff_emu_edge_hfix2_sse2,  ff_emu_edge_hfix4_sse2,  ff_emu_edge_hfix6_sse2,
> > +    ff_emu_edge_hfix8_sse2,  ff_emu_edge_hfix10_sse2, ff_emu_edge_hfix12_sse2,
> > +    ff_emu_edge_hfix14_sse2, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
> >      ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
> >  };
> >  extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
> > @@ -104,7 +104,7 @@ extern emu_edge_hfix_func ff_emu_edge_hfix18_avx2;
> >  extern emu_edge_hfix_func ff_emu_edge_hfix20_avx2;
> >  extern emu_edge_hfix_func ff_emu_edge_hfix22_avx2;
> >  static emu_edge_hfix_func * const hfixtbl_avx2[11] = {
> > -    ff_emu_edge_hfix2_mmx,  ff_emu_edge_hfix4_mmx,  ff_emu_edge_hfix6_mmx,
> > +    ff_emu_edge_hfix2_sse2,  ff_emu_edge_hfix4_sse2,  ff_emu_edge_hfix6_sse2,
> >      ff_emu_edge_hfix8_avx2,  ff_emu_edge_hfix10_avx2, ff_emu_edge_hfix12_avx2,
> >      ff_emu_edge_hfix14_avx2, ff_emu_edge_hfix16_avx2, ff_emu_edge_hfix18_avx2,
> >      ff_emu_edge_hfix20_avx2, ff_emu_edge_hfix22_avx2
> > @@ -196,7 +196,7 @@ static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, const uint8_t *src,
> >                                                int h)
> >  {
> >      emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
> > -                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
> > +                     src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
> >                       hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
> >  }
> >
> > @@ -209,7 +209,7 @@ static av_noinline void emulated_edge_mc_avx2(uint8_t *buf, const uint8_t *src,
> >                                                int h)
> >  {
> >      emulated_edge_mc(buf, src, buf_stride, src_stride, block_w, block_h,
> > -                     src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
> > +                     src_x, src_y, w, h, vfixtbl_sse2, &ff_emu_edge_vvar_sse,
> >                       hfixtbl_avx2, &ff_emu_edge_hvar_avx2);
> >  }
> >  #endif /* HAVE_AVX2_EXTERNAL */
> 
> --
> Frank
> 


