[FFmpeg-devel] [PATCH 6/6] swscale/yuv2rgb/x86: remove mmx/mmxext yuv2rgb functions

Ramiro Polla ramiro.polla at gmail.com
Mon Jun 17 01:28:49 EEST 2024


These functions are either slower or barely faster than the C LUT
yuv2rgb code.
---
 libswscale/x86/yuv2rgb.c          | 51 -----------------
 libswscale/x86/yuv2rgb_template.c |  4 --
 libswscale/x86/yuv_2_rgb.asm      | 93 +------------------------------
 3 files changed, 3 insertions(+), 145 deletions(-)

diff --git a/libswscale/x86/yuv2rgb.c b/libswscale/x86/yuv2rgb.c
index 6754062245..41dfa80f33 100644
--- a/libswscale/x86/yuv2rgb.c
+++ b/libswscale/x86/yuv2rgb.c
@@ -41,25 +41,8 @@
 
 #define DITHER1XBPP // only for MMX
 
-//MMX versions
-#if HAVE_MMX
-#undef RENAME
-#define COMPILE_TEMPLATE_MMX
-#define RENAME(a) a ## _mmx
-#include "yuv2rgb_template.c"
-#undef COMPILE_TEMPLATE_MMX
-#endif /* HAVE_MMX */
-
-// MMXEXT versions
-#undef RENAME
-#define COMPILE_TEMPLATE_MMXEXT
-#define RENAME(a) a ## _mmxext
-#include "yuv2rgb_template.c"
-#undef COMPILE_TEMPLATE_MMXEXT
-
 //SSSE3 versions
 #undef RENAME
-#define COMPILE_TEMPLATE_SSSE3
 #define RENAME(a) a ## _ssse3
 #include "yuv2rgb_template.c"
 
@@ -99,40 +82,6 @@ av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
         }
     }
 
-    if (EXTERNAL_MMXEXT(cpu_flags)) {
-        switch (c->dstFormat) {
-        case AV_PIX_FMT_RGB24:
-            return yuv420_rgb24_mmxext;
-        case AV_PIX_FMT_BGR24:
-            return yuv420_bgr24_mmxext;
-        }
-    }
-
-    if (EXTERNAL_MMX(cpu_flags)) {
-        switch (c->dstFormat) {
-            case AV_PIX_FMT_RGB32:
-                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
-#if CONFIG_SWSCALE_ALPHA
-                    return yuva420_rgb32_mmx;
-#endif
-                    break;
-                } else
-                    return yuv420_rgb32_mmx;
-            case AV_PIX_FMT_BGR32:
-                if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
-#if CONFIG_SWSCALE_ALPHA
-                    return yuva420_bgr32_mmx;
-#endif
-                    break;
-                } else
-                    return yuv420_bgr32_mmx;
-            case AV_PIX_FMT_RGB565:
-                return yuv420_rgb16_mmx;
-            case AV_PIX_FMT_RGB555:
-                return yuv420_rgb15_mmx;
-        }
-    }
-
 #endif /* HAVE_X86ASM */
     return NULL;
 }
diff --git a/libswscale/x86/yuv2rgb_template.c b/libswscale/x86/yuv2rgb_template.c
index 596943bb73..a4741e6873 100644
--- a/libswscale/x86/yuv2rgb_template.c
+++ b/libswscale/x86/yuv2rgb_template.c
@@ -47,7 +47,6 @@ extern void RENAME(ff_yuv_420_bgr24)(x86_reg index, uint8_t *image, const uint8_
                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                      const uint8_t *py_2index);
 
-#ifndef COMPILE_TEMPLATE_MMXEXT
 extern void RENAME(ff_yuv_420_rgb15)(x86_reg index, uint8_t *image, const uint8_t *pu_index,
                                      const uint8_t *pv_index, const uint64_t *pointer_c_dither,
                                      const uint8_t *py_2index);
@@ -163,9 +162,7 @@ static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
     }
     return srcSliceH;
 }
-#endif
 
-#if !defined(COMPILE_TEMPLATE_MMX)
 static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
                                        int srcStride[],
                                        int srcSliceY, int srcSliceH,
@@ -193,4 +190,3 @@ static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
     }
     return srcSliceH;
 }
-#endif
diff --git a/libswscale/x86/yuv_2_rgb.asm b/libswscale/x86/yuv_2_rgb.asm
index a1f9134e08..b67ab162d2 100644
--- a/libswscale/x86/yuv_2_rgb.asm
+++ b/libswscale/x86/yuv_2_rgb.asm
@@ -38,12 +38,6 @@ pb_e0:   times 16 db 224
 pb_03:   times 16 db 3
 pb_07:   times 16 db 7
 
-mask_1101: dw -1, -1,  0, -1
-mask_0010: dw  0,  0, -1,  0
-mask_0110: dw  0, -1, -1,  0
-mask_1001: dw -1,  0,  0, -1
-mask_0100: dw  0, -1,  0,  0
-
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -55,14 +49,6 @@ SECTION .text
 ;
 ;-----------------------------------------------------------------------------
 
-%macro MOV_H2L 1
-%if mmsize == 8
-    psrlq %1, 32
-%else ; mmsize == 16
-    psrldq %1, 8
-%endif
-%endmacro
-
 %macro yuv2rgb_fn 3
 
 %if %3 == 32
@@ -91,18 +77,6 @@ SECTION .text
 %define m_blue m1
 %endif
 
-%if mmsize == 8
-%define time_num 1
-%define reg_num 8
-%define y_offset [pointer_c_ditherq + 8  * 8]
-%define u_offset [pointer_c_ditherq + 9  * 8]
-%define v_offset [pointer_c_ditherq + 10 * 8]
-%define ug_coff  [pointer_c_ditherq + 7  * 8]
-%define vg_coff  [pointer_c_ditherq + 6  * 8]
-%define y_coff   [pointer_c_ditherq + 3  * 8]
-%define ub_coff  [pointer_c_ditherq + 5  * 8]
-%define vr_coff  [pointer_c_ditherq + 4  * 8]
-%elif mmsize == 16
 %define time_num 2
 %if ARCH_X86_32
 %define reg_num 8
@@ -125,13 +99,11 @@ SECTION .text
 %define ub_coff  m14
 %define vr_coff  m15
 %endif ; ARCH_X86_32/64
-%endif ; coeff define mmsize == 8/16
 
 cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
 
 %if ARCH_X86_64
     movsxd indexq, indexd
-%if mmsize == 16
     VBROADCASTSD y_offset, [pointer_c_ditherq + 8  * 8]
     VBROADCASTSD u_offset, [pointer_c_ditherq + 9  * 8]
     VBROADCASTSD v_offset, [pointer_c_ditherq + 10 * 8]
@@ -141,7 +113,6 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     VBROADCASTSD ub_coff,  [pointer_c_ditherq + 5  * 8]
     VBROADCASTSD vr_coff,  [pointer_c_ditherq + 4  * 8]
 %endif
-%endif
 .loop0:
     movu m_y, [py_2indexq + 2 * indexq]
     movh m_u, [pu_indexq  +     indexq]
@@ -157,7 +128,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     psllw m1, 3
     psllw m6, 3
     psllw m7, 3
-%if (ARCH_X86_32 && mmsize == 16)
+%if ARCH_X86_32
     VBROADCASTSD m2, mu_offset
     VBROADCASTSD m3, mv_offset
     VBROADCASTSD m4, my_offset
@@ -176,7 +147,7 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     pmulhw m0, m5
     VBROADCASTSD m4, mvr_coff
     pmulhw m1, m4
-%else ; ARCH_X86_64 || mmsize == 8
+%else ; ARCH_X86_64
     psubsw m0, u_offset ; U = U - 128
     psubsw m1, v_offset ; V = V - 128
     psubw  m6, y_offset
@@ -207,49 +178,10 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     packuswb m2, m7 ; G0 G2 G4 G6 ... G1 G3 G5 G7 ...
     mova m3, m_red
     mova m6, m_blue
-    MOV_H2L m_red
+    psrldq m_red, 8
     punpcklbw m3, m2     ; R0 G0 R2 G2 R4 G4 R6 G6 R8 G8 ...
     punpcklbw m6, m_red  ; B0 R1 B2 R3 B4 R5 B6 R7 B8 R9 ...
-    mova m5, m3
     punpckhbw m2, m_blue ; G1 B1 G3 B3 G5 B5 G7 B7 G9 B9 ...
-%if  mmsize == 8
-    punpcklwd m3 ,m6     ; R0 G0 B0 R1 R2 G2 B2 R3
-    punpckhwd m5, m6     ; R4 G4 B4 R5 R6 G6 B6 R7
-%if cpuflag(mmxext)
-    pshufw m1, m2, 0xc6
-    pshufw m6, m3, 0x84
-    pshufw m7, m5, 0x38
-    pand m6, [mask_1101] ; R0 G0 B0 R1 -- -- R2 G2
-    movq m0, m1
-    pand m7, [mask_0110] ; -- -- R6 G6 B6 R7 -- --
-    movq m2, m1
-    pand m1, [mask_0100] ; -- -- G3 B3 -- -- -- --
-    psrlq m3, 48         ; B2 R3 -- -- -- -- -- --
-    pand m0, [mask_0010] ; -- -- -- -- G1 B1 -- --
-    psllq m5, 32         ; -- -- -- -- R4 G4 B4 R5
-    pand m2, [mask_1001] ; G5 B5 -- -- -- -- G7 B7
-    por m1, m3
-    por m0, m6
-    por m1, m5
-    por m2, m7
-    movntq [imageq], m0
-    movntq [imageq + 8], m1
-    movntq [imageq + 16], m2
-%else ; cpuflag(mmx)
-    movd [imageq], m3      ; R0 G0 R2 G2
-    movd [imageq + 4], m2  ; G1 B1
-    psrlq m3, 32
-    psrlq m2, 16
-    movd [imageq + 6], m3  ; R2 G2 B2 R3
-    movd [imageq + 10], m2 ; G3 B3
-    psrlq m2, 16
-    movd [imageq + 12], m5 ; R4 G4 B4 R5
-    movd [imageq + 16], m2 ; G5 B5
-    psrlq m5, 32
-    movd [imageq + 20], m2 ; -- -- G7 B7
-    movd [imageq + 18], m5 ; R6 G6 B6 R7
-%endif ; cpuflag
-%else ; mmsize == 16
     pshufb m3, [rgb24_shuf1] ; r0  g0  r6  g6  r12 g12 r2  g2  r8  g8  r14 g14 r4  g4  r10 g10
     pshufb m6, [rgb24_shuf2] ; b10 r11 b0  r1  b6  r7  b12 r13 b2  r3  b8  r9  b14 r15 b4  r5
     pshufb m2, [rgb24_shuf3] ; g5  b5  g11 b11 g1  b1  g7  b7  g13 b13 g3  b3  g9  b9  g15 b15
@@ -274,7 +206,6 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     movu [imageq], m0
     movu [imageq + 16], m1
     movu [imageq + 32], m2
-%endif ; mmsize = 16
 %else ; PACK RGB15/16/32
     packuswb m0, m1
     packuswb m3, m5
@@ -309,18 +240,12 @@ cglobal %1_420_%2%3, GPR_num, GPR_num, reg_num, parameters
     movu [imageq + 24 * time_num], m_alpha
 %else ; PACK RGB15/16
 %define depth 2
-%if cpuflag(ssse3)
     %define red_dither m3
     %define green_dither m4
     %define blue_dither m5
     VBROADCASTSD red_dither,   [pointer_c_ditherq + 0 * 8]
     VBROADCASTSD green_dither, [pointer_c_ditherq + 1 * 8]
     VBROADCASTSD blue_dither,  [pointer_c_ditherq + 2 * 8]
-%else ; cpuflag(mmx/mmxext)
-%define blue_dither  [pointer_c_ditherq + 2  * 8]
-%define green_dither [pointer_c_ditherq + 1  * 8]
-%define red_dither   [pointer_c_ditherq + 0  * 8]
-%endif
 %if %3 == 15
 %define gmask pb_03
 %define isRGB15 1
@@ -358,18 +283,6 @@ RET
 
 %endmacro
 
-INIT_MMX mmx
-yuv2rgb_fn yuv,  rgb, 32
-yuv2rgb_fn yuv,  bgr, 32
-yuv2rgb_fn yuva, rgb, 32
-yuv2rgb_fn yuva, bgr, 32
-yuv2rgb_fn yuv,  rgb, 15
-yuv2rgb_fn yuv,  rgb, 16
-
-INIT_MMX mmxext
-yuv2rgb_fn yuv, rgb, 24
-yuv2rgb_fn yuv, bgr, 24
-
 INIT_XMM ssse3
 yuv2rgb_fn yuv,  rgb, 24
 yuv2rgb_fn yuv,  bgr, 24
-- 
2.30.2



More information about the ffmpeg-devel mailing list