[FFmpeg-devel] [PATCH 3/4] libswscale/x86/rgb2rgb: add uyvytoyuv422 avx2
chen
chenm003 at 163.com
Tue Sep 28 09:41:40 EEST 2021
The current algorithm could be improved. Could you combine the following optimization with your patches? The extra VPERM makes the code a little slower.
On Haswell
Current algorithm:
RSHIFT_COPY m6, m2, 1 ; UYVY UYVY -> YVYU YVY...
pand m6, m1 ; YxYx YxYx...
RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
pand m7, m1 ; YxYx YxYx...
packuswb m6, m7 ; YYYY YYYY...
Latency:
1 + 1 + 1 + 1 + 1 = 5
Proposed:
pshufb m6, m2, mX ; UYVY UYVY -> xxxx YYYY
pshufb m7, m3, mX
punpcklqdq m6, m7 ; YYYY YYYY
Latency:
1 + 1 + 1 = 3
I guess the current algorithm was written for compatibility with SSE2, because PSHUFB was only added in SSSE3.
Now that we are optimizing for AVX, AVX2 and AVX512, I suggest we use the proposed algorithm to get better performance.
Regards,
Min Chen
At 2021-09-28 13:34:03, "Wu Jianhua" <jianhua.wu at intel.com> wrote:
>With the accelerating by means of AVX2, the uyvytoyuv422 can be faster
>
>Performance data(Less is better):
> uyvytoyuv422_sse2 0.49381
> uyvytoyuv422_avx 0.42981
> uyvytoyuv422_avx2 0.27915
>
>Signed-off-by: Wu Jianhua <jianhua.wu at intel.com>
>---
> libswscale/x86/rgb2rgb.c | 6 +++++
> libswscale/x86/rgb_2_rgb.asm | 48 +++++++++++++++++++++++++++---------
> 2 files changed, 42 insertions(+), 12 deletions(-)
>
>diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
>index c9ff33ab77..a965a1755c 100644
>--- a/libswscale/x86/rgb2rgb.c
>+++ b/libswscale/x86/rgb2rgb.c
>@@ -164,6 +164,9 @@ void ff_uyvytoyuv422_sse2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> void ff_uyvytoyuv422_avx(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
> const uint8_t *src, int width, int height,
> int lumStride, int chromStride, int srcStride);
>+void ff_uyvytoyuv422_avx2(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
>+ const uint8_t *src, int width, int height,
>+ int lumStride, int chromStride, int srcStride);
> #endif
>
> av_cold void rgb2rgb_init_x86(void)
>@@ -216,5 +219,8 @@ av_cold void rgb2rgb_init_x86(void)
> if (EXTERNAL_AVX(cpu_flags)) {
> uyvytoyuv422 = ff_uyvytoyuv422_avx;
> }
>+ if (EXTERNAL_AVX2_FAST(cpu_flags)) {
>+ uyvytoyuv422 = ff_uyvytoyuv422_avx2;
>+ }
> #endif
> }
>diff --git a/libswscale/x86/rgb_2_rgb.asm b/libswscale/x86/rgb_2_rgb.asm
>index 3380a1272c..683bd067a5 100644
>--- a/libswscale/x86/rgb_2_rgb.asm
>+++ b/libswscale/x86/rgb_2_rgb.asm
>@@ -31,9 +31,16 @@ pb_shuffle0321: db 0, 3, 2, 1, 4, 7, 6, 5, 8, 11, 10, 9, 12, 15, 14, 13
> pb_shuffle1230: db 1, 2, 3, 0, 5, 6, 7, 4, 9, 10, 11, 8, 13, 14, 15, 12
> pb_shuffle3012: db 3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14
> pb_shuffle3210: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
>+pd_permd256_uv: dd 0, 4, 1, 5, 2, 6, 3, 7
>
> SECTION .text
>
>+%macro VPERM 5
>+%if mmsize == %2
>+ vperm%1 %3, %4, %5
>+%endif
>+%endmacro
>+
> %macro RSHIFT_COPY 3
> ; %1 dst ; %2 src ; %3 shift
> %if cpuflag(avx)
>@@ -198,11 +205,15 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> mov whalfq, wq
> shr whalfq, 1 ; whalf = width / 2
>
>- lea srcq, [srcq + wq * 2]
>+ lea srcq, [srcq + wq * 2]
> add ydstq, wq
> add udstq, whalfq
> add vdstq, whalfq
>
>+%if mmsize == 32
>+ movu m15, [pd_permd256_uv]
>+%endif
>+
> .loop_line:
> mov xq, wq
> mov wtwoq, wq
>@@ -251,8 +262,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>
> RSHIFT_COPY m7, m3, 1 ; UYVY UYVY -> YVYU YVY...
> pand m7, m1 ; YxYx YxYx...
>-
> packuswb m6, m7 ; YYYY YYYY...
>+
>+ VPERM q, 32, m6, m6, 0xd8
>+
> movu [ydstq + wq], m6
>
> ; extract y part 2
>@@ -261,8 +274,10 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
>
> RSHIFT_COPY m7, m5, 1 ; UYVY UYVY -> YVYU YVY...
> pand m7, m1 ; YxYx YxYx...
>+ packuswb m6, m7 ; YYYY YYYY...
>+
>+ VPERM q, 32, m6, m6, 0xd8
>
>- packuswb m6, m7 ; YYYY YYYY...
> movu [ydstq + wq + mmsize], m6
>
> ; extract uv
>@@ -275,17 +290,21 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> packuswb m4, m5 ; UVUV...
>
> ; U
>- pand m6, m2, m1 ; UxUx...
>- pand m7, m4, m1 ; UxUx...
>+ pand m6, m2, m1 ; UxUx...
>+ pand m7, m4, m1 ; UxUx...
>+ packuswb m6, m7 ; UUUU
>
>- packuswb m6, m7 ; UUUU
>- movu [udstq + whalfq], m6
>+ VPERM d, 32, m6, m15, m6
>
>+ movu [udstq + whalfq], m6
>
> ; V
>- psrlw m2, 8 ; VxVx...
>- psrlw m4, 8 ; VxVx...
>- packuswb m2, m4 ; VVVV
>+ psrlw m2, 8 ; VxVx...
>+ psrlw m4, 8 ; VxVx...
>+ packuswb m2, m4 ; VVVV
>+
>+ VPERM d, 32, m2, m15, m2
>+
> movu [vdstq + whalfq], m2
>
> add whalfq, mmsize
>@@ -294,13 +313,13 @@ cglobal uyvytoyuv422, 9, 14, 8, ydst, udst, vdst, src, w, h, lum_stride, chrom_s
> jl .loop_simd
>
> .end_line:
>- add srcq, src_strideq
>+ add srcq, src_strideq
> add ydstq, lum_strideq
> add udstq, chrom_strideq
> add vdstq, chrom_strideq
>
> ;restore initial state of line variable
>- mov wq, back_wq
>+ mov wq, back_wq
> mov xq, wq
> mov whalfq, wq
> shr whalfq, 1 ; whalf = width / 2
>@@ -316,4 +335,9 @@ UYVY_TO_YUV422
>
> INIT_XMM avx
> UYVY_TO_YUV422
>+
>+%if HAVE_AVX2_EXTERNAL
>+INIT_YMM avx2
>+UYVY_TO_YUV422
>+%endif
> %endif
>--
>2.17.1
>
>_______________________________________________
>ffmpeg-devel mailing list
>ffmpeg-devel at ffmpeg.org
>https://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>To unsubscribe, visit link above, or email
>ffmpeg-devel-request at ffmpeg.org with subject "unsubscribe".
More information about the ffmpeg-devel
mailing list