[FFmpeg-devel] [PATCH] avfilter: add hflip x86 SIMD
Martin Vignali
martin.vignali at gmail.com
Sun Dec 3 20:55:05 EET 2017
> Can you post a disassembly of hflip_byte_c?
>
>
> in O1 : clang -S -O1 test_asm_gen.c
## clang -O1 output for hflip_byte_c (Mach-O, AT&T syntax).
## A plain one-byte-per-iteration copy loop, no vectorization.
## Register use as seen in the code below:
##   %rdi = read pointer, decremented after every byte
##   %rsi = write pointer, incremented after every byte
##   %edx = byte count, tested as a signed 32-bit value
## NOTE(review): the C prototype is not part of this listing; the argument
## roles above are inferred from how the registers are used -- confirm
## against the C source.
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
.globl _hflip_byte_c
.p2align 4, 0x90
_hflip_byte_c: ## @hflip_byte_c
.cfi_startproc
## BB#0:
## Frame-pointer prologue; the .cfi_* directives describe it for unwinding.
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
## Nothing to do when the count is zero or negative.
testl %edx, %edx
jle LBB0_3
## BB#1:
## %rax = count zero-extended to 64 bits; used as the down-counter.
movl %edx, %eax
.p2align 4, 0x90
LBB0_2: ## =>This Inner Loop Header: Depth=1
## Copy one byte, then step the read pointer down and the write pointer up.
movzbl (%rdi), %ecx
movb %cl, (%rsi)
decq %rdi
incq %rsi
decq %rax
jne LBB0_2
LBB0_3:
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
in O2 or O3 : clang -S -O3 test_asm_gen.c
If I understand correctly, this is the same idea as Paul's patch,
but it processes two xmm registers per iteration of the main loop.
.section __TEXT,__text,regular,pure_instructions
.macosx_version_min 10, 12
## 16-byte constant placed in the literal16 section and aligned to 16 bytes
## (.p2align 4); the function below loads it with movdqa and uses it as the
## pshufb control operand.
## Byte i of the constant holds 15 - i, so the shuffle reverses the byte
## order of a 16-byte register.
.section __TEXT,__literal16,16byte_literals
.p2align 4
LCPI0_0:
.byte 15 ## 0xf
.byte 14 ## 0xe
.byte 13 ## 0xd
.byte 12 ## 0xc
.byte 11 ## 0xb
.byte 10 ## 0xa
.byte 9 ## 0x9
.byte 8 ## 0x8
.byte 7 ## 0x7
.byte 6 ## 0x6
.byte 5 ## 0x5
.byte 4 ## 0x4
.byte 3 ## 0x3
.byte 2 ## 0x2
.byte 1 ## 0x1
.byte 0 ## 0x0
.section __TEXT,__text,regular,pure_instructions
.globl _hflip_byte_c
.p2align 4, 0x90
## ---------------------------------------------------------------------
## _hflip_byte_c -- clang -O3 output (Mach-O, AT&T syntax).
##
## Copies %edx bytes, reading downward from (%rdi) and writing upward
## from (%rsi), i.e. the output is the input in reverse byte order.
##
## Register use as seen in the code below:
##   %rdi = read pointer (address of the first byte to read; later bytes
##          are at lower addresses)
##   %rsi = write pointer (lowest address written)
##   %edx = byte count, tested as a signed 32-bit value
## NOTE(review): the C prototype is not part of this listing; argument
## roles are inferred from register use -- confirm against the C source.
##
## Structure:
##   1. count <= 0            -> return
##   2. count >= 32 and the two buffers do not overlap
##                            -> 32 bytes per iteration with two
##                               unaligned 16-byte loads, pshufb byte
##                               reversal, two unaligned stores
##   3. remaining bytes       -> (remaining % 4) single-byte copies,
##                               then a 4x unrolled byte loop
##
## Scratch registers written: rax, rcx, rdx, r8-r11, xmm0-xmm2, flags.
## ---------------------------------------------------------------------
_hflip_byte_c: ## @hflip_byte_c
.cfi_startproc
## BB#0:
## Frame-pointer prologue; the .cfi_* directives describe it for unwinding.
pushq %rbp
Ltmp0:
.cfi_def_cfa_offset 16
Ltmp1:
.cfi_offset %rbp, -16
movq %rsp, %rbp
Ltmp2:
.cfi_def_cfa_register %rbp
## kill: %EDX<def> %EDX<kill> %RDX<def>
## Nothing to do when the count is zero or negative.
testl %edx, %edx
jle LBB0_17
## BB#1:
## %r8 = count zero-extended to 64 bits (kept for the whole function).
movl %edx, %r8d
cmpl $32, %edx
jae LBB0_3
## BB#2:
## Fewer than 32 bytes: %r11 (bytes already copied) = 0, scalar code only.
xorl %r11d, %r11d
jmp LBB0_11
LBB0_3:
## %edx = count % 32 (scalar tail), %r11 = count - tail (vector byte count).
andl $31, %edx
movq %r8, %r11
subq %rdx, %r11
je LBB0_7
## BB#4:
## Overlap check, first half: the highest byte read is at %rdi, so if
## %rdi + 1 <= %rsi the whole read range lies below the write range.
leaq 1(%rdi), %rax
cmpq %rsi, %rax
jbe LBB0_8
## BB#5:
## Overlap check, second half: %r9 = one past the last byte written,
## %rax = %rdi + 1 - count = lowest byte read. If the lowest byte read is
## at or above %r9 the read range lies above the write range.
leaq (%rsi,%r8), %r9
movl $1, %eax
subq %r8, %rax
addq %rdi, %rax
cmpq %r9, %rax
jae LBB0_8
LBB0_7:
## Ranges may overlap (or there is no vector work): copy everything with
## the scalar loops, starting from zero bytes done.
xorl %r11d, %r11d
jmp LBB0_11
LBB0_8:
## Vector loop setup.
##   %r9  = %rdi - 15: lowest address of the topmost 16 bytes to read
##   %rax = %rsi + 16: address of the second 16-byte output chunk
##   %xmm0 = byte-reversal shuffle control
##   %r10 = vector bytes left to copy (multiple of 32)
leaq -15(%rdi), %r9
leaq 16(%rsi), %rax
movdqa LCPI0_0(%rip), %xmm0 ## xmm0 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
movq %r11, %r10
.p2align 4, 0x90
LBB0_9: ## =>This Inner Loop Header: Depth=1
## Load two 16-byte chunks (unaligned), reverse the bytes inside each, and
## store them in swapped order: the higher-addressed chunk (%xmm2) goes to
## the lower output slot, the lower-addressed chunk (%xmm1) to the upper.
movdqu -16(%r9), %xmm1
movdqu (%r9), %xmm2
pshufb %xmm0, %xmm2
pshufb %xmm0, %xmm1
movdqu %xmm2, -16(%rax)
movdqu %xmm1, (%rax)
addq $-32, %r9
addq $32, %rax
addq $-32, %r10
jne LBB0_9
## BB#10:
## Done if the count was an exact multiple of 32.
testl %edx, %edx
je LBB0_17
LBB0_11:
## Scalar remainder. %r11 = bytes already copied.
##   %rax = remaining = count - %r11, then reduced to remaining % 4
##   %r9  = remaining - 1 (checked at LBB0_14)
movl %r8d, %eax
subl %r11d, %eax
leaq -1(%r8), %r9
subq %r11, %r9
andq $3, %rax
je LBB0_14
## BB#12:
## Peel (remaining % 4) bytes one at a time so the unrolled loop below
## only sees a multiple of 4. %rdx = current read address; %rax is
## negated and counted up to zero.
movq %rdi, %rdx
subq %r11, %rdx
negq %rax
.p2align 4, 0x90
LBB0_13: ## =>This Inner Loop Header: Depth=1
movzbl (%rdx), %ecx
movb %cl, (%rsi,%r11)
incq %r11
decq %rdx
incq %rax
jne LBB0_13
LBB0_14:
## If fewer than 4 bytes remained in total, the peel loop copied them all.
cmpq $3, %r9
jb LBB0_17
## BB#15:
## 4x unrolled byte loop.
##   %r8  = bytes still to copy (multiple of 4)
##   %rdi = current read address
##   %rax = current write address + 3
subq %r11, %r8
subq %r11, %rdi
leaq 3(%rsi,%r11), %rax
.p2align 4, 0x90
LBB0_16: ## =>This Inner Loop Header: Depth=1
movzbl (%rdi), %ecx
movb %cl, -3(%rax)
movzbl -1(%rdi), %ecx
movb %cl, -2(%rax)
movzbl -2(%rdi), %ecx
movb %cl, -1(%rax)
movzbl -3(%rdi), %ecx
movb %cl, (%rax)
addq $-4, %rdi
addq $4, %rax
addq $-4, %r8
jne LBB0_16
LBB0_17:
popq %rbp
retq
.cfi_endproc
.subsections_via_symbols
More information about the ffmpeg-devel
mailing list