[FFmpeg-devel] [PATCH 2/2] all: do standards compliant absdiff computation

Clément Bœsch u at pkh.me
Sun Aug 23 23:28:20 CEST 2015


On Sun, Aug 23, 2015 at 11:23:54PM +0200, Clément Bœsch wrote:
> On Sun, Aug 23, 2015 at 11:58:23AM -0400, Ganesh Ajjanagadde wrote:
> [...]
> > diff --git a/libavfilter/vf_hqx.c b/libavfilter/vf_hqx.c
> > index fa15d9c..0178793 100644
> > --- a/libavfilter/vf_hqx.c
> > +++ b/libavfilter/vf_hqx.c
> > @@ -65,9 +65,9 @@ static av_always_inline int yuv_diff(uint32_t yuv1, uint32_t yuv2)
> >  #define YMASK 0xff0000
> >  #define UMASK 0x00ff00
> >  #define VMASK 0x0000ff
> > -    return abs((yuv1 & YMASK) - (yuv2 & YMASK)) > (48 << 16) ||
> > -           abs((yuv1 & UMASK) - (yuv2 & UMASK)) > ( 7 <<  8) ||
> > -           abs((yuv1 & VMASK) - (yuv2 & VMASK)) > ( 6 <<  0);
> > +    return FFUABSDIFF(yuv1 & YMASK, yuv2 & YMASK) > (48 << 16) ||
> > +           FFUABSDIFF(yuv1 & UMASK, yuv2 & UMASK) > ( 7 <<  8) ||
> > +           FFUABSDIFF(yuv1 & VMASK, yuv2 & VMASK) > ( 6 <<  0);
> 
> This is one of the bottleneck functions of the filter. How does it affect
> speed? Can you compare the generated ASM?
> 

To answer my second question:

[/tmp]☭ cat a.c
#include <stdlib.h>
#include <stdint.h>

#define YMASK 0xff0000
#define UMASK 0x00ff00
#define VMASK 0x0000ff

/*
 * Absolute difference of a and b: the operands are compared first and the
 * smaller one is subtracted from the larger one, so the subtraction itself
 * never produces a value below zero.
 * Each argument appears more than once in the expansion and is evaluated
 * twice, so arguments must be free of side effects.
 */
#define FFUABSDIFF(a,b) (((a) > (b)) ? ((a)-(b)) : ((b)-(a)))

/*
 * abs()-based variant of the comparison.
 *
 * Masks yuv1 and yuv2 with YMASK, UMASK and VMASK in turn and returns 1 as
 * soon as the absolute difference of one pair of masked fields exceeds its
 * threshold (48 << 16 under YMASK, 7 << 8 under UMASK, 6 << 0 under VMASK);
 * returns 0 when all three differences are within their thresholds. The
 * thresholds are shifted so that they line up with the field selected by
 * the corresponding mask.
 *
 * NOTE(review): both operands are uint32_t, so where uint32_t is unsigned
 * int the subtraction is done in unsigned arithmetic and wraps around when
 * the second field is the larger one; that wrapped value is then converted
 * to int for abs(). The conversion of an out-of-range unsigned value to int
 * is implementation-defined -- presumably the reason a FFUABSDIFF-based
 * variant is being compared against this one; confirm.
 */
int yuv_diff_0(uint32_t yuv1, uint32_t yuv2)
{
    return abs((yuv1 & YMASK) - (yuv2 & YMASK)) > (48 << 16) ||
           abs((yuv1 & UMASK) - (yuv2 & UMASK)) > ( 7 <<  8) ||
           abs((yuv1 & VMASK) - (yuv2 & VMASK)) > ( 6 <<  0);
}

/*
 * FFUABSDIFF-based variant of the comparison.
 *
 * Performs the same three threshold tests as yuv_diff_0 (48 << 16 under
 * YMASK, 7 << 8 under UMASK, 6 << 0 under VMASK) and likewise returns 1 if
 * any masked field differs by more than its threshold, 0 otherwise.
 * Instead of calling abs() on a raw difference, each test goes through
 * FFUABSDIFF, which compares the two masked fields and subtracts the
 * smaller from the larger.
 */
int yuv_diff_1(uint32_t yuv1, uint32_t yuv2)
{
    return FFUABSDIFF(yuv1 & YMASK, yuv2 & YMASK) > (48 << 16) ||
           FFUABSDIFF(yuv1 & UMASK, yuv2 & UMASK) > ( 7 <<  8) ||
           FFUABSDIFF(yuv1 & VMASK, yuv2 & VMASK) > ( 6 <<  0);
}
[/tmp]☭ gcc -Wall -O2 -c a.c && objdump -d -Mintel a.o

a.o:     file format elf64-x86-64


Disassembly of section .text:

0000000000000000 <yuv_diff_0>:
   0:   89 f8                   mov    eax,edi
   2:   89 f2                   mov    edx,esi
   4:   81 e2 00 00 ff 00       and    edx,0xff0000
   a:   25 00 00 ff 00          and    eax,0xff0000
   f:   29 d0                   sub    eax,edx
  11:   99                      cdq    
  12:   31 d0                   xor    eax,edx
  14:   29 d0                   sub    eax,edx
  16:   89 c2                   mov    edx,eax
  18:   b8 01 00 00 00          mov    eax,0x1
  1d:   81 fa 00 00 30 00       cmp    edx,0x300000
  23:   7f 3e                   jg     63 <yuv_diff_0+0x63>
  25:   89 fa                   mov    edx,edi
  27:   89 f1                   mov    ecx,esi
  29:   81 e1 00 ff 00 00       and    ecx,0xff00
  2f:   81 e2 00 ff 00 00       and    edx,0xff00
  35:   29 ca                   sub    edx,ecx
  37:   89 d1                   mov    ecx,edx
  39:   c1 f9 1f                sar    ecx,0x1f
  3c:   31 ca                   xor    edx,ecx
  3e:   29 ca                   sub    edx,ecx
  40:   81 fa 00 07 00 00       cmp    edx,0x700
  46:   7f 1b                   jg     63 <yuv_diff_0+0x63>
  48:   40 0f b6 ff             movzx  edi,dil
  4c:   40 0f b6 f6             movzx  esi,sil
  50:   29 f7                   sub    edi,esi
  52:   89 f8                   mov    eax,edi
  54:   c1 f8 1f                sar    eax,0x1f
  57:   31 c7                   xor    edi,eax
  59:   29 c7                   sub    edi,eax
  5b:   31 c0                   xor    eax,eax
  5d:   83 ff 06                cmp    edi,0x6
  60:   0f 9f c0                setg   al
  63:   f3 c3                   repz ret 
  65:   90                      nop
  66:   66 2e 0f 1f 84 00 00    nop    WORD PTR cs:[rax+rax*1+0x0]
  6d:   00 00 00 

0000000000000070 <yuv_diff_1>:
  70:   89 fa                   mov    edx,edi
  72:   89 f0                   mov    eax,esi
  74:   81 e2 00 00 ff 00       and    edx,0xff0000
  7a:   25 00 00 ff 00          and    eax,0xff0000
  7f:   39 c2                   cmp    edx,eax
  81:   76 4d                   jbe    d0 <yuv_diff_1+0x60>
  83:   29 c2                   sub    edx,eax
  85:   b8 01 00 00 00          mov    eax,0x1
  8a:   81 fa 00 00 30 00       cmp    edx,0x300000
  90:   77 7e                   ja     110 <yuv_diff_1+0xa0>
  92:   89 fa                   mov    edx,edi
  94:   89 f0                   mov    eax,esi
  96:   81 e2 00 ff 00 00       and    edx,0xff00
  9c:   25 00 ff 00 00          and    eax,0xff00
  a1:   39 c2                   cmp    edx,eax
  a3:   76 43                   jbe    e8 <yuv_diff_1+0x78>
  a5:   29 c2                   sub    edx,eax
  a7:   b8 01 00 00 00          mov    eax,0x1
  ac:   81 fa 00 07 00 00       cmp    edx,0x700
  b2:   77 74                   ja     128 <yuv_diff_1+0xb8>
  b4:   40 0f b6 ff             movzx  edi,dil
  b8:   40 0f b6 f6             movzx  esi,sil
  bc:   39 f7                   cmp    edi,esi
  be:   76 58                   jbe    118 <yuv_diff_1+0xa8>
  c0:   29 f7                   sub    edi,esi
  c2:   31 c0                   xor    eax,eax
  c4:   83 ff 06                cmp    edi,0x6
  c7:   0f 97 c0                seta   al
  ca:   c3                      ret    
  cb:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
  d0:   29 d0                   sub    eax,edx
  d2:   89 c2                   mov    edx,eax
  d4:   b8 01 00 00 00          mov    eax,0x1
  d9:   81 fa 00 00 30 00       cmp    edx,0x300000
  df:   76 b1                   jbe    92 <yuv_diff_1+0x22>
  e1:   f3 c3                   repz ret 
  e3:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
  e8:   29 d0                   sub    eax,edx
  ea:   89 c2                   mov    edx,eax
  ec:   b8 01 00 00 00          mov    eax,0x1
  f1:   81 fa 00 07 00 00       cmp    edx,0x700
  f7:   77 e8                   ja     e1 <yuv_diff_1+0x71>
  f9:   40 0f b6 ff             movzx  edi,dil
  fd:   40 0f b6 f6             movzx  esi,sil
 101:   39 f7                   cmp    edi,esi
 103:   76 13                   jbe    118 <yuv_diff_1+0xa8>
 105:   eb b9                   jmp    c0 <yuv_diff_1+0x50>
 107:   66 0f 1f 84 00 00 00    nop    WORD PTR [rax+rax*1+0x0]
 10e:   00 00 
 110:   f3 c3                   repz ret 
 112:   66 0f 1f 44 00 00       nop    WORD PTR [rax+rax*1+0x0]
 118:   29 fe                   sub    esi,edi
 11a:   31 c0                   xor    eax,eax
 11c:   83 fe 06                cmp    esi,0x6
 11f:   0f 97 c0                seta   al
 122:   c3                      ret    
 123:   0f 1f 44 00 00          nop    DWORD PTR [rax+rax*1+0x0]
 128:   f3 c3                   repz ret 
[/tmp]☭ 

I must say I'm slightly uncomfortable with that change.

-- 
Clément B.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 473 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20150823/ee0e0291/attachment.sig>


More information about the ffmpeg-devel mailing list