[FFmpeg-devel] [PATCH] Port x264 SSE2 deblocking code to H.264 decoder

Mike Melanson mike
Sat Dec 20 04:18:16 CET 2008

Jason Garrett-Glaser wrote:
> On Thu, Dec 18, 2008 at 8:28 PM, Jason Garrett-Glaser
> <darkshikari at gmail.com> wrote:
>> On Thu, Dec 18, 2008 at 5:34 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
>>> On Thu, Dec 18, 2008 at 05:22:51PM -0800, Jason Garrett-Glaser wrote:
>>>> On Thu, Dec 18, 2008 at 5:09 PM, Michael Niedermayer <michaelni at gmx.at> wrote:
>>>>> On Thu, Dec 18, 2008 at 04:47:24PM -0800, Jason Garrett-Glaser wrote:
>>>>>> OK, now we have luma_intra in DSPutil, so this should be easier.
>>>>>> Michael: how should we rename the x264 functions?  My thought was just
>>>>>> to s/x264/ff_h264/ or something of the sort, which would modify the
>>>>>> code from x264's version but make it trivial to modify the code before
>>>>>> committing any updates from x264.  It wouldn't need ugly #defines
>>>>>> either.
>>>>> didn't loren post some patch that changed cglobal to add a prefix
>>>>> automagically ...
>>>> Shouldn't that be a separate patch?  If it's fine, can I commit his
>>>> patch now then?
>>> I've approved his patch, so yes you can apply it of course
>> applied (sorry for forgetting credit, forgot to add Loren's name to
>> commit message).
>> Updated x264 deblock patch attached.
>> Dark Shikari
> Small error in patch fixed.

In case you did not see my response on ffmpeg-cvslog, this code causes 
the H.264 decoder to segfault (on about 90 of the conformance samples) 
when compiled with Intel's C Compiler, which we are sort of trying to 
support. I had time to look at it a little more but did not find much 
more useful information. I know that a build from a straight 
'./configure --cc="icc"' config is fine, but not if --enable-gpl is also 
specified.

Here comes the info dump, using the sample from this test spec:

~/build-icc$ gdb ./ffmpeg_g
GNU gdb 6.8-debian
Copyright (C) 2008 Free Software Foundation, Inc.
License GPLv3+: GNU GPL version 3 or later <http://gnu.org/licenses/gpl.html>
This is free software: you are free to change and redistribute it.
There is NO WARRANTY, to the extent permitted by law.  Type "show copying"
and "show warranty" for details.
This GDB was configured as "i486-linux-gnu"...
(gdb) r -i /mnt/fate-suite/h264-conformance/BA1_FT_C.264 -f framecrc -
Starting program: /home/melanson/build-icc/ffmpeg_g -i 
/mnt/fate-suite/h264-conformance/BA1_FT_C.264 -f framecrc -
FFmpeg version SVN-r16243, Copyright (c) 2000-2008 Fabrice Bellard, et al.
   configuration: --cc=/opt/intel/cc/10.1.015/bin/icc --enable-gpl
   libavutil     49.12. 0 / 49.12. 0
   libavcodec    52. 7. 0 / 52. 7. 0
   libavformat   52.23. 1 / 52.23. 1
   libavdevice   52. 1. 0 / 52. 1. 0
   built on Dec 19 2008 18:49:39, gcc: Intel(R) C++ gcc 4.2 mode

Program received signal SIGSEGV, Segmentation fault.
ff_x264_deblock_v_luma_intra_sse2 ()
     at /home/melanson/fate/source/libavcodec/i386/h264_deblock_sse2.asm:743
Current language:  auto; currently asm

(gdb) bt
#0  ff_x264_deblock_v_luma_intra_sse2 ()
     at /home/melanson/fate/source/libavcodec/i386/h264_deblock_sse2.asm:743
#1  0xbfe773b0 in ?? ()
Backtrace stopped: previous frame inner to this frame (corrupt stack?)

(gdb) disass $pc-32 $pc+32
Dump of assembler code from 0x838e9d2 to 0x838ea12:
0x0838e9d2 <ff_x264_deblock_v_luma_intra_sse2+18>:	sbb    $0x83,%al
0x0838e9d4 <ff_x264_deblock_v_luma_intra_sse2+20>:	in     (%dx),%al
0x0838e9d5 <ff_x264_deblock_v_luma_intra_sse2+21>:	pusha
0x0838e9d6 <ff_x264_deblock_v_luma_intra_sse2+22>:	lea    0x0(,%ecx,4),%esi
0x0838e9dd <ff_x264_deblock_v_luma_intra_sse2+29>:	lea    (%ecx,%ecx,2),%edi
0x0838e9e0 <ff_x264_deblock_v_luma_intra_sse2+32>:	dec    %edx
0x0838e9e1 <ff_x264_deblock_v_luma_intra_sse2+33>:	jl     0x838ed6e 
0x0838e9e7 <ff_x264_deblock_v_luma_intra_sse2+39>:	neg    %esi
0x0838e9e9 <ff_x264_deblock_v_luma_intra_sse2+41>:	dec    %ebx
0x0838e9ea <ff_x264_deblock_v_luma_intra_sse2+42>:	jl     0x838ed6e 
0x0838e9f0 <ff_x264_deblock_v_luma_intra_sse2+48>:	add    %eax,%esi
0x0838e9f2 <ff_x264_deblock_v_luma_intra_sse2+50>:	movaps 
0x0838e9f6 <ff_x264_deblock_v_luma_intra_sse2+54>:	movaps 
0x0838e9fa <ff_x264_deblock_v_luma_intra_sse2+58>:	movaps (%eax),%xmm2
0x0838e9fd <ff_x264_deblock_v_luma_intra_sse2+61>:	movaps 
0x0838ea01 <ff_x264_deblock_v_luma_intra_sse2+65>:	movd   %edx,%xmm4
0x0838ea05 <ff_x264_deblock_v_luma_intra_sse2+69>:	movd   %ebx,%xmm5
0x0838ea09 <ff_x264_deblock_v_luma_intra_sse2+73>:	pshuflw $0x0,%xmm4,%xmm4
0x0838ea0e <ff_x264_deblock_v_luma_intra_sse2+78>:	punpcklqdq %xmm4,%xmm4
End of assembler dump.

(gdb) info all-registers
eax            0xbfe77458	-1075350440
ecx            0x10	16
edx            0x3	3
ebx            0x1	1
esp            0xbfe77398	0xbfe77398
ebp            0x89351c0	0x89351c0
esi            0xbfe77418	-1075350504
edi            0x30	48
eip            0x838e9f2	0x838e9f2 <ff_x264_deblock_v_luma_intra_sse2+50>
eflags         0x210287	[ CF PF SF IF RF ID ]
cs             0x73	115
ss             0x7b	123
ds             0x7b	123
es             0x7b	123
fs             0x0	0
gs             0x33	51
st0            -nan(0xdcb1687069778689)	(raw 0xffffdcb1687069778689)
st1            -nan(0xd9d7d69cd9d6d572)	(raw 0xffffd9d7d69cd9d6d572)
st2            -nan(0xd7d6ad67d6da7a6b)	(raw 0xffffd7d6ad67d6da7a6b)
st3            -nan(0xda786a706e7d8589)	(raw 0xffffda786a706e7d8589)
st4            -nan(0xd7d6ad6773707385)	(raw 0xffffd7d6ad6773707385)
st5            -nan(0xd9d7d69c6778717a)	(raw 0xffffd9d7d69c6778717a)
st6            -nan(0xd9d6d57273736a7d)	(raw 0xffffd9d6d57273736a7d)
st7            -nan(0xd6da7a6b7175838b)	(raw 0xffffd6da7a6b7175838b)
fctrl          0x37f	895
fstat          0x120	288
ftag           0xaaaa	43690
fiseg          0x73	115
fioff          0x80f1447	135205959
foseg          0x7b	123
fooff          0x88d58ac	143481004
fop            0x11c	284
xmm0           {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {
     0x8000000000000000, 0x8000000000000000}, v16_int8 = {
     0x7f <repeats 16 times>}, v8_int16 = {0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f,
     0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f}, v4_int32 = {0x7f7f7f7f, 0x7f7f7f7f,
     0x7f7f7f7f, 0x7f7f7f7f}, v2_int64 = {0x7f7f7f7f7f7f7f7f,
     0x7f7f7f7f7f7f7f7f}, uint128 = 0x7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f}
xmm1           {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {
     0x8000000000000000, 0x8000000000000000}, v16_int8 = {
     0x7f <repeats 16 times>}, v8_int16 = {0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f,
     0x7f7f, 0x7f7f, 0x7f7f, 0x7f7f}, v4_int32 = {0x7f7f7f7f, 0x7f7f7f7f,
     0x7f7f7f7f, 0x7f7f7f7f}, v2_int64 = {0x7f7f7f7f7f7f7f7f,
     0x7f7f7f7f7f7f7f7f}, uint128 = 0x7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f7f}
xmm2           {v4_float = {0x0, 0x0, 0x0, 0xffffe11d}, v2_double = {0x0,
     0x8000000000000000}, v16_int8 = {0xdc, 0xa8, 0x2e, 0x6, 0xa9, 0xe0, 0x81,
     0x88, 0xad, 0x30, 0xc4, 0x71, 0x5b, 0x1f, 0xf7, 0xc5}, v8_int16 = {0xa8dc,
     0x62e, 0xe0a9, 0x8881, 0x30ad, 0x71c4, 0x1f5b, 0xc5f7}, v4_int32 = {
     0x62ea8dc, 0x8881e0a9, 0x71c430ad, 0xc5f71f5b}, v2_int64 = {
     0x8881e0a9062ea8dc, 0xc5f71f5b71c430ad},
   uint128 = 0xc5f71f5b71c430ad8881e0a9062ea8dc}
xmm3           {v4_float = {0x0, 0x0, 0x2c8ea00, 0x0}, v2_double = {
     0x8000000000000000, 0x8000000000000000}, v16_int8 = {0x30, 0x1d, 0x4c,
     0x24, 0x7a, 0x96, 0x2a, 0xdf, 0x8b, 0x9b, 0xfe, 0xcf, 0x71, 0x84, 0x1c,
     0xdb}, v8_int16 = {0x1d30, 0x244c, 0x967a, 0xdf2a, 0x9b8b, 0xcffe, 0x8471,
     0xdb1c}, v4_int32 = {0x244c1d30, 0xdf2a967a, 0xcffe9b8b, 0xdb1c8471},
   v2_int64 = {0xdf2a967a244c1d30, 0xdb1c8471cffe9b8b},
   uint128 = 0xdb1c8471cffe9b8bdf2a967a244c1d30}
xmm4           {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {0x0,
     0x8000000000000000}, v16_int8 = {0xc4, 0x6c, 0xff, 0x0, 0x28, 0xda, 0xc9,
     0x9c, 0xf9, 0xdc, 0x57, 0x71, 0x1, 0x8, 0xbf, 0xeb}, v8_int16 = {0x6cc4,
     0xff, 0xda28, 0x9cc9, 0xdcf9, 0x7157, 0x801, 0xebbf}, v4_int32 = {
     0xff6cc4, 0x9cc9da28, 0x7157dcf9, 0xebbf0801}, v2_int64 = {
     0x9cc9da2800ff6cc4, 0xebbf08017157dcf9},
   uint128 = 0xebbf08017157dcf99cc9da2800ff6cc4}
xmm5           {v4_float = {0x0, 0x2, 0x0, 0x0}, v2_double = {0x13, 0x0},
   v16_int8 = {0x2b, 0x70, 0x1c, 0x7e, 0x2e, 0x5f, 0x33, 0x40, 0x74, 0x7a,
     0x4a, 0x76, 0x94, 0x7a, 0x2f, 0x35}, v8_int16 = {0x702b, 0x7e1c, 0x5f2e,
     0x4033, 0x7a74, 0x764a, 0x7a94, 0x352f}, v4_int32 = {0x7e1c702b,
     0x40335f2e, 0x764a7a74, 0x352f7a94}, v2_int64 = {0x40335f2e7e1c702b,
     0x352f7a94764a7a74}, uint128 = 0x352f7a94764a7a7440335f2e7e1c702b}
xmm6           {v4_float = {0x0, 0x0, 0x0, 0x0}, v2_double = {
     0x8000000000000000, 0x8000000000000000}, v16_int8 = {0x0, 0xcf, 0x34,
     0x17, 0x68, 0x0, 0xaf, 0xed, 0x9c, 0x84, 0x55, 0x5f, 0xc0, 0x8f, 0x7c,
     0xf1}, v8_int16 = {0xcf00, 0x1734, 0x68, 0xedaf, 0x849c, 0x5f55, 0x8fc0,
     0xf17c}, v4_int32 = {0x1734cf00, 0xedaf0068, 0x5f55849c, 0xf17c8fc0},
   v2_int64 = {0xedaf00681734cf00, 0xf17c8fc05f55849c},
   uint128 = 0xf17c8fc05f55849cedaf00681734cf00}
xmm7           {v4_float = {0x6, 0xfe240240, 0x0, 0x69b}, v2_double = {
     0x8000000000000000, 0x8000000000000000}, v16_int8 = {0x34, 0xc, 0xd3,
     0x40, 0xe0, 0xfe, 0xed, 0xcb, 0x1, 0x73, 0x3c, 0x67, 0x91, 0x60, 0xd3,
     0x44}, v8_int16 = {0xc34, 0x40d3, 0xfee0, 0xcbed, 0x7301, 0x673c, 0x6091,
     0x44d3}, v4_int32 = {0x40d30c34, 0xcbedfee0, 0x673c7301, 0x44d36091},
   v2_int64 = {0xcbedfee040d30c34, 0x44d36091673c7301},
   uint128 = 0x44d36091673c7301cbedfee040d30c34}
mxcsr          0x9fe0	[ PE DAZ IM DM ZM OM UM PM FZ ]
mm0            {uint64 = 0xdcb1687069778689, v2_int32 = {0x69778689,
     0xdcb16870}, v4_int16 = {0x8689, 0x6977, 0x6870, 0xdcb1}, v8_int8 = {0x89,
     0x86, 0x77, 0x69, 0x70, 0x68, 0xb1, 0xdc}}
mm1            {uint64 = 0xd9d7d69cd9d6d572, v2_int32 = {0xd9d6d572,
     0xd9d7d69c}, v4_int16 = {0xd572, 0xd9d6, 0xd69c, 0xd9d7}, v8_int8 = {0x72,
     0xd5, 0xd6, 0xd9, 0x9c, 0xd6, 0xd7, 0xd9}}
mm2            {uint64 = 0xd7d6ad67d6da7a6b, v2_int32 = {0xd6da7a6b,
     0xd7d6ad67}, v4_int16 = {0x7a6b, 0xd6da, 0xad67, 0xd7d6}, v8_int8 = {0x6b,
     0x7a, 0xda, 0xd6, 0x67, 0xad, 0xd6, 0xd7}}
mm3            {uint64 = 0xda786a706e7d8589, v2_int32 = {0x6e7d8589,
     0xda786a70}, v4_int16 = {0x8589, 0x6e7d, 0x6a70, 0xda78}, v8_int8 = {0x89,
     0x85, 0x7d, 0x6e, 0x70, 0x6a, 0x78, 0xda}}
mm4            {uint64 = 0xd7d6ad6773707385, v2_int32 = {0x73707385,
     0xd7d6ad67}, v4_int16 = {0x7385, 0x7370, 0xad67, 0xd7d6}, v8_int8 = {0x85,
     0x73, 0x70, 0x73, 0x67, 0xad, 0xd6, 0xd7}}
mm5            {uint64 = 0xd9d7d69c6778717a, v2_int32 = {0x6778717a,
     0xd9d7d69c}, v4_int16 = {0x717a, 0x6778, 0xd69c, 0xd9d7}, v8_int8 = {0x7a,
     0x71, 0x78, 0x67, 0x9c, 0xd6, 0xd7, 0xd9}}
mm6            {uint64 = 0xd9d6d57273736a7d, v2_int32 = {0x73736a7d,
     0xd9d6d572}, v4_int16 = {0x6a7d, 0x7373, 0xd572, 0xd9d6}, v8_int8 = {0x7d,
     0x6a, 0x73, 0x73, 0x72, 0xd5, 0xd6, 0xd9}}
mm7            {uint64 = 0xd6da7a6b7175838b, v2_int32 = {0x7175838b,
     0xd6da7a6b}, v4_int16 = {0x838b, 0x7175, 0x7a6b, 0xd6da}, v8_int8 = {0x8b,
     0x83, 0x75, 0x71, 0x6b, 0x7a, 0xda, 0xd6}}

     -Mike Melanson

More information about the ffmpeg-devel mailing list