[Libav-user] a little performance/optimisation headbreaker :)
"René J.V. Bertin"
rjvbertin at gmail.com
Fri Feb 15 10:37:06 CET 2013
'Morning!
I guess there are a number of people on here who are experts at writing optimised code exploiting every bit of a processor's instruction set. The code I recently isolated from the Perian project also attempts this, and I just came across something that flabbergasted me. Perian is a Mac-only project, so it can make a number of safe assumptions about the CPUs it will run on. Not so the split-off I created, which targets Mac + Win32 and is likely to run on (lowly) AMD CPUs as well as high-end Intel ones.
So I looked at the actual performance of a few of the SSE-optimised functions, knowing from experience that SIMD also introduces overhead and that a good compiler is likely to generate better assembly than a non-expert tinkerer coding by hand.
So I have this little benchmark comparing two routines that convert yuv420 to yuv422. It runs each routine on data that is allocated only once (and thus contains whatever bits happen to be in memory), counting the number of calls completed during a 5-second period, using tested, high-resolution, low-latency timing functions:
AVPicture pict;
uint8_t *baseAddr = NULL;
int width = 720, height = 576, outRB = 1440;
double t;
unsigned long N = 0;

memset( &pict.data, 0, sizeof(pict.data) );
pict.linesize[0] = 752, pict.linesize[1] = 376;
init_HRTime();
if( (pict.data[0] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) ))
    && (pict.data[1] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) ))
    && (pict.data[2] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) ))
    && (baseAddr = (uint8_t*) malloc( width * height * 4 * 2 * sizeof(uint8_t) ))
){
    double startT = HRTime_Time();
    do{
        Y420toY422_sse2( &pict, baseAddr, outRB, width, height, &N );
    } while( (t = HRTime_Time() - startT) < 5 );
    fprintf( stderr, "%lu Y420toY422_sse2(outRB=%d,width=%d,height=%d) conversions in %gs; %gHz\n",
             N, outRB, width, height, t, N / t );
    startT = HRTime_Time(); N = 0;
    do{
        Y420toY422_x86_scalar( &pict, baseAddr, outRB, width, height, &N );
        // NB: no extra N += 1 here; both conversion routines already increment *N themselves
    } while( (t = HRTime_Time() - startT) < 5 );
    fprintf( stderr, "%lu Y420toY422_x86_scalar(outRB=%d,width=%d,height=%d) conversions in %gs; %gHz\n",
             N, outRB, width, height, t, N / t );
}
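(init_HRTime() and HRTime_Time() are my own wrappers, not shown here. A minimal stand-in, assuming QueryPerformanceCounter on Win32 and POSIX clock_gettime elsewhere, could look like the sketch below; the actual implementation I use differs, and on OS X one would reach for mach_absolute_time instead:)

#ifdef _WIN32
#   include <windows.h>
static double HRTime_scale;
static void init_HRTime(void)
{
    LARGE_INTEGER f;
    QueryPerformanceFrequency(&f);
    HRTime_scale = 1.0 / (double) f.QuadPart;
}
static double HRTime_Time(void)
{
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return (double) t.QuadPart * HRTime_scale;
}
#else
#   include <time.h>
static void init_HRTime(void) {}
static double HRTime_Time(void)
{
    /* CLOCK_MONOTONIC needs -lrt with older glibc; not available on OS X of that era */
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}
#endif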
On my 2.7GHz dual-core i7 MBP, I get about 10000Hz for the SSE version, and roughly half that for the generic, scalar function, using gcc-4.2 as well as MSVC 2010 Express running under WinXP in VirtualBox. The factor-2 speed gain for the SSE code also holds on 2 AMD machines (a mid-range laptop and a C62 netbook).
Then I installed a new mingw32 cross-compiler based on gcc 4.7 and, for the heck of it, compiled my benchmark with it ... and found the same factor of 2 ... but in favour of the scalar code, on my i7. It's more like a factor of 2.5, actually. Same thing after installing the native OS X gcc 4.7 build.
The question: is gcc-4.7 clever enough to optimise the 2nd benchmark loop better than the 1st, or does it really generate that much better assembly from the scalar function? NB: -fno-inline-functions has no effect here.
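One way to check, I suppose, would be to ask the vectoriser what it does with the scalar loop and to compare the assembly of both functions, e.g.

    gcc-4.7 -O3 -ftree-vectorizer-verbose=2 -S benchmark.c

(with benchmark.c standing for whichever file holds the two functions; the verbose flag makes gcc 4.7 report its vectorisation decisions on stderr).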
Not that it matters much, as even on the C62 netbook the SSE function runs at almost 700Hz.
The functions:
// Handles the last row for Y420 videos with an odd number of luma rows
// FIXME: odd number of luma columns is not handled and they will be lost
static void Y420toY422_lastrow(uint8_t *o, uint8_t *yc, uint8_t *uc, uint8_t *vc, int halfWidth)
{
    int x;
    for (x = 0; x < halfWidth; x++) {
        int x4 = x*4, x2 = x*2;
        o[x4]   = uc[x];
        o[++x4] = yc[x2];
        o[++x4] = vc[x];
        o[++x4] = yc[++x2];
    }
}
#define HandleLastRow(o, yc, uc, vc, halfWidth, height) if (unlikely(height & 1)) Y420toY422_lastrow(o, yc, uc, vc, halfWidth)
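(unlikely() here comes from the libav headers in my tree; for a standalone compile of these snippets, the usual gcc idiom would do:)

#ifndef unlikely
#   ifdef __GNUC__
#       define unlikely(x) __builtin_expect(!!(x), 0)
#   else
#       define unlikely(x) (x)
#   endif
#endif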
#include <emmintrin.h>
#ifdef _MSC_VER
#   define FASTCALL __fastcall
#elif defined(__i386__) && !defined(__llvm__) && !defined(_MSC_VER)
#   define FASTCALL __attribute__((fastcall))
#else
#   define FASTCALL
#endif
static FASTCALL void Y420toY422_sse2(AVPicture *picture, uint8_t *o, int outRB, int width, int height, unsigned long *N)
{
    uint8_t *yc = picture->data[0], *uc = picture->data[1], *vc = picture->data[2];
    int rY = picture->linesize[0], rUV = picture->linesize[1];
    int y, x, halfwidth = width >> 1, halfheight = height >> 1;
    int vWidth = width >> 5;

    for (y = 0; y < halfheight; y++) {
        uint8_t *o2 = o + outRB, *yc2 = yc + rY;
        __m128i *ov = (__m128i*)o, *ov2 = (__m128i*)o2, *yv = (__m128i*)yc, *yv2 = (__m128i*)yc2;
        __m128i *uv = (__m128i*)uc, *vv = (__m128i*)vc;
#if defined(__i386__) && !defined(_MSC_VER) //FIXMERJVB
        int vWidth_ = vWidth;
        asm volatile(
            "\n0: \n\t"
            "movdqa (%2), %%xmm0 \n\t"
            "movdqa 16(%2), %%xmm2 \n\t"
            "movdqa (%3), %%xmm1 \n\t"
            "movdqa 16(%3), %%xmm3 \n\t"
            "movdqu (%4), %%xmm4 \n\t"
            "movdqu (%5), %%xmm5 \n\t"
            "addl $32, %2 \n\t"
            "addl $32, %3 \n\t"
            "addl $16, %4 \n\t"
            "addl $16, %5 \n\t"
            "movdqa %%xmm4, %%xmm6 \n\t"
            "punpcklbw %%xmm5, %%xmm4 \n\t" /*chroma_l*/
            "punpckhbw %%xmm5, %%xmm6 \n\t" /*chroma_h*/
            "movdqa %%xmm4, %%xmm5 \n\t"
            "punpcklbw %%xmm0, %%xmm5 \n\t"
            "movntdq %%xmm5, (%0) \n\t" /*ov[x4]*/
            "movdqa %%xmm4, %%xmm5 \n\t"
            "punpckhbw %%xmm0, %%xmm5 \n\t"
            "movntdq %%xmm5, 16(%0) \n\t" /*ov[x4+1]*/
            "movdqa %%xmm6, %%xmm5 \n\t"
            "punpcklbw %%xmm2, %%xmm5 \n\t"
            "movntdq %%xmm5, 32(%0) \n\t" /*ov[x4+2]*/
            "movdqa %%xmm6, %%xmm5 \n\t"
            "punpckhbw %%xmm2, %%xmm5 \n\t"
            "movntdq %%xmm5, 48(%0) \n\t" /*ov[x4+3]*/
            "addl $64, %0 \n\t"
            "movdqa %%xmm4, %%xmm5 \n\t"
            "punpcklbw %%xmm1, %%xmm5 \n\t"
            "movntdq %%xmm5, (%1) \n\t" /*ov2[x4]*/
            "punpckhbw %%xmm1, %%xmm4 \n\t"
            "movntdq %%xmm4, 16(%1) \n\t" /*ov2[x4+1]*/
            "movdqa %%xmm6, %%xmm5 \n\t"
            "punpcklbw %%xmm3, %%xmm5 \n\t"
            "movntdq %%xmm5, 32(%1) \n\t" /*ov2[x4+2]*/
            "punpckhbw %%xmm3, %%xmm6 \n\t"
            "movntdq %%xmm6, 48(%1) \n\t" /*ov2[x4+3]*/
            "addl $64, %1 \n\t"
            "decl %6 \n\t"
            "jnz 0b \n\t"
            : "+r" (ov), "+r" (ov2), "+r" (yv),
              "+r" (yv2), "+r" (uv), "+r" (vv), "+m" (vWidth_)
            :
            : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
        );
#else
        for (x = 0; x < vWidth; x++) {
            int x2 = x*2, x4 = x*4;
            __m128i tmp_y  = yv[x2],  tmp_y3 = yv[x2+1],
                    tmp_y2 = yv2[x2], tmp_y4 = yv2[x2+1],
                    tmp_u = _mm_loadu_si128(&uv[x]), tmp_v = _mm_loadu_si128(&vv[x]),
                    chroma_l = _mm_unpacklo_epi8(tmp_u, tmp_v),
                    chroma_h = _mm_unpackhi_epi8(tmp_u, tmp_v);
            _mm_stream_si128(&ov[x4],   _mm_unpacklo_epi8(chroma_l, tmp_y));
            _mm_stream_si128(&ov[x4+1], _mm_unpackhi_epi8(chroma_l, tmp_y));
            _mm_stream_si128(&ov[x4+2], _mm_unpacklo_epi8(chroma_h, tmp_y3));
            _mm_stream_si128(&ov[x4+3], _mm_unpackhi_epi8(chroma_h, tmp_y3));
            _mm_stream_si128(&ov2[x4],  _mm_unpacklo_epi8(chroma_l, tmp_y2));
            _mm_stream_si128(&ov2[x4+1],_mm_unpackhi_epi8(chroma_l, tmp_y2));
            _mm_stream_si128(&ov2[x4+2],_mm_unpacklo_epi8(chroma_h, tmp_y4));
            _mm_stream_si128(&ov2[x4+3],_mm_unpackhi_epi8(chroma_h, tmp_y4));
        }
#endif
        // scalar cleanup for the chroma columns not covered by the 32-pixel-wide vector loop
        for (x = vWidth * 16; x < halfwidth; x++) {
            int x4 = x*4, x2 = x*2;
            o2[x4] = o[x4] = uc[x];
            x4++;
            o [x4] = yc[x2], o2[x4] = yc2[x2];
            x4++;
            o2[x4] = o[x4] = vc[x];
            x4++, x2++;
            o [x4] = yc[x2], o2[x4] = yc2[x2];
        }
        o  += outRB*2;
        yc += rY*2;
        uc += rUV;
        vc += rUV;
    }
    HandleLastRow(o, yc, uc, vc, halfwidth, height);
    *N += 1;
}
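Incidentally, two caveats with this SSE2 path: the movdqa loads and _mm_stream_si128 stores require the luma rows and the output buffer to be 16-byte aligned (OS X's malloc happens to guarantee that, Win32's does not necessarily), and the non-temporal movntdq stores would normally want an sfence before the output is consumed. A sketch of what the benchmark could use, assuming the _mm_malloc helper that ships with the gcc and MSVC intrinsics headers:

#include <emmintrin.h>   /* SSE2 intrinsics; gcc's xmmintrin.h also brings in _mm_malloc */
#ifdef _MSC_VER
#   include <malloc.h>   /* MSVC declares _mm_malloc/_mm_free here */
#endif
#include <stddef.h>
#include <stdint.h>

/* 16-byte-aligned replacement for the benchmark's plain malloc() calls */
static uint8_t *alloc16(size_t n)
{
    return (uint8_t*) _mm_malloc(n, 16);   /* release with _mm_free() */
}

/* to be called after Y420toY422_sse2(), before anything reads the output back:
   drains the pending non-temporal (movntdq) stores */
static void drain_nt_stores(void)
{
    _mm_sfence();
}

The scalar version, for comparison: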
static FASTCALL void Y420toY422_x86_scalar(AVPicture *picture, uint8_t *o, int outRB, int width, int height, unsigned long *N)
{
    uint8_t *yc = picture->data[0], *u = picture->data[1], *v = picture->data[2];
    int rY = picture->linesize[0], rUV = picture->linesize[1];
    int halfheight = height >> 1, halfwidth = width >> 1;
    int y, x;

    for (y = 0; y < halfheight; y++) {
        uint8_t *o2 = o + outRB, *yc2 = yc + rY;
        for (x = 0; x < halfwidth; x++) {
            int x4 = x*4, x2 = x*2;
            o2[x4] = o[x4] = u[x];
            o [++x4] = yc[x2];
            o2[x4] = yc2[x2];
            x4++;
            o2[x4] = o[x4] = v[x];
            o [++x4] = yc[++x2];
            o2[x4] = yc2[x2];
        }
        o  += outRB*2;
        yc += rY*2;
        u  += rUV;
        v  += rUV;
    }
    HandleLastRow(o, yc, u, v, halfwidth, height);
    *N += 1;
}
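For reference, both routines emit the packed 4:2:2 bytes in u, y, v, y order ('2vuy'/UYVY), duplicating each chroma sample over two luma rows, as the indexing above implies:

/* per chroma column x, one 2x2 luma block becomes 2 x 4 output bytes:
   row 0: u[x]  yc [2x]  v[x]  yc [2x+1]
   row 1: u[x]  yc2[2x]  v[x]  yc2[2x+1]   (same chroma, next luma row) */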