[Libav-user] a little performance/optimisation headbreaker :)
"René J.V. Bertin"
rjvbertin at gmail.com
Fri Feb 15 10:37:06 CET 2013
'Morning!
I guess there are a number of people on here who are experts at writing optimised code exploiting every bit of a processor's instruction set. The code I recently isolated from the Perian project also attempts this, and I just came across something that flabbergasted me. Perian is a Mac-only project, so it can make a number of safe assumptions about the CPUs it will run on. Not so the split-off I created, which targets Mac + Win32 and is likely to run on (lowly) AMD CPUs as well as high-end Intel ones.
So I looked at the actual performance of a few of the SSE-optimised functions, knowing from experience that SIMD also introduces overhead and that a good compiler is likely to generate better assembly than a non-expert tinkerer coding by hand.
So I have this little benchmark comparing two routines that convert yuv420 to yuv422. It runs each routine on data that is allocated only once (and thus contains whatever bits happen to be in memory), counting the number of calls completed during a 5-second period, using tested, high-resolution, low-latency timing functions:
AVPicture pict;
uint8_t *baseAddr = NULL;
int width = 720, height = 576, outRB = 1440;
double t;
unsigned long N = 0;

memset( &pict.data, 0, sizeof(pict.data) );
pict.linesize[0] = 752, pict.linesize[1] = 376;
init_HRTime();
if( (pict.data[0] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) ))
    && (pict.data[1] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) ))
    && (pict.data[2] = (uint8_t*) malloc( width * height * 2 * sizeof(uint8_t) ))
    && (baseAddr = (uint8_t*) malloc( width * height * 4 * 2 * sizeof(uint8_t) ))
){
    double startT = HRTime_Time();
    do{
        Y420toY422_sse2( &pict, baseAddr, outRB, width, height, &N );
    } while( (t = HRTime_Time() - startT) < 5 );
    fprintf( stderr, "%lu Y420toY422_sse2(outRB=%d,width=%d,height=%d) conversions in %gs; %gHz\n",
             N, outRB, width, height, t, N / t );
    startT = HRTime_Time(); N = 0;
    do{
        Y420toY422_x86_scalar( &pict, baseAddr, outRB, width, height, &N );
        // NB: no extra N += 1 here; both conversion routines already increment *N themselves
    } while( (t = HRTime_Time() - startT) < 5 );
    fprintf( stderr, "%lu Y420toY422_x86_scalar(outRB=%d,width=%d,height=%d) conversions in %gs; %gHz\n",
             N, outRB, width, height, t, N / t );
}
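(init_HRTime() and HRTime_Time() are my own wrappers, not shown here. A minimal stand-in, assuming QueryPerformanceCounter on Win32 and POSIX clock_gettime elsewhere, could look like the sketch below; the actual implementation I use differs, and on OS X one would reach for mach_absolute_time instead:)

#ifdef _WIN32
#   include <windows.h>
static double HRTime_scale;
static void init_HRTime(void)
{
    LARGE_INTEGER f;
    QueryPerformanceFrequency(&f);
    HRTime_scale = 1.0 / (double) f.QuadPart;
}
static double HRTime_Time(void)
{
    LARGE_INTEGER t;
    QueryPerformanceCounter(&t);
    return (double) t.QuadPart * HRTime_scale;
}
#else
#   include <time.h>
static void init_HRTime(void) {}
static double HRTime_Time(void)
{
    /* CLOCK_MONOTONIC needs -lrt with older glibc; not available on OS X of that era */
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec + ts.tv_nsec * 1e-9;
}
#endif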
On my 2.7GHz dual-core i7 MBP, I get about 10000Hz for the SSE version, and roughly half that for the generic, scalar function, using gcc-4.2 as well as MSVC 2010 Express running under WinXP in VirtualBox. The factor-2 speed gain for the SSE code also holds on 2 AMD machines (a mid-range laptop and a C62 netbook).
Then I installed a new mingw32 cross-compiler based on gcc 4.7 and, for the heck of it, compiled my benchmark with it ... and found the same factor of 2 ... but in favour of the scalar code, on my i7. It's more like a factor of 2.5, actually. Same thing after installing the native OS X gcc 4.7 build.
The question: is gcc-4.7 clever enough to optimise the 2nd benchmark loop better than the 1st, or does it really generate that much better assembly from the scalar function? NB: -fno-inline-functions has no effect here.
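One way to check, I suppose, would be to ask the vectoriser what it does with the scalar loop and to compare the assembly of both functions, e.g.

    gcc-4.7 -O3 -ftree-vectorizer-verbose=2 -S benchmark.c

(with benchmark.c standing for whichever file holds the two functions; the verbose flag makes gcc 4.7 report its vectorisation decisions on stderr).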
Not that it matters much, as even on the C62 netbook the SSE function runs at almost 700Hz.
The functions:
// Handles the last row for Y420 videos with an odd number of luma rows
// FIXME: odd number of luma columns is not handled and they will be lost
static void Y420toY422_lastrow(uint8_t *o, uint8_t *yc, uint8_t *uc, uint8_t *vc, int halfWidth)
{
    int x;
    for (x = 0; x < halfWidth; x++) {
        int x4 = x*4, x2 = x*2;
        o[x4]   = uc[x];
        o[++x4] = yc[x2];
        o[++x4] = vc[x];
        o[++x4] = yc[++x2];
    }
}
#define HandleLastRow(o, yc, uc, vc, halfWidth, height) if (unlikely(height & 1)) Y420toY422_lastrow(o, yc, uc, vc, halfWidth)
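(unlikely() here comes from the libav headers in my tree; for a standalone compile of these snippets, the usual gcc idiom would do:)

#ifndef unlikely
#   ifdef __GNUC__
#       define unlikely(x) __builtin_expect(!!(x), 0)
#   else
#       define unlikely(x) (x)
#   endif
#endif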
#include <emmintrin.h>
#ifdef _MSC_VER
#   define FASTCALL __fastcall
#elif defined(__i386__) && !defined(__llvm__) && !defined(_MSC_VER)
#   define FASTCALL __attribute__((fastcall))
#else
#   define FASTCALL
#endif
static FASTCALL void Y420toY422_sse2(AVPicture *picture, uint8_t *o, int outRB, int width, int height, unsigned long *N)
{
    uint8_t *yc = picture->data[0], *uc = picture->data[1], *vc = picture->data[2];
    int rY = picture->linesize[0], rUV = picture->linesize[1];
    int y, x, halfwidth = width >> 1, halfheight = height >> 1;
    int vWidth = width >> 5;

    for (y = 0; y < halfheight; y++) {
        uint8_t *o2 = o + outRB, *yc2 = yc + rY;
        __m128i *ov = (__m128i*)o, *ov2 = (__m128i*)o2, *yv = (__m128i*)yc, *yv2 = (__m128i*)yc2;
        __m128i *uv = (__m128i*)uc, *vv = (__m128i*)vc;
#if defined(__i386__) && !defined(_MSC_VER) //FIXMERJVB
        int vWidth_ = vWidth;
        asm volatile(
            "\n0: \n\t"
            "movdqa (%2), %%xmm0 \n\t"
            "movdqa 16(%2), %%xmm2 \n\t"
            "movdqa (%3), %%xmm1 \n\t"
            "movdqa 16(%3), %%xmm3 \n\t"
            "movdqu (%4), %%xmm4 \n\t"
            "movdqu (%5), %%xmm5 \n\t"
            "addl $32, %2 \n\t"
            "addl $32, %3 \n\t"
            "addl $16, %4 \n\t"
            "addl $16, %5 \n\t"
            "movdqa %%xmm4, %%xmm6 \n\t"
            "punpcklbw %%xmm5, %%xmm4 \n\t" /*chroma_l*/
            "punpckhbw %%xmm5, %%xmm6 \n\t" /*chroma_h*/
            "movdqa %%xmm4, %%xmm5 \n\t"
            "punpcklbw %%xmm0, %%xmm5 \n\t"
            "movntdq %%xmm5, (%0) \n\t" /*ov[x4]*/
            "movdqa %%xmm4, %%xmm5 \n\t"
            "punpckhbw %%xmm0, %%xmm5 \n\t"
            "movntdq %%xmm5, 16(%0) \n\t" /*ov[x4+1]*/
            "movdqa %%xmm6, %%xmm5 \n\t"
            "punpcklbw %%xmm2, %%xmm5 \n\t"
            "movntdq %%xmm5, 32(%0) \n\t" /*ov[x4+2]*/
            "movdqa %%xmm6, %%xmm5 \n\t"
            "punpckhbw %%xmm2, %%xmm5 \n\t"
            "movntdq %%xmm5, 48(%0) \n\t" /*ov[x4+3]*/
            "addl $64, %0 \n\t"
            "movdqa %%xmm4, %%xmm5 \n\t"
            "punpcklbw %%xmm1, %%xmm5 \n\t"
            "movntdq %%xmm5, (%1) \n\t" /*ov2[x4]*/
            "punpckhbw %%xmm1, %%xmm4 \n\t"
            "movntdq %%xmm4, 16(%1) \n\t" /*ov2[x4+1]*/
            "movdqa %%xmm6, %%xmm5 \n\t"
            "punpcklbw %%xmm3, %%xmm5 \n\t"
            "movntdq %%xmm5, 32(%1) \n\t" /*ov2[x4+2]*/
            "punpckhbw %%xmm3, %%xmm6 \n\t"
            "movntdq %%xmm6, 48(%1) \n\t" /*ov2[x4+3]*/
            "addl $64, %1 \n\t"
            "decl %6 \n\t"
            "jnz 0b \n\t"
            : "+r" (ov), "+r" (ov2), "+r" (yv),
              "+r" (yv2), "+r" (uv), "+r" (vv), "+m" (vWidth_)
            :
            : "memory", "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6"
        );
#else
        for (x = 0; x < vWidth; x++) {
            int x2 = x*2, x4 = x*4;
            __m128i tmp_y  = yv[x2],  tmp_y3 = yv[x2+1],
                    tmp_y2 = yv2[x2], tmp_y4 = yv2[x2+1],
                    tmp_u = _mm_loadu_si128(&uv[x]), tmp_v = _mm_loadu_si128(&vv[x]),
                    chroma_l = _mm_unpacklo_epi8(tmp_u, tmp_v),
                    chroma_h = _mm_unpackhi_epi8(tmp_u, tmp_v);
            _mm_stream_si128(&ov[x4],   _mm_unpacklo_epi8(chroma_l, tmp_y));
            _mm_stream_si128(&ov[x4+1], _mm_unpackhi_epi8(chroma_l, tmp_y));
            _mm_stream_si128(&ov[x4+2], _mm_unpacklo_epi8(chroma_h, tmp_y3));
            _mm_stream_si128(&ov[x4+3], _mm_unpackhi_epi8(chroma_h, tmp_y3));
            _mm_stream_si128(&ov2[x4],  _mm_unpacklo_epi8(chroma_l, tmp_y2));
            _mm_stream_si128(&ov2[x4+1],_mm_unpackhi_epi8(chroma_l, tmp_y2));
            _mm_stream_si128(&ov2[x4+2],_mm_unpacklo_epi8(chroma_h, tmp_y4));
            _mm_stream_si128(&ov2[x4+3],_mm_unpackhi_epi8(chroma_h, tmp_y4));
        }
#endif
        // scalar cleanup for the chroma columns not covered by the 32-pixel-wide vector loop
        for (x = vWidth * 16; x < halfwidth; x++) {
            int x4 = x*4, x2 = x*2;
            o2[x4] = o[x4] = uc[x];
            x4++;
            o [x4] = yc[x2], o2[x4] = yc2[x2];
            x4++;
            o2[x4] = o[x4] = vc[x];
            x4++, x2++;
            o [x4] = yc[x2], o2[x4] = yc2[x2];
        }
        o  += outRB*2;
        yc += rY*2;
        uc += rUV;
        vc += rUV;
    }
    HandleLastRow(o, yc, uc, vc, halfwidth, height);
    *N += 1;
}
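Incidentally, two caveats with this SSE2 path: the movdqa loads and _mm_stream_si128 stores require the luma rows and the output buffer to be 16-byte aligned (OS X's malloc happens to guarantee that, Win32's does not necessarily), and the non-temporal movntdq stores would normally want an sfence before the output is consumed. A sketch of what the benchmark could use, assuming the _mm_malloc helper that ships with the gcc and MSVC intrinsics headers:

#include <emmintrin.h>   /* SSE2 intrinsics; gcc's xmmintrin.h also brings in _mm_malloc */
#ifdef _MSC_VER
#   include <malloc.h>   /* MSVC declares _mm_malloc/_mm_free here */
#endif
#include <stddef.h>
#include <stdint.h>

/* 16-byte-aligned replacement for the benchmark's plain malloc() calls */
static uint8_t *alloc16(size_t n)
{
    return (uint8_t*) _mm_malloc(n, 16);   /* release with _mm_free() */
}

/* to be called after Y420toY422_sse2(), before anything reads the output back:
   drains the pending non-temporal (movntdq) stores */
static void drain_nt_stores(void)
{
    _mm_sfence();
}

The scalar version, for comparison: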
static FASTCALL void Y420toY422_x86_scalar(AVPicture *picture, uint8_t *o, int outRB, int width, int height, unsigned long *N)
{
    uint8_t *yc = picture->data[0], *u = picture->data[1], *v = picture->data[2];
    int rY = picture->linesize[0], rUV = picture->linesize[1];
    int halfheight = height >> 1, halfwidth = width >> 1;
    int y, x;

    for (y = 0; y < halfheight; y++) {
        uint8_t *o2 = o + outRB, *yc2 = yc + rY;
        for (x = 0; x < halfwidth; x++) {
            int x4 = x*4, x2 = x*2;
            o2[x4] = o[x4] = u[x];
            o [++x4] = yc[x2];
            o2[x4] = yc2[x2];
            x4++;
            o2[x4] = o[x4] = v[x];
            o [++x4] = yc[++x2];
            o2[x4] = yc2[x2];
        }
        o  += outRB*2;
        yc += rY*2;
        u  += rUV;
        v  += rUV;
    }
    HandleLastRow(o, yc, u, v, halfwidth, height);
    *N += 1;
}
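For reference, both routines emit the packed 4:2:2 bytes in u, y, v, y order ('2vuy'/UYVY), duplicating each chroma sample over two luma rows, as the indexing above implies:

/* per chroma column x, one 2x2 luma block becomes 2 x 4 output bytes:
   row 0: u[x]  yc [2x]  v[x]  yc [2x+1]
   row 1: u[x]  yc2[2x]  v[x]  yc2[2x+1]   (same chroma, next luma row) */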