[FFmpeg-devel] [PATCH] Yet another stab at RGB48 support

Sat May 16 08:24:13 CEST 2009

On Fri, May 15, 2009 at 04:12:17PM +0200, Michael Niedermayer wrote:
> On Fri, May 15, 2009 at 08:56:28AM +0300, Kostya wrote:
> > On Wed, May 13, 2009 at 01:59:19AM +0200, Michael Niedermayer wrote:
> > > On Sun, May 10, 2009 at 03:21:23PM +0300, Kostya wrote:
> > > > On Sat, May 09, 2009 at 07:31:20PM +0200, Diego Biurrun wrote:
> > > > > On Sat, May 09, 2009 at 02:28:18PM +0300, Kostya wrote:
> > > > > > $subj
> > > > > 
> > > > > Changelog update is missing.
> > > > > 
> > > > > $nits below
> > > > > 
> > > > [...]
> > > > reformatted, splitted some long lines.
> > > > Since this is diff against libswscale, no Changelog entry (but I
> > > > remember about it).
> > >
> > > [...]
> > > > +static int rgb48togray16_template(SwsContext *c, uint8_t *src[],
> > > > +                                  int srcStride[], int srcSliceY,
> > > > +                                  int srcSliceH, uint8_t *dst[],
> > > > +                                  int dstStride[], const int srcLE,
> > > > +                                  const int dstLE) {
> > > > +    int length      = c->srcW,
> > > > +        y           = srcSliceY,
> > > > +        height      = srcSliceH,
> > > > +        stride_s2   = srcStride[0] / 2,
> > > > +        stride_d2   = dstStride[0] / 2,
> > > > +        i;
> > > > +    uint16_t *s = (uint16_t *) src[0], *d = (uint16_t *) dst[0] + stride_d2 * y;
> > > > +
> > > > +    for (i = 0; i < height; i++) {
> > > > +             if (!srcLE && !dstLE) rgb48togray16row_template(s, d, length, 0, 0);
> > > > +        else if (!srcLE &&  dstLE) rgb48togray16row_template(s, d, length, 0, 1);
> > > > +        else if ( srcLE && !dstLE) rgb48togray16row_template(s, d, length, 1, 0);
> > > > +        else                       rgb48togray16row_template(s, d, length, 1, 1);
> > > > +        s += stride_s2;
> > > > +        d += stride_d2;
> > > > +    }
> > > > +    return height;
> > > > +}
> > > > +
> > > 
> > > > +#define FUNC_RGB48TOGRAY16(name, sLE, dLE) \
> > > > +    static int name(SwsContext *c, uint8_t *s[], int sS[], int sSY, int sSH, \
> > > > +                    uint8_t *d[], int dS[]) { \
> > > > +        return rgb48togray16_template(c, s, sS, sSY, sSH, d, dS, sLE, dLE); \
> > > > +    }
> > > > +
> > > > +FUNC_RGB48TOGRAY16( rgb48letogray16le, 1, 1)
> > > > +FUNC_RGB48TOGRAY16( rgb48betogray16le, 0, 1)
> > > > +FUNC_RGB48TOGRAY16( rgb48letogray16be, 1, 0)
> > > > +FUNC_RGB48TOGRAY16( rgb48betogray16be, 0, 0)
> > > 
> > > bloat, a single function does the trick too in this case
> > 
> > huh? What do you mean "single function"? 
> 
> rgb48togray16_template()

merged

> > That's the only place where low bits of 16-bit variable are used.
> > 
> > > [...]
> > > > +static inline void rgb1516to48(const uint8_t *src, uint8_t *d, long src_size,
> > > > +                               const int be, const int g6, const int bgr)
> > > > +{
> > > 
> > > > +    const uint16_t *s = (const uint16_t*) src, *end = s + src_size/2;
> > > 
> > > unreadable
> > 
> > split 
> 
> better

in some other places too 

> > 
> > > [...]
> > > > +static void rgb48to48(const uint8_t *src, uint8_t *dst, long src_size)
> > > > +{
> > > > +    long i = 23 - src_size;
> > > > +    uint8_t *d = dst - i;
> > > > +    const uint8_t *s = src - i;
> > > > +#if HAVE_MMX
> > > > +    __asm__ volatile(
> > > > +    "test           %0,   %0            \n\t"
> > > > +    "jns            2f                  \n\t"
> > > > +    PREFETCH"     (%1,    %0)           \n\t"
> > > > +    ASMALIGN(4)
> > > > +    "1:                                 \n\t"
> > > > +    PREFETCH"    32(%1,   %0)           \n\t"
> > > > +    "movq          (%1,   %0), %%mm0    \n\t"
> > > > +    "movq         8(%1,   %0), %%mm1    \n\t"
> > > > +    "movq        16(%1,   %0), %%mm2    \n\t"
> > > > +    "movq        %%mm0, %%mm3           \n\t"
> > > > +    "movq        %%mm1, %%mm4           \n\t"
> > > > +    "movq        %%mm2, %%mm5           \n\t"
> > > > +    "psllw          $8, %%mm0           \n\t"
> > > > +    "psllw          $8, %%mm1           \n\t"
> > > > +    "psllw          $8, %%mm2           \n\t"
> > > > +    "psrlw          $8, %%mm3           \n\t"
> > > > +    "psrlw          $8, %%mm4           \n\t"
> > > > +    "psrlw          $8, %%mm5           \n\t"
> > > > +    "por         %%mm3, %%mm0           \n\t"
> > > > +    "por         %%mm4, %%mm1           \n\t"
> > > > +    "por         %%mm5, %%mm2           \n\t"
> > > > +    "movq        %%mm0,   (%2,  %0)     \n\t"
> > > > +    "movq        %%mm1,  8(%2,  %0)     \n\t"
> > > > +    "movq        %%mm2, 16(%2,  %0)     \n\t"
> > > > +    "add           $24,    %0           \n\t"
> > > > +    "js 1b                              \n\t"
> > > > +    "2:                                 \n\t"
> > > > +    SFENCE"                             \n\t"
> > > > +    EMMS"                               \n\t"
> > > > +    : "+&r"(i)
> > > > +    : "r" (s), "r" (d)
> > > > +    : "memory");
> > > > +#endif
> > > > +    for (; i < 21; i += 4) {
> > > > +        unsigned int x = *(uint32_t*)&s[i];
> > > > +        *(uint32_t*)&d[i] = ((x>>8) & 0x00FF00FF) + ((x<<8) & 0xFF00FF00);
> > > > +    }
> > > > +    if (i < 23) {
> > > > +        d[i]   = s[i+1];
> > > > +        d[i+1] = s[i];
> > > > +    }
> > > 
> > > is this some duplicate of a gray le <-> be function?
> > 
> > Haven't found that byteswapping function but should be, yes. 
> 
> planarCopy()

Hacked it a bit to handle conversion instead of these functions. 

> > 
> > > > +}
> > > > +
> > > > +static void rgb48leto24(const uint8_t *s, uint8_t *dst, long src_size)
> > > > +{
> > > > +    uint8_t *d = dst;
> > > > +    const uint8_t *end = s + src_size;
> > > > +#if HAVE_MMX
> > > > +    __asm__ volatile(
> > > > +    PREFETCH"     (%1)          \n\t"
> > > > +    "jmp 2f                     \n\t"
> > > > +    ASMALIGN(4)
> > > > +    "1:                         \n\t"
> > > > +    PREFETCH"   32(%1)          \n\t"
> > > > +    "movq         (%1), %%mm0   \n\t"
> > > > +    "movq        8(%1), %%mm1   \n\t"
> > > > +    "movq       16(%1), %%mm2   \n\t"
> > > > +    "movq       24(%1), %%mm3   \n\t"
> > > > +    "movq       32(%1), %%mm4   \n\t"
> > > > +    "movq       40(%1), %%mm5   \n\t"
> > > > +    "psrlw          $8, %%mm0   \n\t"
> > > > +    "psrlw          $8, %%mm1   \n\t"
> > > > +    "psrlw          $8, %%mm2   \n\t"
> > > > +    "psrlw          $8, %%mm3   \n\t"
> > > > +    "psrlw          $8, %%mm4   \n\t"
> > > > +    "psrlw          $8, %%mm5   \n\t"
> > > > +    "packuswb    %%mm1, %%mm0   \n\t"
> > > > +    "packuswb    %%mm3, %%mm2   \n\t"
> > > > +    "packuswb    %%mm5, %%mm4   \n\t"
> > > > +    "movq        %%mm0,   (%0)  \n\t"
> > > > +    "movq        %%mm2,  8(%0)  \n\t"
> > > > +    "movq        %%mm4, 16(%0)  \n\t"
> > > > +    "add           $24,    %0   \n\t"
> > > > +    "add           $48,    %1   \n\t"
> > > > +    "2:                         \n\t"
> > > > +    "cmp            %1,    %2   \n\t"
> > > > +    "ja 1b                      \n\t"
> > > > +    SFENCE"                     \n\t"
> > > > +    EMMS"                       \n\t"
> > > > +    : "+&r"(d), "+&r"(s)
> > > > +    : "r" (end-47)
> > > > +    : "memory");
> > > > +#endif
> > > > +    for (; s < end; s += 2, d++)
> > > > +        *d = *(s+1);
> > > > +}
> > > 
> > > same?
> > 
> > No. That was byte-swapping, this is byte-shaving. 
> 
> same, in the sense that this too is code duplication

gone

> ill read/review the rest once you removed trivial code duplication
> 
> [...]
> -- 
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 48bpp.patch
Type: text/x-diff
Size: 18408 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20090516/c4ca77d9/attachment.patch>