[FFmpeg-devel] RoQ video encoder (take 3)

Sun Jun 10 14:10:52 CEST 2007

Hi

On Fri, Jun 08, 2007 at 10:34:09PM +0200, Vitor wrote:
> Hi,
> 
> I'm sending the new version of the RoQ patch. Most of the changes were 
> made by Eric Lasota, I made mostly minor changes. It's now using 
> properly the YUV444 colorspace and encode properly the MOT blocks.
> 
> Comments welcome.
> 
> -Vitor

[...]
> +static void unpack_roq_qcell(uint8_t cb2[][3][4], roq_qcell *qcell, uint8_t u[3][16])
> +{
> +    uint8_t base[4][3][4];
> +    int n,cp,i;

> +    int offsets[4] = {0, 2, 8, 10};

static const

[...]
> +static void enlarge_roq_mb4(uint8_t base[3][16], uint8_t u[3][64])
> +{
> +    int x,y,cp;
> +
> +    for(y=0;y<8;y++)
> +        for(x=0;x<8;x++)
> +            for(cp=0;cp<3;cp++)
> +                u[cp][y*8+x] = base[cp][(y/2)*4 + (x/2)];

the for(cp loop should be the outermost IMHO
also

enlarge_roq_mb4(uint8_t base[3][4][4], uint8_t u[3][8][8])

u[cp][y][x] = base[cp][y>>1][x>>1];
seems more readable

[...]
> +// FIXME Could use DSPContext.sse, but it is so speed critical (used
> +// just for motion estimation).
> +static int block_sse(uint8_t **buf1, uint8_t **buf2, int x1, int y1, int x2, int y2,
> +                     int stride, int size)
> +{
> +    int i, j, k;
> +    int sse=0;
> +
> +    for (i=0; i<size; i++)
> +        for (j=0; j<size; j++)
> +            for (k=0; k<3; k++)
> +                sse += square(((int) buf1[k][(y1+i)*stride + x1+j]) -
> +                              buf2[k][(y2+i)*stride + x2+j]);

for(k should be the outermost loop, it reduces the number of needed pointer
dereferences if the compiler is stupid

the int cast seems useless
and the thing can be vertically aligned like:
sse += square(((int) buf1[k][(y1+i)*stride + x1+j]) -
                     buf2[k][(y2+i)*stride + x2+j]);

[...]
> +/**
> + * Returns distortion between two 2x2 macroblocks
> + */
> +static inline int squared_diff_mb2(uint8_t a[3][4], uint8_t b[3][4])
> +{
> +    int sdiff=0;
> +    int n,cp;
> +
> +    for(cp=0;cp<3;cp++)
> +        for(n=0;n<4;n++)
> +        sdiff += square(b[cp][n] - a[cp][n]);

wrong indention

> +
> +    return sdiff;
> +}
> +
> +/**
> + * Returns SE between two 4x4 macroblocks
> + */
> +static inline int squared_diff_mb4(uint8_t a[3][16], uint8_t b[3][16])
> +{
> +    int sdiff=0;
> +    int n,cp;
> +
> +    for(cp=0;cp<3;cp++)
> +        for(n=0;n<16;n++)
> +        sdiff += square(b[cp][n] - a[cp][n]);
> +
> +    return sdiff;
> +}
> +
> +/**
> + * Returns SE between two 8x8 macroblocks
> + */
> +static inline int squared_diff_mb8(uint8_t a[3][64], uint8_t b[3][64])
> +{
> +    int sdiff=0;
> +    int n,cp;
> +
> +    for(cp=0;cp<3;cp++)
> +        for(n=0;n<64;n++)
> +        sdiff += square(b[cp][n] - a[cp][n]);
> +
> +    return sdiff;
> +}

code triplification

[...]
> +    uint32_t byteConsumption;    ///< Number of bytes used for data encodes consumed
> +    uint32_t codeConsumption;    ///< Number of 2-bit codes consumed
> +    uint32_t combinedBitConsumption;

whats the sense of these 3?! a single variable will do just fine
also they should be int or unsigned int

[...]
> +/**
> + * Template code to get macroblocks from parts of the image
> + */
> +#define EXTRACT_MACROBLOCK(FUNCT, SIZ)\
> +static void FUNCT(AVFrame *frame, int x, int y, uint8_t mb[3][SIZ*SIZ]) { \
> +    int i,j,cp,stride; \
> +    uint8_t *pa, *pb; \
> +\
> +    for(cp=0; cp<3; cp++) { \
> +        stride = frame->linesize[cp]; \
> +        pa = mb[cp]; \
> +        pb = frame->data[cp] + (y*stride) + x; \
> +        for(i=0; i<SIZ; i++) { \
> +            for(j=0; j<SIZ; j++) \
> +                pa[j] = pb[j]; \

memcpy()

> +            pa += SIZ; \
> +            pb += stride; \
> +        } \
> +    } \
> +}
> +
> +EXTRACT_MACROBLOCK(get_frame_mb2, 2)
> +
> +EXTRACT_MACROBLOCK(get_frame_mb4, 4)
> +
> +EXTRACT_MACROBLOCK(get_frame_mb8, 8)

why is this not a single normal fuction?

> +
> +/**
> + * Template code to find the codebook with the lowest distortion from an image
> + */
> +#define GET_LOWEST_CB_ERR(FUNCT, DIM, ERR_COMMAND) \
> +static int FUNCT(uint8_t cluster[3][DIM], uint8_t cb[][3][DIM], int numCB, int *outIndex) \
> +{ \
> +    int diff; \
> +    int pick=0, lDiff; \
> +    int i=0; \
> +\
> +    lDiff = INT_MAX; \
> +\
> +    /* Diff against the others */ \
> +    for (i=0; i<numCB; i++) { \
> +        diff = ERR_COMMAND \
> +        if (diff < lDiff) { \
> +            lDiff = diff; \
> +            pick = i; \
> +        } \
> +    } \
> +\
> +    *outIndex = pick; \
> +    return lDiff; \
> +}
> +
> +
> +GET_LOWEST_CB_ERR(index_mb2,  4, squared_diff_mb2(cluster, cb[i]);)
> +GET_LOWEST_CB_ERR(index_mb4, 16, squared_diff_mb4(cluster, cb[i]);)
> +GET_LOWEST_CB_ERR(index_mb8, 64, squared_diff_mb8(cluster, cb[i]);)

same question here, why is this not a normal funcion?

[...]
> +static void add_to_size_calc(roq_possibility_t *p, cel_evaluation_t *eval, roq_sizecalc_t *sizeCalc)
> +{
> +    SIZE_CALC_BASE_CODE;
> +
> +    /* Modify CB4 entries */
> +    for (i=0; i<numCB4Changes; i++) {
> +        if (!sizeCalc->cb4UsageCount[cb4Changes[i]]) {
> +            sizeCalc->usedCB4++;
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][0];
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][1];
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][2];
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][3];
> +        }
> +
> +        sizeCalc->cb4UsageCount[cb4Changes[i]]++;
> +    }
> +
> +    /* Modify CB2 entries */
> +    for (i=0; i<numCB2Changes; i++) {
> +        if (!sizeCalc->cb2UsageCount[cb2Changes[i]])
> +            sizeCalc->usedCB2++;
> +
> +        sizeCalc->cb2UsageCount[cb2Changes[i]]++;
> +    }
> +
> +    sizeCalc->numArguments += argumentsChange;
> +    sizeCalc->numCodes += codeChange;
> +}
> +
> +static void remove_from_size_calc(roq_possibility_t *p, cel_evaluation_t *eval, roq_sizecalc_t *sizeCalc)
> +{
> +    SIZE_CALC_BASE_CODE;
> +
> +    /* Modify CB4 entries */
> +    for (i=0; i<numCB4Changes; i++) {
> +        sizeCalc->cb4UsageCount[cb4Changes[i]]--;
> +
> +        if (!sizeCalc->cb4UsageCount[cb4Changes[i]]) {
> +            sizeCalc->usedCB4--;
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][0];
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][1];
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][2];
> +            cb2Changes[numCB2Changes++] = sizeCalc->cb4Indexes[cb4Changes[i]][3];
> +        }
> +    }
> +
> +    /* Modify CB2 entries */
> +    for (i=0; i<numCB2Changes; i++) {
> +        sizeCalc->cb2UsageCount[cb2Changes[i]]--;
> +
> +        if (!sizeCalc->cb2UsageCount[cb2Changes[i]])
> +            sizeCalc->usedCB2--;
> +    }
> +
> +    sizeCalc->numArguments -= argumentsChange;
> +    sizeCalc->numCodes -= codeChange;
> +}

duplicated

[...]
> +typedef struct
> +{
> +    unsigned char *data;
> +    int bytesRemaining;
> +    int bytesTotal;
> +} writebuffer_t;
> +
> +int roq_writebytes(writebuffer_t *wb, const uint8_t *data, unsigned long numBytes)
> +{
> +    if (wb->bytesRemaining < numBytes)
> +        return -1;
> +
> +    memcpy(wb->data, data, numBytes);
> +    wb->bytesRemaining -= numBytes;
> +    wb->data += numBytes;
> +    wb->bytesTotal += numBytes;
> +    return 1;
> +}

if(end - ptr >= size)
    bytestream_put_buffer(&ptr, data, size);

is simpler instead of the struct and roq_writebytes() IMHO

[...]
> +// TODO: Make chroma bias a parameter
> +int generate_codebooks2(RoqContext *enc, roq_tempdata_t *tempdata,
> +                        roq_cell *input, int inputCount, int *resultCount,
> +                        roq_cell *resultElements)
> +{
> +    int i;
> +    int *points = av_malloc(inputCount*6*sizeof(int));
> +    int *buf = points;
> +    roq_cell *results = resultElements;
> +    int *codebook = av_malloc(6*256*sizeof(int));
> +
> +    for (i=0; i<inputCount; i++) {
> +        *buf++ = input[i].y[0];
> +        *buf++ = input[i].y[1];
> +        *buf++ = input[i].y[2];
> +        *buf++ = input[i].y[3];
> +        *buf++ = input[i].u*4;
> +        *buf++ = input[i].v*4;
> +    }
> +
> +    ff_init_elbg(points, 6, inputCount, codebook, 256, 1, tempdata->closest_cb2,  &enc->randctx);
> +    ff_do_elbg(points, 6, inputCount, codebook, 256, 1, tempdata->closest_cb2, &enc->randctx);
> +
> +    av_free(points);
> +
> +    buf = codebook;
> +    for (i=0; i<256; i++) {
> +        results[i].y[0] = *buf++;
> +        results[i].y[1] = *buf++;
> +        results[i].y[2] = *buf++;
> +        results[i].y[3] = *buf++;
> +        results[i].u =    *buf++/4;
> +        results[i].v =    *buf++/4;
> +    }
> +
> +    av_free(codebook);
> +
> +    *resultCount = 256;
> +    return 1;
> +}
> +
> +// TODO: Make chroma bias a parameter
> +int generate_codebooks4(RoqContext *enc, roq_tempdata_t *tempdata,
> +                        roq_cell4 *input, int inputCount, int *resultCount,
> +                        roq_cell4 **resultElements)
> +{
> +    int i, k;
> +    int *points = av_malloc(24*inputCount*sizeof(int));
> +    int *buf = points;
> +    roq_cell4 *results = av_malloc(sizeof(roq_cell4) * 256);
> +    int *codebook = av_malloc(24*256*sizeof(int));
> +    int *closest_cb4 = av_malloc(24*inputCount*sizeof(int));
> +
> +    for (i=0; i<inputCount; i++)
> +        for (k=0; k<4; k++) {
> +            *buf++ = input[i].block[k].y[0];
> +            *buf++ = input[i].block[k].y[1];
> +            *buf++ = input[i].block[k].y[2];
> +            *buf++ = input[i].block[k].y[3];
> +            *buf++ = input[i].block[k].u*4;
> +            *buf++ = input[i].block[k].v*4;
> +        }
> +
> +    ff_init_elbg(points, 24, inputCount, codebook, 256, 1, closest_cb4, &enc->randctx);
> +    ff_do_elbg(points, 24, inputCount, codebook, 256, 1, closest_cb4, &enc->randctx);
> +
> +    av_free(points);
> +    av_free(closest_cb4);
> +
> +    buf = codebook;
> +    for (i=0; i<256; i++)
> +        for (k=0; k<4; k++) {
> +            results[i].block[k].y[0] = *buf++;
> +            results[i].block[k].y[1] = *buf++;
> +            results[i].block[k].y[2] = *buf++;
> +            results[i].block[k].y[3] = *buf++;
> +            results[i].block[k].u =    *buf++/4;
> +            results[i].block[k].v =    *buf++/4;
> +        }
> +
> +    av_free(codebook);
> +
> +    *resultElements = results;
> +    *resultCount = 256;
> +    return 1;
> +}
> +

this also is near duplicated

> +int generate_new_codebooks(RoqContext *enc, roq_tempdata_t *tempData)

this and many other functions should be static

[...]
> +static int roq_encode_init(AVCodecContext *avctx)
> +{
> +    RoqContext *enc = avctx->priv_data;
> +
> +    /* Clear out the context */
> +    memset(enc, 0, sizeof(*enc));

it should be zeroed automatically ...

[...]
> +static int roq_write_video_info_chunk(RoqContext *enc, writebuffer_t *wb)
> +{
> +    unsigned char chunk[16];
> +    unsigned char *buf = chunk;
> +
> +    /* ROQ info chunk */
> +    bytestream_put_le16(&buf, RoQ_INFO);
> +
> +    /* Size: 8 bytes */
> +    bytestream_put_le32(&buf, 8);
> +
> +    /* Unused argument */
> +    bytestream_put_byte(&buf, 0x00);
> +    bytestream_put_byte(&buf, 0x00);
> +
> +    /* Width */
> +    bytestream_put_le16(&buf, enc->width);
> +
> +    /* Height */
> +    bytestream_put_le16(&buf, enc->height);
> +
> +    /* Unused in Quake 3, mimics the output of the real encoder */
> +    bytestream_put_byte(&buf, 0x08);
> +    bytestream_put_byte(&buf, 0x00);
> +    bytestream_put_byte(&buf, 0x04);
> +    bytestream_put_byte(&buf, 0x00);
> +
> +    return roq_writebytes(wb, chunk, buf-chunk);

why dont write the data directly? instead of into chunk and then memcpy()
it?

[...]
> +    /* Check for I frame */
> +    if (!enc->framesToKeyframe) {
> +        enc->framesSinceKeyframe = 0;
> +        enc->framesToKeyframe = avctx->gop_size;
> +        rate = avctx->rc_max_rate;
> +    } else
> +        rate = avctx->rc_min_rate;

this looks like a nasty missuse of the max/min rate variables

[...]
> +    enc->maxBytes = rate / 240;

this could benefit from a comment like (fps=30) * (bit per byte=8) = 240 

also i dont think a constant number of bytes per frame is a good idea
even less so considering the complexity of the current code (which sadly
ive not completely reviewed yet, but i thought i better post some reply
with a partial review instead of letting you wait for a week ...)

[...]
> +static AVRational roq_framerates[] =
> +{
> +    {30,1},
> +    {0,0},
> +};
> +
> +AVCodec roq_encoder =
> +{
> +    "roqvideo",
> +    CODEC_TYPE_VIDEO,
> +    CODEC_ID_ROQ,
> +    sizeof(RoqContext),
> +    roq_encode_init,
> +    roq_encode_frame,
> +    roq_encode_end,
> +
> +    .supported_framerates = roq_framerates,

(AVRational[]){{30,1}, {0,0}}

> +    .pix_fmts = (enum PixelFormat[]){PIX_FMT_YUV444P, -1},
> +};
[...]

> +typedef struct
> +{
> +    int dx, dy;
> +} motion_vect;

why not use a simple array of 2 elements? this would also allow loops
to iterate over x and y

[...]
> +    int framesToKeyframe;

this seems redundant with gop_size and framesSinceKeyframe, also a design
based on the assumtion that the distance between keyframes is constant is
not ideal, some frame might very well be much cheaper to make keyframes
out of (first frame after a scene change for examplle) compared to the 
avergage frame ...
of course such scene change detection has nothing to do with the current
patch ...

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

Those who are too smart to engage in politics are punished by being
governed by those who are dumber. -- Plato 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 189 bytes
Desc: not available
URL: <http://lists.mplayerhq.hu/pipermail/ffmpeg-devel/attachments/20070610/46ef7dc9/attachment.pgp>