[FFmpeg-devel] [PATCH] avcodec/prores_ks: avoid calling fdct twice per block
Alex Mogurenko
alex at mogurenko.com
Sun Dec 30 22:57:23 EET 2018
The FDCT is performed twice for each block: first during quantiser calculation, and a second time during slice encoding. If we save the DCT coefficients from the first pass, there is no need to run the FDCT a second time.
Disadvantage: requires more memory.
Advantage: improves performance by ~4-5%.
---
libavcodec/proresenc_kostya.c | 74 ++++++++++++++++++++++++-----------
1 file changed, 52 insertions(+), 22 deletions(-)
diff --git a/libavcodec/proresenc_kostya.c b/libavcodec/proresenc_kostya.c
index e045a972f1..4d49d6521a 100644
--- a/libavcodec/proresenc_kostya.c
+++ b/libavcodec/proresenc_kostya.c
@@ -219,7 +219,6 @@ struct TrellisNode {
#define MAX_STORED_Q 16
typedef struct ProresThreadData {
- DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16];
int16_t custom_q[64];
int16_t custom_chroma_q[64];
@@ -228,7 +227,6 @@ typedef struct ProresThreadData {
typedef struct ProresContext {
AVClass *class;
- DECLARE_ALIGNED(16, int16_t, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE];
DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];
int16_t quants[MAX_STORED_Q][64];
int16_t quants_chroma[MAX_STORED_Q][64];
@@ -237,6 +235,7 @@ typedef struct ProresContext {
const uint8_t *quant_mat;
const uint8_t *quant_chroma_mat;
const uint8_t *scantable;
+ int16_t *blocks[MAX_PLANES];
void (*fdct)(FDCTDSPContext *fdsp, const uint16_t *src,
ptrdiff_t linesize, int16_t *block);
@@ -562,6 +561,8 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
int plane_factor, is_chroma;
uint16_t *qmat;
uint16_t *qmat_chroma;
+ int16_t *blocks;
+ DECLARE_ALIGNED(16, int16_t, dct_blocks)[16 * 16 * MAX_MBS_PER_SLICE];
if (ctx->pictures_per_frame == 1)
line_add = 0;
@@ -604,28 +605,38 @@ static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
src = (const uint16_t*)(pic->data[i] + yp * linesize +
line_add * pic->linesize[i]) + xp;
+ if (!ctx->force_quant) {
+ blocks = ctx->blocks[i] + (y * ctx->slices_width + x / ctx->mbs_per_slice) * 16 * 16 * ctx->mbs_per_slice;
+ } else {
+ blocks = dct_blocks;
+ }
+
if (i < 3) {
- get_slice_data(ctx, src, linesize, xp, yp,
- pwidth, avctx->height / ctx->pictures_per_frame,
- ctx->blocks[0], ctx->emu_buf,
- mbs_per_slice, num_cblocks, is_chroma);
+ if (ctx->force_quant) {
+ get_slice_data(ctx, src, linesize, xp, yp,
+ pwidth, avctx->height / ctx->pictures_per_frame,
+ blocks, ctx->emu_buf,
+ mbs_per_slice, num_cblocks, is_chroma);
+ }
if (!is_chroma) {/* luma quant */
sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
- mbs_per_slice, ctx->blocks[0],
+ mbs_per_slice, blocks,
num_cblocks, plane_factor,
qmat);
} else { /* chroma plane */
sizes[i] = encode_slice_plane(ctx, pb, src, linesize,
- mbs_per_slice, ctx->blocks[0],
+ mbs_per_slice, blocks,
num_cblocks, plane_factor,
qmat_chroma);
}
} else {
- get_alpha_data(ctx, src, linesize, xp, yp,
- pwidth, avctx->height / ctx->pictures_per_frame,
- ctx->blocks[0], mbs_per_slice, ctx->alpha_bits);
+ if (ctx->force_quant) {
+ get_alpha_data(ctx, src, linesize, xp, yp,
+ pwidth, avctx->height / ctx->pictures_per_frame,
+ blocks, mbs_per_slice, ctx->alpha_bits);
+ }
sizes[i] = encode_alpha_plane(ctx, pb, mbs_per_slice,
- ctx->blocks[0], quant);
+ blocks, quant);
}
total_size += sizes[i];
if (put_bits_left(pb) < 0) {
@@ -730,15 +741,15 @@ static int estimate_slice_plane(ProresContext *ctx, int *error, int plane,
const uint16_t *src, ptrdiff_t linesize,
int mbs_per_slice,
int blocks_per_mb, int plane_size_factor,
- const int16_t *qmat, ProresThreadData *td)
+ const int16_t *qmat, int16_t *blocks)
{
int blocks_per_slice;
int bits;
blocks_per_slice = mbs_per_slice * blocks_per_mb;
- bits = estimate_dcs(error, td->blocks[plane], blocks_per_slice, qmat[0]);
- bits += estimate_acs(error, td->blocks[plane], blocks_per_slice,
+ bits = estimate_dcs(error, blocks, blocks_per_slice, qmat[0]);
+ bits += estimate_acs(error, blocks, blocks_per_slice,
plane_size_factor, ctx->scantable, qmat);
return FFALIGN(bits, 8);
@@ -819,6 +830,7 @@ static int find_slice_quant(AVCodecContext *avctx,
int overquant;
uint16_t *qmat;
uint16_t *qmat_chroma;
+ int16_t *blocks[MAX_PLANES];
int linesize[4], line_add;
int alpha_bits = 0;
@@ -848,16 +860,17 @@ static int find_slice_quant(AVCodecContext *avctx,
linesize[i] = ctx->pic->linesize[i] * ctx->pictures_per_frame;
src = (const uint16_t *)(ctx->pic->data[i] + yp * linesize[i] +
line_add * ctx->pic->linesize[i]) + xp;
+ blocks[i] = ctx->blocks[i] + (y * ctx->slices_width + x / ctx->mbs_per_slice) * 16 * 16 * ctx->mbs_per_slice;
if (i < 3) {
get_slice_data(ctx, src, linesize[i], xp, yp,
pwidth, avctx->height / ctx->pictures_per_frame,
- td->blocks[i], td->emu_buf,
+ blocks[i], td->emu_buf,
mbs_per_slice, num_cblocks[i], is_chroma[i]);
} else {
get_alpha_data(ctx, src, linesize[i], xp, yp,
pwidth, avctx->height / ctx->pictures_per_frame,
- td->blocks[i], mbs_per_slice, ctx->alpha_bits);
+ blocks[i], mbs_per_slice, ctx->alpha_bits);
}
}
@@ -868,7 +881,7 @@ static int find_slice_quant(AVCodecContext *avctx,
if (ctx->alpha_bits)
alpha_bits = estimate_alpha_plane(ctx, src, linesize[3],
- mbs_per_slice, td->blocks[3]);
+ mbs_per_slice, blocks[3]);
// todo: maybe perform coarser quantising to fit into frame size when needed
for (q = min_quant; q <= max_quant; q++) {
bits = alpha_bits;
@@ -877,13 +890,13 @@ static int find_slice_quant(AVCodecContext *avctx,
src, linesize[0],
mbs_per_slice,
num_cblocks[0], plane_factor[0],
- ctx->quants[q], td); /* estimate luma plane */
+ ctx->quants[q], blocks[0]); /* estimate luma plane */
for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */
bits += estimate_slice_plane(ctx, &error, i,
src, linesize[i],
mbs_per_slice,
num_cblocks[i], plane_factor[i],
- ctx->quants_chroma[q], td);
+ ctx->quants_chroma[q], blocks[i]);
}
if (bits > 65000 * 8)
error = SCORE_LIMIT;
@@ -914,13 +927,13 @@ static int find_slice_quant(AVCodecContext *avctx,
src, linesize[0],
mbs_per_slice,
num_cblocks[0], plane_factor[0],
- qmat, td);/* estimate luma plane */
+ qmat, blocks[0]);/* estimate luma plane */
for (i = 1; i < ctx->num_planes - !!ctx->alpha_bits; i++) { /* estimate chroma plane */
bits += estimate_slice_plane(ctx, &error, i,
src, linesize[i],
mbs_per_slice,
num_cblocks[i], plane_factor[i],
- qmat_chroma, td);
+ qmat_chroma, blocks[i]);
}
if (bits <= ctx->bits_per_mb * mbs_per_slice)
break;
@@ -1167,6 +1180,10 @@ static av_cold int encode_close(AVCodecContext *avctx)
av_freep(&ctx->tdata);
av_freep(&ctx->slice_q);
+ for (i = 0; i < MAX_PLANES; i++) {
+ av_freep(&ctx->blocks[i]);
+ }
+
return 0;
}
@@ -1319,6 +1336,19 @@ FF_ENABLE_DEPRECATION_WARNINGS
ctx->tdata[j].nodes[i].score = 0;
}
}
+
+ for (j = 0; j < MAX_PLANES; j++) {
+ ctx->blocks[j] = av_malloc(16 * 16
+ * ctx -> slices_width
+ * ctx -> mb_height
+ * ctx -> mbs_per_slice
+ * sizeof(*ctx->blocks[0]));
+
+ if (!ctx->blocks[j]) {
+ encode_close(avctx);
+ return AVERROR(ENOMEM);
+ }
+ }
} else {
int ls = 0;
int ls_chroma = 0;
--
2.19.0
More information about the ffmpeg-devel
mailing list