[FFmpeg-devel] [PATCH] libavcodec/pthread_slice: for large job counts, avoid lock/unlock between jobs on the same thread
Tom Butterworth
bangnoise at gmail.com
Mon Jul 13 18:34:00 CEST 2015
Currently, for large job counts, pthread_slice.c acquires and releases a lock
between consecutive jobs, even when they run on the same thread. Acquiring the
lock can take more time than the job itself.
For example, the DDS and Hap decoders naively create one job per 4x4 pixel
block. Decode times for a 4Kx2K frame:
decode before patch: 1562ms
decode after patch: 14ms
Clients should probably be able to submit jobs without having to consider the
number of threads the jobs will run on; this patch is one possible solution.
It splits the submitted jobs up front into at most thread_count contiguous
batches, so it assumes that all jobs take roughly the same amount of time and
that threads are scheduled evenly.
---
libavcodec/pthread_slice.c | 39 ++++++++++++++++++++++++++++++++-------
1 file changed, 32 insertions(+), 7 deletions(-)
diff --git a/libavcodec/pthread_slice.c b/libavcodec/pthread_slice.c
index c8e69f0..751e5f4 100644
--- a/libavcodec/pthread_slice.c
+++ b/libavcodec/pthread_slice.c
@@ -50,9 +50,11 @@ typedef struct SliceThreadContext {
action_func2 *func2;
void *args;
int *rets;
+ int *subjob_limits;
+ int *subjob_offsets;
int rets_count;
int job_count;
- int job_size;
+ int subjob_size;
pthread_cond_t last_job_cond;
pthread_cond_t current_job_cond;
@@ -76,6 +78,7 @@ static void* attribute_align_arg worker(void *v)
int our_job = c->job_count;
int thread_count = avctx->thread_count;
int self_id;
+ int i;
pthread_mutex_lock(&c->current_job_lock);
self_id = c->current_job++;
@@ -96,8 +99,10 @@ static void* attribute_align_arg worker(void *v)
}
pthread_mutex_unlock(&c->current_job_lock);
- c->rets[our_job%c->rets_count] = c->func ? c->func(avctx, (char*)c->args + our_job*c->job_size):
- c->func2(avctx, c->args, our_job, self_id);
+ for (i = c->subjob_offsets[our_job]; i < c->subjob_limits[our_job]; i++) {
+ c->rets[i%c->rets_count] = c->func ? c->func(avctx, (char*)c->args + i*c->subjob_size):
+ c->func2(avctx, c->args, i, self_id);
+ }
pthread_mutex_lock(&c->current_job_lock);
our_job = c->current_job++;
@@ -133,6 +138,8 @@ void ff_slice_thread_free(AVCodecContext *avctx)
av_freep(&c->progress_cond);
av_freep(&c->workers);
+ av_freep(&c->subjob_limits);
+ av_freep(&c->subjob_offsets);
av_freep(&avctx->internal->thread_ctx);
}
@@ -146,7 +153,7 @@ static av_always_inline void thread_park_workers(SliceThreadContext *c, int thre
static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, int *ret, int job_count, int job_size)
{
SliceThreadContext *c = avctx->internal->thread_ctx;
- int dummy_ret;
+ int dummy_ret, i;
if (!(avctx->active_thread_type&FF_THREAD_SLICE) || avctx->thread_count <= 1)
return avcodec_default_execute(avctx, func, arg, ret, job_count, job_size);
@@ -157,8 +164,14 @@ static int thread_execute(AVCodecContext *avctx, action_func* func, void *arg, i
pthread_mutex_lock(&c->current_job_lock);
c->current_job = avctx->thread_count;
- c->job_count = job_count;
- c->job_size = job_size;
+ c->job_count = FFMIN(job_count, avctx->thread_count);
+ c->subjob_offsets[0] = 0;
+ c->subjob_limits[0] = (job_count / c->job_count) + (job_count % c->job_count);
+ for (i = 1; i < c->job_count; i++) {
+ c->subjob_offsets[i] = c->subjob_limits[i-1];
+ c->subjob_limits[i] = c->subjob_offsets[i] + (job_count / c->job_count);
+ }
+ c->subjob_size = job_size;
c->args = arg;
c->func = func;
if (ret) {
@@ -218,17 +231,29 @@ int ff_slice_thread_init(AVCodecContext *avctx)
av_free(c);
return -1;
}
+ c->subjob_offsets = av_mallocz_array(thread_count, sizeof(int));
+ if (!c->subjob_offsets) {
+ av_free(c);
+ return -1;
+ }
+ c->subjob_limits = av_mallocz_array(thread_count, sizeof(int));
+ if (!c->subjob_limits) {
+ av_free(c);
+ return -1;
+ }
+
avctx->internal->thread_ctx = c;
c->current_job = 0;
c->job_count = 0;
- c->job_size = 0;
+ c->subjob_size = 0;
c->done = 0;
pthread_cond_init(&c->current_job_cond, NULL);
pthread_cond_init(&c->last_job_cond, NULL);
pthread_mutex_init(&c->current_job_lock, NULL);
pthread_mutex_lock(&c->current_job_lock);
for (i=0; i<thread_count; i++) {
+ c->subjob_limits[i] = c->subjob_offsets[i] = 0;
if(pthread_create(&c->workers[i], NULL, worker, avctx)) {
avctx->thread_count = i;
pthread_mutex_unlock(&c->current_job_lock);
--
2.3.2 (Apple Git-55)
More information about the ffmpeg-devel
mailing list