[FFmpeg-cvslog] avcodec/vvcdec: move frame tab memset from the main thread to worker threads

Nuo Mi git at videolan.org
Thu Aug 15 15:44:06 EEST 2024


ffmpeg | branch: master | Nuo Mi <nuomi2021 at gmail.com> | Sun Jul 28 11:18:07 2024 +0800| [80af195804fdef9ccd5a48251fc366d28bceb437] | committer: Nuo Mi

avcodec/vvcdec: move frame tab memset from the main thread to worker threads

Zeroing the frame tables with memset in the main thread can become a bottleneck for the decoder.
For example, if it takes 1% of the processing time for one core, the maximum achievable FPS will be 100.
Moving the memset to worker threads fixes the issue.

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=80af195804fdef9ccd5a48251fc366d28bceb437
---

 libavcodec/vvc/dec.c    |  13 +++++-
 libavcodec/vvc/thread.c | 122 ++++++++++++++++++++++++++++--------------------
 libavcodec/vvc/thread.h |   1 +
 3 files changed, 85 insertions(+), 51 deletions(-)

diff --git a/libavcodec/vvc/dec.c b/libavcodec/vvc/dec.c
index 575bcfa33d..d34713296d 100644
--- a/libavcodec/vvc/dec.c
+++ b/libavcodec/vvc/dec.c
@@ -82,7 +82,13 @@ static int tl_create(TabList *l)
             if (!*t->tab)
                 return AVERROR(ENOMEM);
         }
-    } else if (l->zero) {
+    }
+    return 0;
+}
+
+static int tl_zero(TabList *l)
+{
+    if (l->zero) {
         for (int i = 0; i < l->nb_tabs; i++) {
             Tab *t = l->tabs + i;
             memset(*t->tab, 0, t->size);
@@ -404,6 +410,11 @@ static int pic_arrays_init(VVCContext *s, VVCFrameContext *fc)
     return 0;
 }
 
+int ff_vvc_per_frame_init(VVCFrameContext *fc)
+{
+    return frame_context_for_each_tl(fc, tl_zero);
+}
+
 static int min_positive(const int idx, const int diff, const int min_diff)
 {
     return diff > 0 && (idx < 0 || diff < min_diff);
diff --git a/libavcodec/vvc/thread.c b/libavcodec/vvc/thread.c
index 28065d726f..74f8e4e9d0 100644
--- a/libavcodec/vvc/thread.c
+++ b/libavcodec/vvc/thread.c
@@ -40,6 +40,7 @@ typedef struct ProgressListener {
 } ProgressListener;
 
 typedef enum VVCTaskStage {
+    VVC_TASK_STAGE_INIT,                    // for CTU(0, 0) only
     VVC_TASK_STAGE_PARSE,
     VVC_TASK_STAGE_INTER,
     VVC_TASK_STAGE_RECON,
@@ -175,10 +176,14 @@ static int task_has_target_score(VVCTask *t, const VVCTaskStage stage, const uin
     uint8_t target = 0;
     VVCFrameContext *fc = t->fc;
 
+    if (stage == VVC_TASK_STAGE_INIT)
+        return 1;
+
     if (stage == VVC_TASK_STAGE_PARSE) {
-        const H266RawSPS *rsps = fc->ps.sps->r;
-        const int wpp = rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, t->ry);
-        target = 2 + wpp - 1;                           //left parse + colocation + wpp - no previous stage
+        const H266RawSPS *rsps   = fc->ps.sps->r;
+        const int wpp            = rsps->sps_entropy_coding_sync_enabled_flag && !is_first_row(fc, t->rx, t->ry);
+        const int no_prev_stage  = t->rs > 0;
+        target = 2 + wpp - no_prev_stage;                           //left parse + colocation + wpp - no_prev_stage
     } else if (stage == VVC_TASK_STAGE_INTER) {
         target = atomic_load(&t->target_inter_score);
     } else {
@@ -399,6 +404,55 @@ static int task_priority_higher(const AVTask *_a, const AVTask *_b)
     return a->ry < b->ry;
 }
 
+static void check_colocation(VVCContext *s, VVCTask *t)
+{
+    const VVCFrameContext *fc = t->fc;
+
+    if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
+        VVCFrame *col       = fc->ref->collocated_ref;
+        const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
+        if (col && first_col) {
+            //we depend on bottom and right boundary, do not - 1 for y
+            const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
+            add_progress_listener(col, &t->col_listener, t, s, VVC_PROGRESS_MV, y);
+            return;
+        }
+    }
+    frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
+}
+
+static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, SliceContext *sc, EntryPoint *ep)
+{
+    const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
+    VVCTask *t   = ft->tasks + rs;
+
+    frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
+}
+
+static int run_init(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
+{
+    VVCFrameContext *fc = lc->fc;
+    VVCFrameThread *ft  = fc->ft;
+    const int ret       = ff_vvc_per_frame_init(fc);
+
+    if (ret < 0)
+        return ret;
+
+    for (int i = 0; i < fc->nb_slices; i++) {
+        SliceContext *sc = fc->slices[i];
+        for (int j = 0; j < sc->nb_eps; j++) {
+            EntryPoint *ep = sc->eps + j;
+            for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
+                const int rs = sc->sh.ctb_addr_in_curr_slice[k];
+                VVCTask *t   = ft->tasks + rs;
+                check_colocation(s, t);
+            }
+            submit_entry_point(s, ft, sc, ep);
+        }
+    }
+    return 0;
+}
+
 static void report_frame_progress(VVCFrameContext *fc,
    const int ry, const VVCProgress idx)
 {
@@ -547,6 +601,7 @@ static int run_alf(VVCContext *s, VVCLocalContext *lc, VVCTask *t)
 #define VVC_THREAD_DEBUG
 #ifdef VVC_THREAD_DEBUG
 const static char* task_name[] = {
+    "INIT",
     "P",
     "I",
     "R",
@@ -567,6 +622,7 @@ static void task_run_stage(VVCTask *t, VVCContext *s, VVCLocalContext *lc)
     VVCFrameThread *ft       = fc->ft;
     const VVCTaskStage stage = t->stage;
     static const run_func run[] = {
+        run_init,
         run_parse,
         run_inter,
         run_recon,
@@ -726,7 +782,7 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc)
 
     for (int rs = 0; rs < ft->ctu_count; rs++) {
         VVCTask *t = ft->tasks + rs;
-        task_init(t, VVC_TASK_STAGE_PARSE, fc, rs % ft->ctu_width, rs / ft->ctu_width);
+        task_init(t, rs ? VVC_TASK_STAGE_PARSE : VVC_TASK_STAGE_INIT, fc, rs % ft->ctu_width, rs / ft->ctu_width);
     }
 
     memset(&ft->row_progress[0], 0, sizeof(ft->row_progress));
@@ -745,59 +801,25 @@ fail:
     return AVERROR(ENOMEM);
 }
 
-static void check_colocation(VVCContext *s, VVCTask *t)
-{
-    const VVCFrameContext *fc = t->fc;
-
-    if (fc->ps.ph.r->ph_temporal_mvp_enabled_flag || fc->ps.sps->r->sps_sbtmvp_enabled_flag) {
-        VVCFrame *col       = fc->ref->collocated_ref;
-        const int first_col = t->rx == fc->ps.pps->ctb_to_col_bd[t->rx];
-        if (col && first_col) {
-            //we depend on bottom and right boundary, do not - 1 for y
-            const int y = (t->ry << fc->ps.sps->ctb_log2_size_y);
-            add_progress_listener(col, &t->col_listener, t, s, VVC_PROGRESS_MV, y);
-            return;
-        }
-    }
-    frame_thread_add_score(s, fc->ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
-}
-
-static void submit_entry_point(VVCContext *s, VVCFrameThread *ft, SliceContext *sc, EntryPoint *ep)
-{
-    const int rs = sc->sh.ctb_addr_in_curr_slice[ep->ctu_start];
-    VVCTask *t   = ft->tasks + rs;
-
-    frame_thread_add_score(s, ft, t->rx, t->ry, VVC_TASK_STAGE_PARSE);
-}
-
 int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc)
 {
     VVCFrameThread *ft = fc->ft;
 
-    // We'll handle this in two passes:
-    // Pass 0 to initialize tasks with parser, this will help detect bit stream error
-    // Pass 1 to shedule location check and submit the entry point
-    for (int pass = 0; pass < 2; pass++) {
-        for (int i = 0; i < fc->nb_slices; i++) {
-            SliceContext *sc = fc->slices[i];
-            for (int j = 0; j < sc->nb_eps; j++) {
-                EntryPoint *ep = sc->eps + j;
-                for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
-                    const int rs = sc->sh.ctb_addr_in_curr_slice[k];
-                    VVCTask *t   = ft->tasks + rs;
-                    if (pass) {
-                        check_colocation(s, t);
-                    } else {
-                        const int ret = task_init_parse(t, sc, ep, k);
-                        if (ret < 0)
-                            return ret;
-                    }
-                }
-                if (pass)
-                    submit_entry_point(s, ft, sc, ep);
+    for (int i = 0; i < fc->nb_slices; i++) {
+        SliceContext *sc = fc->slices[i];
+        for (int j = 0; j < sc->nb_eps; j++) {
+            EntryPoint *ep = sc->eps + j;
+            for (int k = ep->ctu_start; k < ep->ctu_end; k++) {
+                const int rs = sc->sh.ctb_addr_in_curr_slice[k];
+                VVCTask *t   = ft->tasks + rs;
+                const int ret = task_init_parse(t, sc, ep, k);
+                if (ret < 0)
+                    return ret;
             }
         }
     }
+    frame_thread_add_score(s, ft, 0, 0, VVC_TASK_STAGE_INIT);
+
     return 0;
 }
 
diff --git a/libavcodec/vvc/thread.h b/libavcodec/vvc/thread.h
index 8ac59b2ecf..7b15dbee59 100644
--- a/libavcodec/vvc/thread.h
+++ b/libavcodec/vvc/thread.h
@@ -32,5 +32,6 @@ int ff_vvc_frame_thread_init(VVCFrameContext *fc);
 void ff_vvc_frame_thread_free(VVCFrameContext *fc);
 int ff_vvc_frame_submit(VVCContext *s, VVCFrameContext *fc);
 int ff_vvc_frame_wait(VVCContext *s, VVCFrameContext *fc);
+int ff_vvc_per_frame_init(VVCFrameContext *fc);
 
 #endif // AVCODEC_VVC_THREAD_H



More information about the ffmpeg-cvslog mailing list