[FFmpeg-devel] [PATCH 6/6] avcodec/vc1: Introduce fast path for unescaping bitstream buffer
Andreas Rheinhardt
andreas.rheinhardt at outlook.com
Fri Mar 18 21:10:11 EET 2022
Ben Avison:
> Populate with implementations suitable for 32-bit and 64-bit Arm.
>
> Signed-off-by: Ben Avison <bavison at riscosopen.org>
> ---
> libavcodec/aarch64/vc1dsp_init_aarch64.c | 60 ++++++++
> libavcodec/aarch64/vc1dsp_neon.S | 176 +++++++++++++++++++++++
> libavcodec/arm/vc1dsp_init_neon.c | 60 ++++++++
> libavcodec/arm/vc1dsp_neon.S | 118 +++++++++++++++
> libavcodec/vc1dec.c | 20 +--
> libavcodec/vc1dsp.c | 2 +
> libavcodec/vc1dsp.h | 3 +
> 7 files changed, 429 insertions(+), 10 deletions(-)
>
> diff --git a/libavcodec/aarch64/vc1dsp_init_aarch64.c b/libavcodec/aarch64/vc1dsp_init_aarch64.c
> index b672b2aa99..2fc2d5d1d3 100644
> --- a/libavcodec/aarch64/vc1dsp_init_aarch64.c
> +++ b/libavcodec/aarch64/vc1dsp_init_aarch64.c
> @@ -51,6 +51,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
> void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
> int h, int x, int y);
>
> +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
> +
> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
> +{
> + /* Dealing with starting and stopping, and removing escape bytes, are
> + * comparatively less time-sensitive, so are more clearly expressed using
> + * a C wrapper around the assembly inner loop. Note that we assume a
> + * little-endian machine that supports unaligned loads. */
> + int dsize = 0;
> + while (size >= 4)
> + {
> + int found = 0;
> + while (!found && (((uintptr_t) dst) & 7) && size >= 4)
> + {
> + found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
> + if (!found)
> + {
> + *dst++ = *src++;
> + --size;
> + ++dsize;
> + }
> + }
> + if (!found)
> + {
> + int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
> + dst += skip;
> + src += skip;
> + size -= skip;
> + dsize += skip;
> + while (!found && size >= 4)
> + {
> + found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
> + if (!found)
> + {
> + *dst++ = *src++;
> + --size;
> + ++dsize;
> + }
> + }
> + }
> + if (found)
> + {
> + *dst++ = *src++;
> + *dst++ = *src++;
> + ++src;
> + size -= 3;
> + dsize += 2;
> + }
> + }
> + while (size > 0)
> + {
> + *dst++ = *src++;
> + --size;
> + ++dsize;
> + }
> + return dsize;
> +}
> +
> av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
> {
> int cpu_flags = av_get_cpu_flags();
> @@ -76,5 +134,7 @@ av_cold void ff_vc1dsp_init_aarch64(VC1DSPContext *dsp)
> dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
> dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
> dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
> +
> + dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
> }
> }
> diff --git a/libavcodec/aarch64/vc1dsp_neon.S b/libavcodec/aarch64/vc1dsp_neon.S
> index c3ca3eae1e..8bdeffab44 100644
> --- a/libavcodec/aarch64/vc1dsp_neon.S
> +++ b/libavcodec/aarch64/vc1dsp_neon.S
> @@ -1374,3 +1374,179 @@ function ff_vc1_h_loop_filter16_neon, export=1
> st2 {v2.b, v3.b}[7], [x6]
> 4: ret
> endfunc
> +
> +// Copy at most the specified number of bytes from source to destination buffer,
> +// stopping at a multiple of 32 bytes, none of which are the start of an escape sequence
> +// On entry:
> +// x0 -> source buffer
> +// w1 = max number of bytes to copy
> +// x2 -> destination buffer, optimally 8-byte aligned
> +// On exit:
> +// w0 = number of bytes not copied
> +function ff_vc1_unescape_buffer_helper_neon, export=1
> + // Offset by 80 to screen out cases that are too short for us to handle,
> + // and also make it easy to test for loop termination, or to determine
> + // whether we need an odd number of half-iterations of the loop.
> + subs w1, w1, #80
> + b.mi 90f
> +
> + // Set up useful constants
> + movi v20.4s, #3, lsl #24
> + movi v21.4s, #3, lsl #16
> +
> + tst w1, #32
> + b.ne 1f
> +
> + ld1 {v0.16b, v1.16b, v2.16b}, [x0], #48
> + ext v25.16b, v0.16b, v1.16b, #1
> + ext v26.16b, v0.16b, v1.16b, #2
> + ext v27.16b, v0.16b, v1.16b, #3
> + ext v29.16b, v1.16b, v2.16b, #1
> + ext v30.16b, v1.16b, v2.16b, #2
> + ext v31.16b, v1.16b, v2.16b, #3
> + bic v24.16b, v0.16b, v20.16b
> + bic v25.16b, v25.16b, v20.16b
> + bic v26.16b, v26.16b, v20.16b
> + bic v27.16b, v27.16b, v20.16b
> + bic v28.16b, v1.16b, v20.16b
> + bic v29.16b, v29.16b, v20.16b
> + bic v30.16b, v30.16b, v20.16b
> + bic v31.16b, v31.16b, v20.16b
> + eor v24.16b, v24.16b, v21.16b
> + eor v25.16b, v25.16b, v21.16b
> + eor v26.16b, v26.16b, v21.16b
> + eor v27.16b, v27.16b, v21.16b
> + eor v28.16b, v28.16b, v21.16b
> + eor v29.16b, v29.16b, v21.16b
> + eor v30.16b, v30.16b, v21.16b
> + eor v31.16b, v31.16b, v21.16b
> + cmeq v24.4s, v24.4s, #0
> + cmeq v25.4s, v25.4s, #0
> + cmeq v26.4s, v26.4s, #0
> + cmeq v27.4s, v27.4s, #0
> + add w1, w1, #32
> + b 3f
> +
> +1: ld1 {v3.16b, v4.16b, v5.16b}, [x0], #48
> + ext v25.16b, v3.16b, v4.16b, #1
> + ext v26.16b, v3.16b, v4.16b, #2
> + ext v27.16b, v3.16b, v4.16b, #3
> + ext v29.16b, v4.16b, v5.16b, #1
> + ext v30.16b, v4.16b, v5.16b, #2
> + ext v31.16b, v4.16b, v5.16b, #3
> + bic v24.16b, v3.16b, v20.16b
> + bic v25.16b, v25.16b, v20.16b
> + bic v26.16b, v26.16b, v20.16b
> + bic v27.16b, v27.16b, v20.16b
> + bic v28.16b, v4.16b, v20.16b
> + bic v29.16b, v29.16b, v20.16b
> + bic v30.16b, v30.16b, v20.16b
> + bic v31.16b, v31.16b, v20.16b
> + eor v24.16b, v24.16b, v21.16b
> + eor v25.16b, v25.16b, v21.16b
> + eor v26.16b, v26.16b, v21.16b
> + eor v27.16b, v27.16b, v21.16b
> + eor v28.16b, v28.16b, v21.16b
> + eor v29.16b, v29.16b, v21.16b
> + eor v30.16b, v30.16b, v21.16b
> + eor v31.16b, v31.16b, v21.16b
> + cmeq v24.4s, v24.4s, #0
> + cmeq v25.4s, v25.4s, #0
> + cmeq v26.4s, v26.4s, #0
> + cmeq v27.4s, v27.4s, #0
> + // Drop through...
> +2: mov v0.16b, v5.16b
> + ld1 {v1.16b, v2.16b}, [x0], #32
> + cmeq v28.4s, v28.4s, #0
> + cmeq v29.4s, v29.4s, #0
> + cmeq v30.4s, v30.4s, #0
> + cmeq v31.4s, v31.4s, #0
> + orr v24.16b, v24.16b, v25.16b
> + orr v26.16b, v26.16b, v27.16b
> + orr v28.16b, v28.16b, v29.16b
> + orr v30.16b, v30.16b, v31.16b
> + ext v25.16b, v0.16b, v1.16b, #1
> + orr v22.16b, v24.16b, v26.16b
> + ext v26.16b, v0.16b, v1.16b, #2
> + ext v27.16b, v0.16b, v1.16b, #3
> + ext v29.16b, v1.16b, v2.16b, #1
> + orr v23.16b, v28.16b, v30.16b
> + ext v30.16b, v1.16b, v2.16b, #2
> + ext v31.16b, v1.16b, v2.16b, #3
> + bic v24.16b, v0.16b, v20.16b
> + bic v25.16b, v25.16b, v20.16b
> + bic v26.16b, v26.16b, v20.16b
> + orr v22.16b, v22.16b, v23.16b
> + bic v27.16b, v27.16b, v20.16b
> + bic v28.16b, v1.16b, v20.16b
> + bic v29.16b, v29.16b, v20.16b
> + bic v30.16b, v30.16b, v20.16b
> + bic v31.16b, v31.16b, v20.16b
> + addv s22, v22.4s
> + eor v24.16b, v24.16b, v21.16b
> + eor v25.16b, v25.16b, v21.16b
> + eor v26.16b, v26.16b, v21.16b
> + eor v27.16b, v27.16b, v21.16b
> + eor v28.16b, v28.16b, v21.16b
> + mov w3, v22.s[0]
> + eor v29.16b, v29.16b, v21.16b
> + eor v30.16b, v30.16b, v21.16b
> + eor v31.16b, v31.16b, v21.16b
> + cmeq v24.4s, v24.4s, #0
> + cmeq v25.4s, v25.4s, #0
> + cmeq v26.4s, v26.4s, #0
> + cmeq v27.4s, v27.4s, #0
> + cbnz w3, 90f
> + st1 {v3.16b, v4.16b}, [x2], #32
> +3: mov v3.16b, v2.16b
> + ld1 {v4.16b, v5.16b}, [x0], #32
> + cmeq v28.4s, v28.4s, #0
> + cmeq v29.4s, v29.4s, #0
> + cmeq v30.4s, v30.4s, #0
> + cmeq v31.4s, v31.4s, #0
> + orr v24.16b, v24.16b, v25.16b
> + orr v26.16b, v26.16b, v27.16b
> + orr v28.16b, v28.16b, v29.16b
> + orr v30.16b, v30.16b, v31.16b
> + ext v25.16b, v3.16b, v4.16b, #1
> + orr v22.16b, v24.16b, v26.16b
> + ext v26.16b, v3.16b, v4.16b, #2
> + ext v27.16b, v3.16b, v4.16b, #3
> + ext v29.16b, v4.16b, v5.16b, #1
> + orr v23.16b, v28.16b, v30.16b
> + ext v30.16b, v4.16b, v5.16b, #2
> + ext v31.16b, v4.16b, v5.16b, #3
> + bic v24.16b, v3.16b, v20.16b
> + bic v25.16b, v25.16b, v20.16b
> + bic v26.16b, v26.16b, v20.16b
> + orr v22.16b, v22.16b, v23.16b
> + bic v27.16b, v27.16b, v20.16b
> + bic v28.16b, v4.16b, v20.16b
> + bic v29.16b, v29.16b, v20.16b
> + bic v30.16b, v30.16b, v20.16b
> + bic v31.16b, v31.16b, v20.16b
> + addv s22, v22.4s
> + eor v24.16b, v24.16b, v21.16b
> + eor v25.16b, v25.16b, v21.16b
> + eor v26.16b, v26.16b, v21.16b
> + eor v27.16b, v27.16b, v21.16b
> + eor v28.16b, v28.16b, v21.16b
> + mov w3, v22.s[0]
> + eor v29.16b, v29.16b, v21.16b
> + eor v30.16b, v30.16b, v21.16b
> + eor v31.16b, v31.16b, v21.16b
> + cmeq v24.4s, v24.4s, #0
> + cmeq v25.4s, v25.4s, #0
> + cmeq v26.4s, v26.4s, #0
> + cmeq v27.4s, v27.4s, #0
> + cbnz w3, 91f
> + st1 {v0.16b, v1.16b}, [x2], #32
> + subs w1, w1, #64
> + b.pl 2b
> +
> +90: add w0, w1, #80
> + ret
> +
> +91: sub w1, w1, #32
> + b 90b
> +endfunc
> diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
> index f5f5c702d7..3aefbcaf6d 100644
> --- a/libavcodec/arm/vc1dsp_init_neon.c
> +++ b/libavcodec/arm/vc1dsp_init_neon.c
> @@ -84,6 +84,64 @@ void ff_put_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
> void ff_avg_vc1_chroma_mc4_neon(uint8_t *dst, uint8_t *src, ptrdiff_t stride,
> int h, int x, int y);
>
> +int ff_vc1_unescape_buffer_helper_neon(const uint8_t *src, int size, uint8_t *dst);
> +
> +static int vc1_unescape_buffer_neon(const uint8_t *src, int size, uint8_t *dst)
> +{
> + /* Dealing with starting and stopping, and removing escape bytes, are
> + * comparatively less time-sensitive, so are more clearly expressed using
> + * a C wrapper around the assembly inner loop. Note that we assume a
> + * little-endian machine that supports unaligned loads. */
You should nevertheless use AV_RL32 for your unaligned little-endian loads.
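A rough, untested sketch of what I mean (AV_RL32 is the unaligned
little-endian load macro from libavutil/intreadwrite.h; the plain
uint32_t pointer cast is not well-defined on targets without unaligned
access):

    #include "libavutil/intreadwrite.h"

    /* 0x00 0x00 0x03 followed by a byte <= 0x03 starts an escape sequence */
    found = (AV_RL32(src) & ~0x03000000) == 0x00030000;

This should be a drop-in replacement for both occurrences of the cast
in the wrapper.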
> + int dsize = 0;
> + while (size >= 4)
> + {
> + int found = 0;
> + while (!found && (((uintptr_t) dst) & 7) && size >= 4)
> + {
> + found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
> + if (!found)
> + {
> + *dst++ = *src++;
> + --size;
> + ++dsize;
> + }
> + }
> + if (!found)
> + {
> + int skip = size - ff_vc1_unescape_buffer_helper_neon(src, size, dst);
> + dst += skip;
> + src += skip;
> + size -= skip;
> + dsize += skip;
> + while (!found && size >= 4)
> + {
> + found = (*(uint32_t *)src &~ 0x03000000) == 0x00030000;
> + if (!found)
> + {
> + *dst++ = *src++;
> + --size;
> + ++dsize;
> + }
> + }
> + }
> + if (found)
> + {
> + *dst++ = *src++;
> + *dst++ = *src++;
> + ++src;
> + size -= 3;
> + dsize += 2;
> + }
> + }
> + while (size > 0)
> + {
> + *dst++ = *src++;
> + --size;
> + ++dsize;
> + }
> + return dsize;
> +}
> +
> #define FN_ASSIGN(X, Y) \
> dsp->put_vc1_mspel_pixels_tab[0][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_16_neon; \
> dsp->put_vc1_mspel_pixels_tab[1][X+4*Y] = ff_put_vc1_mspel_mc##X##Y##_neon
> @@ -130,4 +188,6 @@ av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
> dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
> dsp->put_no_rnd_vc1_chroma_pixels_tab[1] = ff_put_vc1_chroma_mc4_neon;
> dsp->avg_no_rnd_vc1_chroma_pixels_tab[1] = ff_avg_vc1_chroma_mc4_neon;
> +
> + dsp->vc1_unescape_buffer = vc1_unescape_buffer_neon;
> }
> diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
> index 4ef083102b..9d7333cf12 100644
> --- a/libavcodec/arm/vc1dsp_neon.S
> +++ b/libavcodec/arm/vc1dsp_neon.S
> @@ -1804,3 +1804,121 @@ function ff_vc1_h_loop_filter16_neon, export=1
> 4: vpop {d8-d15}
> pop {r4-r6,pc}
> endfunc
> +
> +@ Copy at most the specified number of bytes from source to destination buffer,
> +@ stopping at a multiple of 16 bytes, none of which are the start of an escape sequence
> +@ On entry:
> +@ r0 -> source buffer
> +@ r1 = max number of bytes to copy
> +@ r2 -> destination buffer, optimally 8-byte aligned
> +@ On exit:
> +@ r0 = number of bytes not copied
> +function ff_vc1_unescape_buffer_helper_neon, export=1
> + @ Offset by 48 to screen out cases that are too short for us to handle,
> + @ and also make it easy to test for loop termination, or to determine
> + @ whether we need an odd number of half-iterations of the loop.
> + subs r1, r1, #48
> + bmi 90f
> +
> + @ Set up useful constants
> + vmov.i32 q0, #0x3000000
> + vmov.i32 q1, #0x30000
> +
> + tst r1, #16
> + bne 1f
> +
> + vld1.8 {q8, q9}, [r0]!
> + vbic q12, q8, q0
> + vext.8 q13, q8, q9, #1
> + vext.8 q14, q8, q9, #2
> + vext.8 q15, q8, q9, #3
> + veor q12, q12, q1
> + vbic q13, q13, q0
> + vbic q14, q14, q0
> + vbic q15, q15, q0
> + vceq.i32 q12, q12, #0
> + veor q13, q13, q1
> + veor q14, q14, q1
> + veor q15, q15, q1
> + vceq.i32 q13, q13, #0
> + vceq.i32 q14, q14, #0
> + vceq.i32 q15, q15, #0
> + add r1, r1, #16
> + b 3f
> +
> +1: vld1.8 {q10, q11}, [r0]!
> + vbic q12, q10, q0
> + vext.8 q13, q10, q11, #1
> + vext.8 q14, q10, q11, #2
> + vext.8 q15, q10, q11, #3
> + veor q12, q12, q1
> + vbic q13, q13, q0
> + vbic q14, q14, q0
> + vbic q15, q15, q0
> + vceq.i32 q12, q12, #0
> + veor q13, q13, q1
> + veor q14, q14, q1
> + veor q15, q15, q1
> + vceq.i32 q13, q13, #0
> + vceq.i32 q14, q14, #0
> + vceq.i32 q15, q15, #0
> + @ Drop through...
> +2: vmov q8, q11
> + vld1.8 {q9}, [r0]!
> + vorr q13, q12, q13
> + vorr q15, q14, q15
> + vbic q12, q8, q0
> + vorr q3, q13, q15
> + vext.8 q13, q8, q9, #1
> + vext.8 q14, q8, q9, #2
> + vext.8 q15, q8, q9, #3
> + veor q12, q12, q1
> + vorr d6, d6, d7
> + vbic q13, q13, q0
> + vbic q14, q14, q0
> + vbic q15, q15, q0
> + vceq.i32 q12, q12, #0
> + vmov r3, r12, d6
> + veor q13, q13, q1
> + veor q14, q14, q1
> + veor q15, q15, q1
> + vceq.i32 q13, q13, #0
> + vceq.i32 q14, q14, #0
> + vceq.i32 q15, q15, #0
> + orrs r3, r3, r12
> + bne 90f
> + vst1.64 {q10}, [r2]!
> +3: vmov q10, q9
> + vld1.8 {q11}, [r0]!
> + vorr q13, q12, q13
> + vorr q15, q14, q15
> + vbic q12, q10, q0
> + vorr q3, q13, q15
> + vext.8 q13, q10, q11, #1
> + vext.8 q14, q10, q11, #2
> + vext.8 q15, q10, q11, #3
> + veor q12, q12, q1
> + vorr d6, d6, d7
> + vbic q13, q13, q0
> + vbic q14, q14, q0
> + vbic q15, q15, q0
> + vceq.i32 q12, q12, #0
> + vmov r3, r12, d6
> + veor q13, q13, q1
> + veor q14, q14, q1
> + veor q15, q15, q1
> + vceq.i32 q13, q13, #0
> + vceq.i32 q14, q14, #0
> + vceq.i32 q15, q15, #0
> + orrs r3, r3, r12
> + bne 91f
> + vst1.64 {q8}, [r2]!
> + subs r1, r1, #32
> + bpl 2b
> +
> +90: add r0, r1, #48
> + bx lr
> +
> +91: sub r1, r1, #16
> + b 90b
> +endfunc
> diff --git a/libavcodec/vc1dec.c b/libavcodec/vc1dec.c
> index 1c92b9d401..6a30b5b664 100644
> --- a/libavcodec/vc1dec.c
> +++ b/libavcodec/vc1dec.c
> @@ -490,7 +490,7 @@ static av_cold int vc1_decode_init(AVCodecContext *avctx)
> size = next - start - 4;
> if (size <= 0)
> continue;
> - buf2_size = vc1_unescape_buffer(start + 4, size, buf2);
> + buf2_size = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
> init_get_bits(&gb, buf2, buf2_size * 8);
> switch (AV_RB32(start)) {
> case VC1_CODE_SEQHDR:
> @@ -680,7 +680,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
> case VC1_CODE_FRAME:
> if (avctx->hwaccel)
> buf_start = start;
> - buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
> + buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
> break;
> case VC1_CODE_FIELD: {
> int buf_size3;
> @@ -697,8 +697,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
> ret = AVERROR(ENOMEM);
> goto err;
> }
> - buf_size3 = vc1_unescape_buffer(start + 4, size,
> - slices[n_slices].buf);
> + buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
> + slices[n_slices].buf);
> init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
> buf_size3 << 3);
> slices[n_slices].mby_start = avctx->coded_height + 31 >> 5;
> @@ -709,7 +709,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
> break;
> }
> case VC1_CODE_ENTRYPOINT: /* it should be before frame data */
> - buf_size2 = vc1_unescape_buffer(start + 4, size, buf2);
> + buf_size2 = v->vc1dsp.vc1_unescape_buffer(start + 4, size, buf2);
> init_get_bits(&s->gb, buf2, buf_size2 * 8);
> ff_vc1_decode_entry_point(avctx, v, &s->gb);
> break;
> @@ -726,8 +726,8 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
> ret = AVERROR(ENOMEM);
> goto err;
> }
> - buf_size3 = vc1_unescape_buffer(start + 4, size,
> - slices[n_slices].buf);
> + buf_size3 = v->vc1dsp.vc1_unescape_buffer(start + 4, size,
> + slices[n_slices].buf);
> init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
> buf_size3 << 3);
> slices[n_slices].mby_start = get_bits(&slices[n_slices].gb, 9);
> @@ -761,7 +761,7 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
> ret = AVERROR(ENOMEM);
> goto err;
> }
> - buf_size3 = vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
> + buf_size3 = v->vc1dsp.vc1_unescape_buffer(divider + 4, buf + buf_size - divider - 4, slices[n_slices].buf);
> init_get_bits(&slices[n_slices].gb, slices[n_slices].buf,
> buf_size3 << 3);
> slices[n_slices].mby_start = s->mb_height + 1 >> 1;
> @@ -770,9 +770,9 @@ static int vc1_decode_frame(AVCodecContext *avctx, void *data,
> n_slices1 = n_slices - 1;
> n_slices++;
> }
> - buf_size2 = vc1_unescape_buffer(buf, divider - buf, buf2);
> + buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, divider - buf, buf2);
> } else {
> - buf_size2 = vc1_unescape_buffer(buf, buf_size, buf2);
> + buf_size2 = v->vc1dsp.vc1_unescape_buffer(buf, buf_size, buf2);
> }
> init_get_bits(&s->gb, buf2, buf_size2*8);
> } else{
> diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
> index a29b91bf3d..11d493f002 100644
> --- a/libavcodec/vc1dsp.c
> +++ b/libavcodec/vc1dsp.c
> @@ -34,6 +34,7 @@
> #include "rnd_avg.h"
> #include "vc1dsp.h"
> #include "startcode.h"
> +#include "vc1_common.h"
>
> /* Apply overlap transform to horizontal edge */
> static void vc1_v_overlap_c(uint8_t *src, int stride)
> @@ -1030,6 +1031,7 @@ av_cold void ff_vc1dsp_init(VC1DSPContext *dsp)
> #endif /* CONFIG_WMV3IMAGE_DECODER || CONFIG_VC1IMAGE_DECODER */
>
> dsp->startcode_find_candidate = ff_startcode_find_candidate_c;
> + dsp->vc1_unescape_buffer = vc1_unescape_buffer;
>
> if (ARCH_AARCH64)
> ff_vc1dsp_init_aarch64(dsp);
> diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
> index c6443acb20..8be1198071 100644
> --- a/libavcodec/vc1dsp.h
> +++ b/libavcodec/vc1dsp.h
> @@ -80,6 +80,9 @@ typedef struct VC1DSPContext {
> * one or more further zero bytes and a one byte.
> */
> int (*startcode_find_candidate)(const uint8_t *buf, int size);
> +
> + /* Copy a buffer, removing startcode emulation escape bytes as we go */
> + int (*vc1_unescape_buffer)(const uint8_t *src, int size, uint8_t *dst);
> } VC1DSPContext;
>
> void ff_vc1dsp_init(VC1DSPContext* c);
1. You should add some benchmarks to the commit message.
2. The unescaping process for VC1 is basically the same as for H.264 and
HEVC*, and for those we already have better-optimized code in
libavcodec/h2645_parse.c. Can you check the performance of this code
here against (re)using the code from h2645_parse.c?
(3. Btw: the code in h2645_parse.c could even be optimized further along
the lines of
https://ffmpeg.org/pipermail/ffmpeg-devel/2019-June/245203.html; the
H.264 and VC1 parsers use a quite suboptimal startcode search, and that
patch is part of a patchset I submitted ages ago to improve it.)
- Andreas
*: Except for the fact that VC-1 seems to allow 0x00 0x00 0x03 0xXY with
0xXY > 3 (where the 0x03 is not escaped) to occur inside an EBDU; it also
allows 0x00 0x00 0x02 (while the informative process for encoders is the
same as for H.2645 and does not produce the byte sequences disallowed by
H.264).
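(Purely as an illustration of that difference, not something taken from
the patch: in byte terms the test the patch performs is equivalent to

    /* Hypothetical helper, equivalent to the patch's 32-bit test: the
     * 0x03 only acts as an escape byte when the following byte is
     * 0x00..0x03; for 0x00 0x00 0x03 0xXY with 0xXY > 3 the 0x03 is
     * kept as ordinary payload. */
    static int vc1_is_escape(const uint8_t *p)
    {
        return p[0] == 0x00 && p[1] == 0x00 && p[2] == 0x03 && p[3] <= 0x03;
    }

so the VC-1-only sequences mentioned above are exactly the ones where
this predicate is false and nothing gets removed.)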