[FFmpeg-devel] [PATCH] avfilter/dctdnoiz: rewrite [f/i]dct

Mon Aug 4 00:44:48 CEST 2014

On Sun, Aug 03, 2014 at 10:27:21PM +0200, Clément Bœsch wrote:
> This removes the avcodec dependency and makes the code almost twice as
> fast. More to come.
> 
> The DCT factorization is based on "Fast and numerically stable
> algorithms for discrete cosine transforms" by Gerlind Plonka &
> Manfred Tasche (DOI: 10.1016/j.laa.2004.07.015).
> ---
>  configure                 |   2 -
>  libavfilter/vf_dctdnoiz.c | 328 +++++++++++++++++++++++++++++++++-------------
>  2 files changed, 240 insertions(+), 90 deletions(-)
> 
> diff --git a/configure b/configure
> index 9c3af50..6196b2a 100755
> --- a/configure
> +++ b/configure
> @@ -2526,8 +2526,6 @@ boxblur_filter_deps="gpl"
>  bs2b_filter_deps="libbs2b"
>  colormatrix_filter_deps="gpl"
>  cropdetect_filter_deps="gpl"
> -dctdnoiz_filter_deps="avcodec"
> -dctdnoiz_filter_select="dct"
>  delogo_filter_deps="gpl"
>  deshake_filter_deps="avcodec"
>  deshake_filter_select="me_cmp"
> diff --git a/libavfilter/vf_dctdnoiz.c b/libavfilter/vf_dctdnoiz.c
> index 71b8536..6d24934 100644
> --- a/libavfilter/vf_dctdnoiz.c
> +++ b/libavfilter/vf_dctdnoiz.c
> @@ -1,5 +1,5 @@
>  /*
> - * Copyright (c) 2013 Clément Bœsch
> + * Copyright (c) 2013-2014 Clément Bœsch
>   *
>   * This file is part of FFmpeg.
>   *
> @@ -23,7 +23,6 @@
>   * @see http://www.ipol.im/pub/art/2011/ys-dct/
>   */
>  
> -#include "libavcodec/avfft.h"
>  #include "libavutil/eval.h"
>  #include "libavutil/opt.h"
>  #include "drawutils.h"
> @@ -35,7 +34,7 @@
>  static const char *const var_names[] = { "c", NULL };
>  enum { VAR_C, VAR_VARS_NB };
>  
> -typedef struct {
> +typedef struct DCTdnoizContext {
>      const AVClass *class;
>  
>      /* coefficient factor expression */
> @@ -52,8 +51,9 @@ typedef struct {
>      int p_linesize;             // line sizes for color and weights
>      int overlap;                // number of block overlapping pixels
>      int step;                   // block step increment (BSIZE - overlap)
> -    DCTContext *dct, *idct;     // DCT and inverse DCT contexts
> -    float *block, *tmp_block;   // two BSIZE x BSIZE block buffers
> +    void (*filter_freq_func)(struct DCTdnoizContext *s,
> +                             const float *src, int src_linesize,
> +                             float *dst, int dst_linesize);
>  } DCTdnoizContext;
>  
>  #define OFFSET(x) offsetof(DCTdnoizContext, x)
> @@ -69,66 +69,245 @@ static const AVOption dctdnoiz_options[] = {
>  
>  AVFILTER_DEFINE_CLASS(dctdnoiz);
>  
> -static float *dct_block(DCTdnoizContext *ctx, const float *src, int src_linesize)
> +static void av_always_inline fdct16_1d(float *dst, const float *src,
> +                                       int dst_stridea, int dst_strideb,
> +                                       int src_stridea, int src_strideb)
>  {
> -    int x, y;
> -    float *column;
> -
> -    for (y = 0; y < BSIZE; y++) {
> -        float *line = ctx->block;
> +    int i;
>  
> -        memcpy(line, src, BSIZE * sizeof(*line));
> -        src += src_linesize;
> -        av_dct_calc(ctx->dct, line);
> -
> -        column = ctx->tmp_block + y;
> -        column[0] = line[0] * (1. / sqrt(BSIZE));
> -        column += BSIZE;
> -        for (x = 1; x < BSIZE; x++) {
> -            *column = line[x] * sqrt(2. / BSIZE);
> -            column += BSIZE;
> -        }
> +    for (i = 0; i < BSIZE; i++) {
> +        const float x0_0 = src[ 0*src_stridea] + src[15*src_stridea];
> +        const float x0_1 = src[ 1*src_stridea] + src[14*src_stridea];
> +        const float x0_2 = src[ 2*src_stridea] + src[13*src_stridea];
> +        const float x0_3 = src[ 3*src_stridea] + src[12*src_stridea];
> +        const float x0_4 = src[ 4*src_stridea] + src[11*src_stridea];
> +        const float x0_5 = src[ 5*src_stridea] + src[10*src_stridea];
> +        const float x0_6 = src[ 6*src_stridea] + src[ 9*src_stridea];
> +        const float x0_7 = src[ 7*src_stridea] + src[ 8*src_stridea];
> +        const float x0_8 = src[ 0*src_stridea] - src[15*src_stridea];
> +        const float x0_9 = src[ 1*src_stridea] - src[14*src_stridea];
> +        const float x0_a = src[ 2*src_stridea] - src[13*src_stridea];
> +        const float x0_b = src[ 3*src_stridea] - src[12*src_stridea];
> +        const float x0_c = src[ 4*src_stridea] - src[11*src_stridea];
> +        const float x0_d = src[ 5*src_stridea] - src[10*src_stridea];
> +        const float x0_e = src[ 6*src_stridea] - src[ 9*src_stridea];
> +        const float x0_f = src[ 7*src_stridea] - src[ 8*src_stridea];
> +        const float x2_0 = x0_0 + x0_7;
> +        const float x2_1 = x0_1 + x0_6;
> +        const float x2_2 = x0_2 + x0_5;
> +        const float x2_3 = x0_3 + x0_4;
> +        const float x2_4 = x0_0 - x0_7;
> +        const float x2_5 = x0_1 - x0_6;
> +        const float x2_6 = x0_2 - x0_5;
> +        const float x2_7 = x0_3 - x0_4;
> +        const float x4_0 = x2_0 + x2_3;
> +        const float x4_1 = x2_1 + x2_2;
> +        const float x4_2 = x2_0 - x2_3;
> +        const float x4_3 = x2_1 - x2_2;
> +        const float x5_0 = x4_0 + x4_1;
> +        const float x5_1 = x4_0 - x4_1;
> +        const float x5_2 =  1.306562964876380*x4_2 + 0.541196100146197*x4_3;
> +        const float x5_3 =  0.541196100146197*x4_2 - 1.306562964876380*x4_3;
> +        const float x6_0 =  1.387039845322150*x2_4 + 0.275899379282943*x2_7;
> +        const float x6_1 =  1.175875602419360*x2_5 + 0.785694958387102*x2_6;
> +        const float x6_2 = -0.785694958387102*x2_5 + 1.175875602419360*x2_6;
> +        const float x6_3 =  0.275899379282943*x2_4 - 1.387039845322150*x2_7;
> +        const float x7_0 = x6_0 + x6_1;
> +        const float x7_1 = x6_0 - x6_1;
> +        const float x7_2 = x6_2 + x6_3;
> +        const float x7_3 = x6_2 - x6_3;
> +        const float x3_5 =  0.707106781186547*x7_1 - 0.707106781186547*x7_3;
> +        const float x3_6 =  0.707106781186547*x7_1 + 0.707106781186547*x7_3;
> +        const float x8_0 =  1.407403737526380*x0_8 + 0.138617169199091*x0_f;
> +        const float x8_1 =  1.353318001174350*x0_9 + 0.410524527522357*x0_e;
> +        const float x8_2 =  1.247225012986670*x0_a + 0.666655658477747*x0_d;
> +        const float x8_3 =  1.093201867001760*x0_b + 0.897167586342636*x0_c;
> +        const float x8_4 = -0.897167586342636*x0_b + 1.093201867001760*x0_c;
> +        const float x8_5 =  0.666655658477747*x0_a - 1.247225012986670*x0_d;
> +        const float x8_6 = -0.410524527522357*x0_9 + 1.353318001174350*x0_e;
> +        const float x8_7 =  0.138617169199091*x0_8 - 1.407403737526380*x0_f;
> +        const float xa_0 = x8_0 + x8_3;
> +        const float xa_1 = x8_1 + x8_2;
> +        const float xa_2 = x8_0 - x8_3;
> +        const float xa_3 = x8_1 - x8_2;
> +        const float xb_0 = xa_0 + xa_1;
> +        const float xb_1 = xa_0 - xa_1;
> +        const float xb_2 = 1.306562964876380*xa_2 + 0.541196100146197*xa_3;
> +        const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
> +        const float xc_0 = x8_4 + x8_7;
> +        const float xc_1 = x8_5 + x8_6;
> +        const float xc_2 = x8_4 - x8_7;
> +        const float xc_3 = x8_5 - x8_6;
> +        const float xd_0 = xc_0 + xc_1;
> +        const float xd_1 = xc_0 - xc_1;
> +        const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
> +        const float xd_3 = 0.541196100146197*xc_2 - 1.306562964876380*xc_3;
> +        const float x1_9 = 0.707106781186547*xb_2 - 0.707106781186547*xd_3;
> +        const float x1_a = 0.707106781186547*xb_2 + 0.707106781186547*xd_3;
> +        const float x1_b = 0.707106781186547*xb_1 + 0.707106781186547*xd_1;
> +        const float x1_c = 0.707106781186547*xb_1 - 0.707106781186547*xd_1;
> +        const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
> +        const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
> +        dst[ 0*dst_stridea] = 0.25*x5_0;
> +        dst[ 1*dst_stridea] = 0.25*xb_0;
> +        dst[ 2*dst_stridea] = 0.25*x7_0;
> +        dst[ 3*dst_stridea] = 0.25*x1_9;
> +        dst[ 4*dst_stridea] = 0.25*x5_2;
> +        dst[ 5*dst_stridea] = 0.25*x1_a;
> +        dst[ 6*dst_stridea] = 0.25*x3_5;
> +        dst[ 7*dst_stridea] = 0.25*x1_b;
> +        dst[ 8*dst_stridea] = 0.25*x5_1;
> +        dst[ 9*dst_stridea] = 0.25*x1_c;
> +        dst[10*dst_stridea] = 0.25*x3_6;
> +        dst[11*dst_stridea] = 0.25*x1_d;
> +        dst[12*dst_stridea] = 0.25*x5_3;
> +        dst[13*dst_stridea] = 0.25*x1_e;
> +        dst[14*dst_stridea] = 0.25*x7_2;
> +        dst[15*dst_stridea] = 0.25*xd_0;

Many of these multiplies look like they can be merged into other
multiplies, by folding the constant factors into the coefficients of an
earlier stage.

For example, compare:

const float xd_2 = 1.306562964876380*xc_2 + 0.541196100146197*xc_3;
const float xb_3 = 0.541196100146197*xa_2 - 1.306562964876380*xa_3;
const float x1_d = 0.707106781186547*xb_3 - 0.707106781186547*xd_2;
const float x1_e = 0.707106781186547*xb_3 + 0.707106781186547*xd_2;
dst[11*dst_stridea] = 0.25*x1_d;
dst[13*dst_stridea] = 0.25*x1_e;

vs.

const float xd_2 = (0.25*0.707106781186547*1.306562964876380)*xc_2 + (0.25*0.707106781186547*0.541196100146197)*xc_3;
const float xb_3 = (0.25*0.707106781186547*0.541196100146197)*xa_2 - (0.25*0.707106781186547*1.306562964876380)*xa_3;
dst[11*dst_stridea] = xb_3 - xd_2;
dst[13*dst_stridea] = xb_3 + xd_2;

[...]
-- 
Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB

It is what and why we do it that matters, not just one of them.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: not available
Type: application/pgp-signature
Size: 181 bytes
Desc: Digital signature
URL: <https://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20140804/9ca32c44/attachment.asc>