[FFmpeg-devel] [PATCH] avfilter/vf_bwdif_cuda: CUDA implementation of bwdif
Thomas Mundt
tmundt75 at gmail.com
Sun Oct 11 19:36:42 EEST 2020
Hi Philip,
On Fri, Oct 9, 2020 at 18:33, Philip Langdale <philipl at overt.org> wrote:
> I've been sitting on this for a couple of years now, and I figured I
> should just send it out. This is what I believe is a conceptually
> correct port of bwdif to cuda (modulo edge handling which is not done
> in the same way because the conditional checks for edges are expensive
> in cuda, but that's the same as for yadif_cuda).
>
> However, I see glitches in some samples where black or white pixels
> appear in white or black areas respectively. This seems like some
> sort of under/overflow. I've tried to use the largest cuda types
> everywhere, and that did appear to improve things but didn't make
> it go away. This is what led to me never sending this diff over the
> years, but maybe someone else has insights about this.
>
I am not familiar with CUDA, so here is just one difference I noticed
compared to the C code. Maybe that is the reason for the glitches.
> ---
> configure | 2 +
> libavfilter/Makefile | 2 +
> libavfilter/allfilters.c | 1 +
> libavfilter/vf_bwdif_cuda.c | 394 +++++++++++++++++++++++++++++++++++
> libavfilter/vf_bwdif_cuda.cu | 290 ++++++++++++++++++++++++++
> 5 files changed, 689 insertions(+)
> create mode 100644 libavfilter/vf_bwdif_cuda.c
> create mode 100644 libavfilter/vf_bwdif_cuda.cu
>
> ...
> +
> +template<typename T>
> +__inline__ __device__ T filter(T A, T B, T C, T D,
> + T a, T b, T c, T d, T e, T f, T g,
> + T h, T i, T j, T k, T l, T m, T n,
> + int clip_max)
> +{
> + T final;
> +
> + int fc = C;
> + int fd = (c + l) >> 1;
> + int fe = B;
>
In the following you sometimes use B and C directly and sometimes fc and
fe. Is there a reason for this?
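Just to illustrate (only a sketch, using the variable names from your
patch): since fc == C and fe == B, the spatial branch could for instance
read

    interpol = (coef_sp[0] * (fc + fe) - coef_sp[1] * (D + A)) >> 13;

which should be functionally identical, just easier to compare against the
C code, where c and e are used throughout.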
> +
> + int temporal_diff0 = abs(c - l);
> + int temporal_diff1 = (abs(g - fc) + abs(f - fe)) >> 1;
> + int temporal_diff2 = (abs(i - fc) + abs(h - fe)) >> 1;
> + int diff = max3(temporal_diff0 >> 1, temporal_diff1, temporal_diff2);
> +
> + if (!diff) {
> + final = fd;
> + } else {
> + int fb = ((d + m) >> 1) - fc;
> + int ff = ((c + l) >> 1) - fe;
>
Unless I am missing something, this should be:
int ff = ((b + k) >> 1) - fe;
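For comparison, the spatial check in libavfilter/vf_bwdif.c looks roughly
like this (quoting from memory, so please double-check against the tree):

    int b = ((prev2[mrefs2] + next2[mrefs2]) >> 1) - c;
    int f = ((prev2[prefs2] + next2[prefs2]) >> 1) - e;
    int dc = d - c;
    int de = d - e;
    int max = FFMAX3(de, dc, FFMIN(b, f));
    int min = FFMIN3(de, dc, FFMAX(b, f));
    diff = FFMAX3(diff, min, -max);

If I read the offsets correctly, prev2[prefs2]/next2[prefs2] correspond to
your b and k (the +2 lines), while (c + l) >> 1 is the 0 offset that
already went into fd.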
> + int dc = fd - fc;
> + int de = fd - fe;
> + int mmax = max3(de, dc, min(fb, ff));
> + int mmin = min3(de, dc, max(fb, ff));
> + diff = max3(diff, mmin, -mmax);
> +
> + int interpol;
> + if (abs(fc - fe) > temporal_diff0) {
> + interpol = (((coef_hf[0] * (c + l)
> + - coef_hf[1] * (d + m + b + k)
> + + coef_hf[2] * (e + n + a + j)) >> 2)
> + + coef_lf[0] * (C + B) - coef_lf[1] * (D + A)) >> 13;
> + } else {
> + interpol = (coef_sp[0] * (C + B) - coef_sp[1] * (D + A)) >> 13;
> + }
> + if (interpol > fd + diff) {
> + interpol = fd + diff;
> + } else if (interpol < fd - diff) {
> + interpol = fd - diff;
> + }
> + final = clip(interpol, 0, clip_max);
> + }
> +
> + return final;
> +}
> +
> +template<typename T>
> +__inline__ __device__ void bwdif_single(T *dst,
> + cudaTextureObject_t prev,
> + cudaTextureObject_t cur,
> + cudaTextureObject_t next,
> + int dst_width, int dst_height, int dst_pitch,
> + int src_width, int src_height,
> + int parity, int tff, bool skip_spatial_check,
> + int clip_max)
> +{
> + // Identify location
> + int xo = blockIdx.x * blockDim.x + threadIdx.x;
> + int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +
> + if (xo >= dst_width || yo >= dst_height) {
> + return;
> + }
> +
> + // Don't modify the primary field
> + if (yo % 2 == parity) {
> + dst[yo*dst_pitch+xo] = tex2D<T>(cur, xo, yo);
> + return;
> + }
> +
> + T A = tex2D<T>(cur, xo, yo + 3);
> + T B = tex2D<T>(cur, xo, yo + 1);
> + T C = tex2D<T>(cur, xo, yo - 1);
> + T D = tex2D<T>(cur, xo, yo - 3);
> +
> + // Calculate temporal prediction
> + int is_second_field = !(parity ^ tff);
> +
> + cudaTextureObject_t prev2 = prev;
> + cudaTextureObject_t prev1 = is_second_field ? cur : prev;
> + cudaTextureObject_t next1 = is_second_field ? next : cur;
> + cudaTextureObject_t next2 = next;
> +
> + T a = tex2D<T>(prev2, xo, yo + 4);
> + T b = tex2D<T>(prev2, xo, yo + 2);
> + T c = tex2D<T>(prev2, xo, yo + 0);
> + T d = tex2D<T>(prev2, xo, yo - 2);
> + T e = tex2D<T>(prev2, xo, yo - 4);
> + T f = tex2D<T>(prev1, xo, yo + 1);
> + T g = tex2D<T>(prev1, xo, yo - 1);
> + T h = tex2D<T>(next1, xo, yo + 1);
> + T i = tex2D<T>(next1, xo, yo - 1);
> + T j = tex2D<T>(next2, xo, yo + 4);
> + T k = tex2D<T>(next2, xo, yo + 2);
> + T l = tex2D<T>(next2, xo, yo + 0);
> + T m = tex2D<T>(next2, xo, yo - 2);
> + T n = tex2D<T>(next2, xo, yo - 4);
> +
> + dst[yo*dst_pitch+xo] = filter(A, B, C, D,
> + a, b, c, d, e, f, g,
> + h, i, j, k, l, m, n,
> + clip_max);
> +}
> +
> +template <typename T>
> +__inline__ __device__ void bwdif_double(T *dst,
> + cudaTextureObject_t prev,
> + cudaTextureObject_t cur,
> + cudaTextureObject_t next,
> + int dst_width, int dst_height, int dst_pitch,
> + int src_width, int src_height,
> + int parity, int tff, bool skip_spatial_check,
> + int clip_max)
> +{
> + int xo = blockIdx.x * blockDim.x + threadIdx.x;
> + int yo = blockIdx.y * blockDim.y + threadIdx.y;
> +
> + if (xo >= dst_width || yo >= dst_height) {
> + return;
> + }
> +
> + if (yo % 2 == parity) {
> + // Don't modify the primary field
> + dst[yo*dst_pitch+xo] = tex2D<T>(cur, xo, yo);
> + return;
> + }
> +
> + T A = tex2D<T>(cur, xo, yo + 3);
> + T B = tex2D<T>(cur, xo, yo + 1);
> + T C = tex2D<T>(cur, xo, yo - 1);
> + T D = tex2D<T>(cur, xo, yo - 3);
> +
> + // Calculate temporal prediction
> + int is_second_field = !(parity ^ tff);
> +
> + cudaTextureObject_t prev2 = prev;
> + cudaTextureObject_t prev1 = is_second_field ? cur : prev;
> + cudaTextureObject_t next1 = is_second_field ? next : cur;
> + cudaTextureObject_t next2 = next;
> +
> + T a = tex2D<T>(prev2, xo, yo + 4);
> + T b = tex2D<T>(prev2, xo, yo + 2);
> + T c = tex2D<T>(prev2, xo, yo + 0);
> + T d = tex2D<T>(prev2, xo, yo - 2);
> + T e = tex2D<T>(prev2, xo, yo - 4);
> + T f = tex2D<T>(prev1, xo, yo + 1);
> + T g = tex2D<T>(prev1, xo, yo - 1);
> + T h = tex2D<T>(next1, xo, yo + 1);
> + T i = tex2D<T>(next1, xo, yo - 1);
> + T j = tex2D<T>(next2, xo, yo + 4);
> + T k = tex2D<T>(next2, xo, yo + 2);
> + T l = tex2D<T>(next2, xo, yo + 0);
> + T m = tex2D<T>(next2, xo, yo - 2);
> + T n = tex2D<T>(next2, xo, yo - 4);
> +
> + T final;
> + final.x = filter(A.x, B.x, C.x, D.x,
> + a.x, b.x, c.x, d.x, e.x, f.x, g.x,
> + h.x, i.x, j.x, k.x, l.x, m.x, n.x,
> + clip_max);
> + final.y = filter(A.y, B.y, C.y, D.y,
> + a.y, b.y, c.y, d.y, e.y, f.y, g.y,
> + h.y, i.y, j.y, k.y, l.y, m.y, n.y,
> + clip_max);
> +
> + dst[yo*dst_pitch+xo] = final;
> +}
> +
>
...
> +
> +} /* extern "C" */
> --
> 2.25.1
>
Best regards,
Thomas