[FFmpeg-devel] [PATCH 02/10] diracdsp: add dequantization SIMD

Mon Jun 27 13:53:47 CEST 2016

On 24 June 2016 at 16:38, James Almer <jamrial at gmail.com> wrote:

> On 6/24/2016 8:43 AM, Rostislav Pehlivanov wrote:
> > From 154e4312b09f568108dd97089e394c10bb3c28a9 Mon Sep 17 00:00:00 2001
> > From: Rostislav Pehlivanov <rpehlivanov at ob-encoder.com>
> > Date: Thu, 23 Jun 2016 18:06:56 +0100
> > Subject: [PATCH 2/2] diracdsp: add dequantization SIMD
> >
> > Currently unused, to be used in the following commits.
> >
> > Signed-off-by: Rostislav Pehlivanov <rpehlivanov at obe.tv>
> > ---
> >  libavcodec/diracdsp.c          | 24 ++++++++++++++++++++++++
> >  libavcodec/diracdsp.h          |  4 ++++
> >  libavcodec/x86/diracdsp.asm    | 36 ++++++++++++++++++++++++++++++++++++
> >  libavcodec/x86/diracdsp_init.c |  2 ++
> >  4 files changed, 66 insertions(+)
> >
> > diff --git a/libavcodec/diracdsp.c b/libavcodec/diracdsp.c
> > index ab8d149..cd1209e 100644
> > --- a/libavcodec/diracdsp.c
> > +++ b/libavcodec/diracdsp.c
> > @@ -189,6 +189,27 @@ static void add_rect_clamped_c(uint8_t *dst, const
> uint16_t *src, int stride,
> >      }
> >  }
> >
> > +#define DEQUANT_SUBBAND(PX)
>                     \
> > +static void dequant_subband_ ## PX ## _c(uint8_t *src, uint8_t *dst,
> ptrdiff_t stride,     \
> > +                                         const int qf, const int qs,
> int tot_v, int tot_h) \
> > +{
>                     \
> > +    int i, y;
>                     \
> > +    for (y = 0; y < tot_v; y++) {
>                     \
> > +        PX c, sign, *src_r = (PX *)src, *dst_r = (PX *)dst;
>                     \
> > +        for (i = 0; i < tot_h; i++) {
>                     \
> > +            c = *src_r++;
>                     \
> > +            sign = FFSIGN(c)*(!!c);
>                     \
> > +            c = (FFABS(c)*qf + qs) >> 2;
>                    \
> > +            *dst_r++ = c*sign;
>                    \
> > +        }
>                     \
> > +        src += tot_h << (sizeof(PX) >> 1);
>                    \
> > +        dst += stride;
>                    \
> > +    }
>                     \
> > +}
> > +
> > +DEQUANT_SUBBAND(int16_t)
> > +DEQUANT_SUBBAND(int32_t)
> > +
> >  #define PIXFUNC(PFX, WIDTH)
>  \
> >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][0] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _c; \
> >      c->PFX ## _dirac_pixels_tab[WIDTH>>4][1] = ff_ ## PFX ##
> _dirac_pixels ## WIDTH ## _l2_c; \
> > @@ -214,6 +235,9 @@ av_cold void ff_diracdsp_init(DiracDSPContext *c)
> >      c->biweight_dirac_pixels_tab[1] = biweight_dirac_pixels16_c;
> >      c->biweight_dirac_pixels_tab[2] = biweight_dirac_pixels32_c;
> >
> > +    c->dequant_subband[0] = c->dequant_subband[2] =
> dequant_subband_int16_t_c;
> > +    c->dequant_subband[1] = c->dequant_subband[3] =
> dequant_subband_int32_t_c;
> > +
> >      PIXFUNC(put, 8);
> >      PIXFUNC(put, 16);
> >      PIXFUNC(put, 32);
> > diff --git a/libavcodec/diracdsp.h b/libavcodec/diracdsp.h
> > index 25a872d..224828d 100644
> > --- a/libavcodec/diracdsp.h
> > +++ b/libavcodec/diracdsp.h
> > @@ -22,6 +22,7 @@
> >  #define AVCODEC_DIRACDSP_H
> >
> >  #include <stdint.h>
> > +#include <stddef.h>
> >
> >  typedef void (*dirac_weight_func)(uint8_t *block, int stride, int
> log2_denom, int weight, int h);
> >  typedef void (*dirac_biweight_func)(uint8_t *dst, const uint8_t *src,
> int stride, int log2_denom, int weightd, int weights, int h);
> > @@ -46,6 +47,9 @@ typedef struct {
> >      void (*add_rect_clamped)(uint8_t *dst/*align 16*/, const uint16_t
> *src/*align 16*/, int stride, const int16_t *idwt/*align 16*/, int
> idwt_stride, int width, int height/*mod 2*/);
> >      void (*add_dirac_obmc[3])(uint16_t *dst, const uint8_t *src, int
> stride, const uint8_t *obmc_weight, int yblen);
> >
> > +    /* 0-1: int16_t and int32_t asm/c, 2-3: int16 and int32_t, C only */
> > +    void (*dequant_subband[4])(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
> > +
> >      dirac_weight_func weight_dirac_pixels_tab[3];
> >      dirac_biweight_func biweight_dirac_pixels_tab[3];
> >  } DiracDSPContext;
> > diff --git a/libavcodec/x86/diracdsp.asm b/libavcodec/x86/diracdsp.asm
> > index a0d6788..a764706 100644
> > --- a/libavcodec/x86/diracdsp.asm
> > +++ b/libavcodec/x86/diracdsp.asm
> > @@ -307,4 +307,40 @@ cglobal put_signed_rect_clamped_10, 6, 9, 6, dst,
> dst_stride, src, src_stride, w
> >
> >      RET
> >
> > +; void dequant_subband_32(uint8_t *src, uint8_t *dst, ptrdiff_t stride,
> const int qf, const int qs, int tot_v, int tot_h)
> > +cglobal dequant_subband_32, 7, 9, 4, src, dst, stride, qf, qs, tot_v,
> tot_h
> > +
> > +    movd   m2, qfd
> > +    movd   m3, qsd
> > +    SPLATD m2
> > +    SPLATD m3
> > +    mov    r7, dstq
> > +    mov    r8, tot_hq
>
> Replace every r7 and r8 with r3 and r4, make the cglobal line 7, 7, 4
> and the function will work on x86_32.
>
> > +
> > +    .loop_v:
> > +    mov    dstq,   r7
> > +    mov    tot_hq, r8
> > +
> > +    .loop_h:
> > +    movu   m0, [srcq]
> > +
> > +    pabsd  m1, m0
> > +    pmulld m1, m2
> > +    paddd  m1, m3
> > +    psrld  m1,  2
> > +    psignd m1, m0
> > +
> > +    movu   [dstq], m1
> > +
> > +    add    srcq, mmsize
> > +    add    dstq, mmsize
> > +    sub    tot_hq, 4
> > +    jl     .loop_h
>
> Jump if greater. Also use tot_hd, or change the prototypes.
>
> > +
> > +    add    r7, strideq
> > +    sub    tot_vq, 1
> > +    jl     .loop_v
>
> Ditto.
>
> > +
> > +    RET
> > +
> >  %endif
> > diff --git a/libavcodec/x86/diracdsp_init.c
> b/libavcodec/x86/diracdsp_init.c
> > index 7fa554e..a1bab9c 100644
> > --- a/libavcodec/x86/diracdsp_init.c
> > +++ b/libavcodec/x86/diracdsp_init.c
> > @@ -48,6 +48,7 @@ void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int
> dst_stride, const int16_t
> >
> >  #if ARCH_X86_64
> >  void ff_put_signed_rect_clamped_10_sse4(uint8_t *dst, int dst_stride,
> const uint8_t *src, int src_stride, int width, int height);
> > +void ff_dequant_subband_32_sse4(uint8_t *src, uint8_t *dst, ptrdiff_t
> stride, const int qf, const int qs, int tot_v, int tot_h);
> >  #endif
> >
> >  #if HAVE_YASM
> > @@ -191,6 +192,7 @@ void ff_diracdsp_init_x86(DiracDSPContext* c)
> >
> >  #if ARCH_X86_64
> >      if (EXTERNAL_SSE4(mm_flags)) {
> > +        c->dequant_subband[1]         = ff_dequant_subband_32_sse4;
> >          c->put_signed_rect_clamped[1] =
> ff_put_signed_rect_clamped_10_sse4;
> >      }
> >  #endif
> > -- 2.8.1.369.geae769a
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>

I've attached another patch which should work fine now.
I wrote this on top of the put_signed_rect patch, so it does require that
first patch; if this patch is okay, I'll amend and tidy things up before I push.
For some reason, changing dstq to be stored in r4 or r3 broke it, and I have
no idea why: neither register is used after loading m2 and m3. The patch
should work on x86_32 now, but I'm still wondering why I can't save that register.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-diracdsp-add-dequantization-SIMD.patch
Type: text/x-patch
Size: 6575 bytes
Desc: not available
URL: <http://ffmpeg.org/pipermail/ffmpeg-devel/attachments/20160627/7ce4b0bd/attachment.bin>