[FFmpeg-devel] [PATCH 7/9] sbcenc: add MMX optimizations

Rostislav Pehlivanov atomnuker at gmail.com
Mon Mar 5 22:48:24 EET 2018


On 24 February 2018 at 12:05, Aurelien Jacobs <aurel at gnuage.org> wrote:

> On Thu, Feb 22, 2018 at 05:21:57PM +0000, Rostislav Pehlivanov wrote:
> > On 21 February 2018 at 22:37, Aurelien Jacobs <aurel at gnuage.org> wrote:
> > [...]
> > > +;*******************************************************************
> > > +;void ff_sbc_analyze_4(const int16_t *in, int32_t *out, const int16_t *consts);
> > > +;*******************************************************************
> > > +INIT_MMX mmx
> > > +cglobal sbc_analyze_4, 3, 3, 4, in, out, consts
> > > +    movq          m0, [inq]
> > > +    movq          m1, [inq+8]
> > > +    pmaddwd       m0, [constsq]
> > > +    pmaddwd       m1, [constsq+8]
> > > +    paddd         m0, [scale_mask]
> > > +    paddd         m1, [scale_mask]
> > > +
> > > +    movq          m2, [inq+16]
> > > +    movq          m3, [inq+24]
> > > +    pmaddwd       m2, [constsq+16]
> > > +    pmaddwd       m3, [constsq+24]
> > > +    paddd         m0, m2
> > > +    paddd         m1, m3
> > > +
> > > +    movq          m2, [inq+32]
> > > +    movq          m3, [inq+40]
> > > +    pmaddwd       m2, [constsq+32]
> > > +    pmaddwd       m3, [constsq+40]
> > > +    paddd         m0, m2
> > > +    paddd         m1, m3
> > > +
> > > +    movq          m2, [inq+48]
> > > +    movq          m3, [inq+56]
> > > +    pmaddwd       m2, [constsq+48]
> > > +    pmaddwd       m3, [constsq+56]
> > > +    paddd         m0, m2
> > > +    paddd         m1, m3
> > > +
> > > +    movq          m2, [inq+64]
> > > +    movq          m3, [inq+72]
> > > +    pmaddwd       m2, [constsq+64]
> > > +    pmaddwd       m3, [constsq+72]
> > > +    paddd         m0, m2
> > > +    paddd         m1, m3
> > >
> >
> > You can turn the top 3 blocks into a macro
> >
> > [...]
> > > +;*******************************************************************
> > > +;void ff_sbc_analyze_8(const int16_t *in, int32_t *out, const int16_t *consts);
> > > +;*******************************************************************
> > > +INIT_MMX mmx
> > > +cglobal sbc_analyze_8, 3, 3, 4, in, out, consts
> > > +    movq          m0, [inq]
> > > +    movq          m1, [inq+8]
> > > +    movq          m2, [inq+16]
> > > +    movq          m3, [inq+24]
> > > +    pmaddwd       m0, [constsq]
> > > +    pmaddwd       m1, [constsq+8]
> > > +    pmaddwd       m2, [constsq+16]
> > > +    pmaddwd       m3, [constsq+24]
> > > +    paddd         m0, [scale_mask]
> > > +    paddd         m1, [scale_mask]
> > > +    paddd         m2, [scale_mask]
> > > +    paddd         m3, [scale_mask]
> > > +
> > > +    movq          m4, [inq+32]
> > > +    movq          m5, [inq+40]
> > > +    movq          m6, [inq+48]
> > > +    movq          m7, [inq+56]
> > > +    pmaddwd       m4, [constsq+32]
> > > +    pmaddwd       m5, [constsq+40]
> > > +    pmaddwd       m6, [constsq+48]
> > > +    pmaddwd       m7, [constsq+56]
> > > +    paddd         m0, m4
> > > +    paddd         m1, m5
> > > +    paddd         m2, m6
> > > +    paddd         m3, m7
> > > +
> > > +    movq          m4, [inq+64]
> > > +    movq          m5, [inq+72]
> > > +    movq          m6, [inq+80]
> > > +    movq          m7, [inq+88]
> > > +    pmaddwd       m4, [constsq+64]
> > > +    pmaddwd       m5, [constsq+72]
> > > +    pmaddwd       m6, [constsq+80]
> > > +    pmaddwd       m7, [constsq+88]
> > > +    paddd         m0, m4
> > > +    paddd         m1, m5
> > > +    paddd         m2, m6
> > > +    paddd         m3, m7
> > > +
> > > +    movq          m4, [inq+96]
> > > +    movq          m5, [inq+104]
> > > +    movq          m6, [inq+112]
> > > +    movq          m7, [inq+120]
> > > +    pmaddwd       m4, [constsq+96]
> > > +    pmaddwd       m5, [constsq+104]
> > > +    pmaddwd       m6, [constsq+112]
> > > +    pmaddwd       m7, [constsq+120]
> > > +    paddd         m0, m4
> > > +    paddd         m1, m5
> > > +    paddd         m2, m6
> > > +    paddd         m3, m7
> > > +
> > > +    movq          m4, [inq+128]
> > > +    movq          m5, [inq+136]
> > > +    movq          m6, [inq+144]
> > > +    movq          m7, [inq+152]
> > > +    pmaddwd       m4, [constsq+128]
> > > +    pmaddwd       m5, [constsq+136]
> > > +    pmaddwd       m6, [constsq+144]
> > > +    pmaddwd       m7, [constsq+152]
> > > +    paddd         m0, m4
> > > +    paddd         m1, m5
> > > +    paddd         m2, m6
> > > +    paddd         m3, m7
> > >
> >
> > And those 5 blocks
> >
> >
> > > +
> > > +    psrad         m0, 16    ; SBC_PROTO_FIXED_SCALE
> > > +    psrad         m1, 16    ; SBC_PROTO_FIXED_SCALE
> > > +    psrad         m2, 16    ; SBC_PROTO_FIXED_SCALE
> > > +    psrad         m3, 16    ; SBC_PROTO_FIXED_SCALE
> > > +
> > > +    packssdw      m0, m0
> > > +    packssdw      m1, m1
> > > +    packssdw      m2, m2
> > > +    packssdw      m3, m3
> > > +
> > > +    movq          m4, m0
> > > +    movq          m5, m0
> > > +    pmaddwd       m4, [constsq+160]
> > > +    pmaddwd       m5, [constsq+168]
> > > +
> > > +    movq          m6, m1
> > > +    movq          m7, m1
> > > +    pmaddwd       m6, [constsq+192]
> > > +    pmaddwd       m7, [constsq+200]
> > > +    paddd         m4, m6
> > > +    paddd         m5, m7
> > > +
> > > +    movq          m6, m2
> > > +    movq          m7, m2
> > > +    pmaddwd       m6, [constsq+224]
> > > +    pmaddwd       m7, [constsq+232]
> > > +    paddd         m4, m6
> > > +    paddd         m5, m7
> > > +
> > > +    movq          m6, m3
> > > +    movq          m7, m3
> > > +    pmaddwd       m6, [constsq+256]
> > > +    pmaddwd       m7, [constsq+264]
> > > +    paddd         m4, m6
> > > +    paddd         m5, m7
> > >
> >
> > Reuse the first macro here
> >
> > Should save quite a bit of code
>
> OK, here is a "macroified" version of the code.
>
> _______________________________________________
> ffmpeg-devel mailing list
> ffmpeg-devel at ffmpeg.org
> http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
>
>
Looks fine to me, but I'd like to get someone else's opinion on this.
jamrial / nevcairiel / gramner?


More information about the ffmpeg-devel mailing list