[FFmpeg-devel] [PATCH] NEON code for basic scalar ops

Kostya kostya.shishkov
Tue Aug 18 19:42:53 CEST 2009


On Thu, Aug 13, 2009 at 10:51:36AM +0100, Måns Rullgård wrote:
> Kostya <kostya.shishkov at gmail.com> writes:
> 
> > On Thu, Aug 13, 2009 at 12:33:07AM +0100, Måns Rullgård wrote:
> >> Kostya <kostya.shishkov at gmail.com> writes:
> >> 
> >> > On Tue, Jul 21, 2009 at 03:23:58PM +0100, Måns Rullgård wrote:
> >> >> Kostya <kostya.shishkov at gmail.com> writes:
> >> >> 
> >> >> > While waiting for RTMP patch review, here's a bit of NEON code to speed
> >> >> > up int16 array addition/subtraction and scalar product calculation.
> >> >> >
> >> >> > This about halves decoding time for APE compressed at insane level
> >> >> > (so it's only 7 times slower than realtime on my BeagleBoard).
> >> >> 
> >> >> These functions are far from optimal.
> >> >
> >> > Since I won't be able to work at it for some time I post here version
> >> > that is few cycles closer to optimal (but still far away).
> >> >
> >> > +function ff_scalarproduct_int16_neon, export=1
> >> > +        vmov.i16        q0,  #0
> >> > +        vmov.i16        q1,  #0
> >> > +        vmov.i16        q2,  #0
> >> > +        vmov.i16        q3,  #0
> >> > +1:      vld1.16         {d16-d17}, [r0]!
> >> > +        vld1.16         {d20-d21}, [r1,:128]!
> >> > +        vmlal.s16       q0,  d16,  d20
> >> > +        vld1.16         {d18-d19}, [r0]!
> >> > +        vmlal.s16       q1,  d17,  d21
> >> > +        vld1.16         {d22-d23}, [r1,:128]!
> >> > +        vmlal.s16       q2,  d18,  d22
> >> > +        vmlal.s16       q3,  d19,  d23
> >> > +        subs            r2,  r2,   #16
> >> > +        bne             1b
> >> > +        vpadd.s32       d8,  d0,   d1
> >> > +        vpadd.s32       d9,  d2,   d3
> >> > +        vpadd.s32       d10, d4,   d5
> >> > +        vpadd.s32       d11, d6,   d7
> >> > +        vpadd.s32       d0,  d8,   d9
> >> > +        vpadd.s32       d1,  d10,  d11
> >> > +        vpadd.s32       d2,  d0,   d1
> >> > +        vpaddl.s32      d3,  d2
> >> > +        vmov.32         r0,  d3[0]
> >> > +        asr             r0,  r3
> >> > +        bx              lr
> >> > +        .endfunc
> >> 
> >> This doesn't do exactly the same thing as the C version, which shifts
> >> immediately after the multiplication, before accumulating.  However,
> >> all calls to DSPContext.scalarproduct_int16 have a zero shift.
> >> 
> >> Since shifting at the end is both more accurate and faster, maybe we
> >> should change it.  Someone would have to update the sse and altivec
> >> versions of course.
> >
> > The intent was to have sped-up scalar product calculating for Monkey
> > Audio but with CELP filters in mind too. Since those use fixed point
> > values, shift right after multiplication is logical there (and will
> > prevent overflows).
> 
> If you shift after multiplying, you can't use multiply-accumulate
> instructions.

How about this?
I've tested both branches, seems to work fine.
 
> -- 
> Måns Rullgård
> mans at mansr.com
-------------- next part --------------
/*
 * ARM NEON optimised integer operations
 * Copyright (c) 2009 Kostya Shishkov
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "asm.S"

        preserve8
        .fpu neon
        .text

/*
 * ff_add_int16_neon: in-place element-wise addition of two int16 arrays,
 * r0[i] += r1[i].
 *
 * In:   r0 = first operand and destination, accessed with a 128-bit
 *            alignment hint
 *       r1 = second operand, accessed without an alignment hint
 *       r2 = element count; 16 elements are consumed per pass and the loop
 *            exits only when the count reaches exactly zero, so it has to
 *            be a positive multiple of 16
 * Out:  the array at r0 is updated; r0 and r1 are left advanced past the
 *       processed data
 * Clobbers: r0-r2, q8-q13, flags
 */
function ff_add_int16_neon, export=1
1:      vld1.16         {d20-d23}, [r1]!            @ q10-q11 = 16 values from the second array
        vld1.16         {d16-d19}, [r0, :128]       @ q8-q9 = 16 values from the first array, r0 not stepped yet
        vadd.i16        q12,       q8,   q10        @ lanes 0-7
        vadd.i16        q13,       q9,   q11        @ lanes 8-15
        vst1.16         {d24-d27}, [r0, :128]!      @ write the sums back, then step r0
        subs            r2,        r2,   #16
        bne             1b
        bx              lr
        .endfunc

/*
 * ff_sub_int16_neon: in-place element-wise subtraction of two int16 arrays,
 * r0[i] -= r1[i].
 *
 * In:   r0 = minuend and destination, accessed with a 128-bit alignment hint
 *       r1 = subtrahend, accessed without an alignment hint
 *       r2 = element count; 16 elements are consumed per pass (as two
 *            groups of 8) and the loop exits only when the count reaches
 *            exactly zero, so it has to be a positive multiple of 16
 * Out:  the array at r0 is updated; r0 and r1 are left advanced past the
 *       processed data
 * Clobbers: r0-r2, q8-q13, flags
 */
function ff_sub_int16_neon, export=1
1:      vld1.16         {d20-d21}, [r1]!            @ q10 = subtrahend, elements 0-7
        vld1.16         {d16-d17}, [r0,:128]        @ q8  = minuend, elements 0-7
        vsub.i16        q12,       q8,   q10
        vst1.16         {d24-d25}, [r0,:128]!       @ first half is stored before the second is loaded
        vld1.16         {d22-d23}, [r1]!            @ q11 = subtrahend, elements 8-15
        vld1.16         {d18-d19}, [r0,:128]        @ q9  = minuend, elements 8-15
        vsub.i16        q13,       q9,   q11
        vst1.16         {d26-d27}, [r0,:128]!
        subs            r2,        r2,   #16
        bne             1b
        bx              lr
        .endfunc

/*
 * ff_scalarproduct_int16_neon: dot product of two int16 arrays with an
 * optional per-product right shift.
 *
 * NOTE(review): presumably matches the C prototype
 *   int32_t scalarproduct_int16(int16_t *v1, int16_t *v2, int len, int shift)
 * -- confirm against the DSPContext declaration.
 *
 * In:   r0 = first array, loaded without an alignment hint
 *       r1 = second array, loaded with a 128-bit alignment hint
 *       r2 = element count; 16 elements are consumed per pass and the loops
 *            exit only when the count reaches exactly zero, so it has to be
 *            a positive multiple of 16
 *       r3 = shift; when non-zero every 32-bit product is arithmetically
 *            shifted right by this amount before it is accumulated
 * Out:  r0 = low 32 bits of the accumulated sum
 * Clobbers: r0-r3, q0-q3, q8-q15, flags
 *
 * Only caller-saved NEON registers are used.  d8-d15 (q4-q7) are
 * callee-saved under the AAPCS and are deliberately left untouched, so no
 * vpush/vpop is needed.
 */
function ff_scalarproduct_int16_neon, export=1
        vmov.i32        q0,  #0                     @ q0-q3: four independent 4x32-bit accumulators
        vmov.i32        q1,  #0
        vmov.i32        q2,  #0
        vmov.i32        q3,  #0
        cmp             r3,  #0
        bne             2f                          @ non-zero shift: products must be shifted individually

        @ shift == 0: plain widening multiply-accumulate
1:      vld1.16         {d16-d17}, [r0]!
        vld1.16         {d20-d21}, [r1,:128]!
        vmlal.s16       q0,  d16,  d20
        vld1.16         {d18-d19}, [r0]!
        vmlal.s16       q1,  d17,  d21
        vld1.16         {d22-d23}, [r1,:128]!
        vmlal.s16       q2,  d18,  d22
        vmlal.s16       q3,  d19,  d23
        subs            r2,  r2,   #16
        bne             1b
        b               4f

        @ shift != 0: multiply, shift each product right, then accumulate
2:      rsb             r3,  r3,   #0               @ vshl with a negative count shifts right
        vdup.32         q12, r3                     @ q12 = -shift in every lane
3:      vld1.16         {d16-d17}, [r0]!
        vld1.16         {d20-d21}, [r1,:128]!
        vmull.s16       q13, d16,  d20
        vld1.16         {d18-d19}, [r0]!
        vmull.s16       q14, d17,  d21
        vld1.16         {d22-d23}, [r1,:128]!
        vmull.s16       q15, d18,  d22
        vmull.s16       q8,  d19,  d23              @ q8 (d16-d17) was consumed above, safe to reuse
        vshl.s32        q13, q13,  q12
        vshl.s32        q14, q14,  q12
        vadd.i32        q0,  q0,   q13
        vshl.s32        q15, q15,  q12
        vadd.i32        q1,  q1,   q14
        vshl.s32        q8,  q8,   q12
        vadd.i32        q2,  q2,   q15
        vadd.i32        q3,  q3,   q8
        subs            r2,  r2,   #16
        bne             3b

        @ horizontal reduction of the 16 partial sums held in q0-q3
4:      vpadd.i32       d16, d0,   d1
        vpadd.i32       d17, d2,   d3
        vpadd.i32       d18, d4,   d5
        vpadd.i32       d19, d6,   d7
        vpadd.i32       d0,  d16,  d17              @ d0 = { sum(q0), sum(q1) }
        vpadd.i32       d1,  d18,  d19              @ d1 = { sum(q2), sum(q3) }
        vpadd.i32       d2,  d0,   d1
        vpaddl.s32      d3,  d2                     @ final pair folded into one 64-bit lane
        vmov.32         r0,  d3[0]                  @ return the low 32 bits
        bx              lr
        .endfunc




More information about the ffmpeg-devel mailing list