[01/10] arm: Add VFP-accelerated version of synth_filter_float

Message ID 1373980994-30628-1-git-send-email-martin@martin.st
State Superseded
Headers show

Commit Message

Martin Storsjö July 16, 2013, 1:23 p.m.
From: Ben Avison <bavison@riscosopen.org>

               Before           After
               Mean    StdDev   Mean    StdDev  Change
This function   9295.0 114.9     4853.2 83.5    +91.5%
Overall        23699.8 397.6    19285.5 292.0   +22.9%
---
 libavcodec/arm/Makefile           |    3 +-
 libavcodec/arm/fft_init_arm.c     |    8 ++
 libavcodec/arm/synth_filter_vfp.S |  244 +++++++++++++++++++++++++++++++++++++
 3 files changed, 254 insertions(+), 1 deletion(-)
 create mode 100644 libavcodec/arm/synth_filter_vfp.S

Comments

Diego Biurrun July 16, 2013, 4:27 p.m. | #1
On 2013-07-16 15:23, Martin Storsjö wrote:
> --- a/libavcodec/arm/fft_init_arm.c
> +++ b/libavcodec/arm/fft_init_arm.c
> @@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
>
>   void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
>
> +void ff_synth_filter_float_vfp(FFTContext *imdct,
> +                               float *synth_buf_ptr, int *synth_buf_offset,
> +                               float synth_buf2[32], const float window[512],
> +                               float out[32], const float in[32],
> +                               float scale);
> +
>   void ff_synth_filter_float_neon(FFTContext *imdct,
>                                   float *synth_buf_ptr, int *synth_buf_offset,
>                                   float synth_buf2[32], const float window[512],
> @@ -69,6 +75,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
>   {
>       int cpu_flags = av_get_cpu_flags();
>
> +    if (have_vfp(cpu_flags))
> +        s->synth_filter_float = ff_synth_filter_float_vfp;
>       if (have_neon(cpu_flags))
>           s->synth_filter_float = ff_synth_filter_float_neon;
>   }

nit: Move the vfp functions below the neon functions.

> --- /dev/null
> +++ b/libavcodec/arm/synth_filter_vfp.S
> @@ -0,0 +1,244 @@
> +/*
> + * Copyright (c) 2013 RISC OS Open Ltd
> + *
> + * This file is part of Libav.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + *
> + * Author: Ben Avison <bavison@riscosopen.org>
> + */

Move the attribution next to the copyright notice.

Diego
Martin Storsjö July 16, 2013, 4:31 p.m. | #2
On Tue, 16 Jul 2013, Diego Biurrun wrote:

> On 2013-07-16 15:23, Martin Storsjö wrote:
>> --- a/libavcodec/arm/fft_init_arm.c
>> +++ b/libavcodec/arm/fft_init_arm.c
>> @@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, 
>> const FFTSample *input)
>>
>>   void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
>> 
>> +void ff_synth_filter_float_vfp(FFTContext *imdct,
>> +                               float *synth_buf_ptr, int 
>> *synth_buf_offset,
>> +                               float synth_buf2[32], const float 
>> window[512],
>> +                               float out[32], const float in[32],
>> +                               float scale);
>> +
>>   void ff_synth_filter_float_neon(FFTContext *imdct,
>>                                   float *synth_buf_ptr, int 
>> *synth_buf_offset,
>>                                   float synth_buf2[32], const float 
>> window[512],
>> @@ -69,6 +75,8 @@ av_cold void ff_synth_filter_init_arm(SynthFilterContext 
>> *s)
>>   {
>>       int cpu_flags = av_get_cpu_flags();
>> 
>> +    if (have_vfp(cpu_flags))
>> +        s->synth_filter_float = ff_synth_filter_float_vfp;
>>       if (have_neon(cpu_flags))
>>           s->synth_filter_float = ff_synth_filter_float_neon;
>>   }
>
> nit: Move the vfp functions below the neon functions.

Absolutely not.

Just as for x86 instruction sets, these are ordered from the least 
preferred to the most preferred ones. If you have both a neon and vfp 
implementation of a certain function (and your hw supports both) you want 
to have the neon one set.

>> --- /dev/null
>> +++ b/libavcodec/arm/synth_filter_vfp.S
>> @@ -0,0 +1,244 @@
>> +/*
>> + * Copyright (c) 2013 RISC OS Open Ltd
>> + *
>> + * This file is part of Libav.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with Libav; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 
>> 02110-1301 USA
>> + *
>> + * Author: Ben Avison <bavison@riscosopen.org>
>> + */
>
> Move the attribution next to the copyright notice.

I'll update the patchset locally (and on github) accordingly.

// Martin
Diego Biurrun July 16, 2013, 4:37 p.m. | #3
On 2013-07-16 18:31, Martin Storsjö wrote:
> On Tue, 16 Jul 2013, Diego Biurrun wrote:
>
>> On 2013-07-16 15:23, Martin Storsjö wrote:
>>> --- a/libavcodec/arm/fft_init_arm.c
>>> +++ b/libavcodec/arm/fft_init_arm.c
>>> @@ -32,6 +32,12 @@ void ff_mdct_calc_neon(FFTContext *s, FFTSample
>>> *output, const FFTSample *input)
>>>
>>>   void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
>>>
>>> +void ff_synth_filter_float_vfp(FFTContext *imdct,
>>> +                               float *synth_buf_ptr, int
>>> *synth_buf_offset,
>>> +                               float synth_buf2[32], const float
>>> window[512],
>>> +                               float out[32], const float in[32],
>>> +                               float scale);
>>> +
>>>   void ff_synth_filter_float_neon(FFTContext *imdct,
>>>                                   float *synth_buf_ptr, int
>>> *synth_buf_offset,
>>>                                   float synth_buf2[32], const float
>>> window[512],
>>> @@ -69,6 +75,8 @@ av_cold void
>>> ff_synth_filter_init_arm(SynthFilterContext *s)
>>>   {
>>>       int cpu_flags = av_get_cpu_flags();
>>>
>>> +    if (have_vfp(cpu_flags))
>>> +        s->synth_filter_float = ff_synth_filter_float_vfp;
>>>       if (have_neon(cpu_flags))
>>>           s->synth_filter_float = ff_synth_filter_float_neon;
>>>   }
>>
>> nit: Move the vfp functions below the neon functions.
>
> Absolutely not.
>
> Just as for x86 instruction sets, these are ordered from the least
> preferred to the most preferred ones. If you have both a neon and vfp
> implementation of a certain function (and your hw supports both) you
> want to have the neon one set.

OK, makes sense.

>>> --- /dev/null
>>> +++ b/libavcodec/arm/synth_filter_vfp.S
>>> @@ -0,0 +1,244 @@
>>> +/*
>>> + * Copyright (c) 2013 RISC OS Open Ltd
>>> + *
>>> + * This file is part of Libav.
>>> + *
>>> + * You should have received a copy of the GNU Lesser General Public
>>> + * License along with Libav; if not, write to the Free Software
>>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
>>> 02110-1301 USA
>>> + *
>>> + * Author: Ben Avison <bavison@riscosopen.org>
>>> + */
>>
>> Move the attribution next to the copyright notice.
>
> I'll update the patchset locally (and on github) accordingly.

I mentioned a few of those, please check all new files.

Diego
Ben Avison July 16, 2013, 5:08 p.m. | #4
On Tue, 16 Jul 2013 17:31:03 +0100, Martin Storsjö <martin@martin.st> wrote:
> On Tue, 16 Jul 2013, Diego Biurrun wrote:
>> nit: Move the vfp functions below the neon functions.
>
> Absolutely not.

Indeed. Contrary to normal practice, I'm going back and writing
optimisations for older CPUs some years after optimisations for newer CPUs
have already been done. So they are the correct way round :)

Ben
Kieran Kunhya July 17, 2013, 1:19 p.m. | #5
On Tue, Jul 16, 2013 at 6:08 PM, Ben Avison <bavison@riscosopen.org> wrote:
> On Tue, 16 Jul 2013 17:31:03 +0100, Martin Storsjö <martin@martin.st> wrote:
>>
>> On Tue, 16 Jul 2013, Diego Biurrun wrote:
>>>
>>> nit: Move the vfp functions below the neon functions.
>>
>>
>> Absolutely not.
>
>
> Indeed. Contrary to normal practice, I'm going back and writing
> optimisations for older CPUs some years after optimisations for newer CPUs
> have already been done. So they are the correct way round :)
>
> Ben

If you don't mind me asking, why? Are you not allowed to use the DSP
on the RPi for this?

Patch

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 875c3cc..6c0ed71 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -52,7 +52,8 @@  ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
                                           arm/vp8dsp_init_armv6.o       \
                                           arm/vp8dsp_armv6.o
 
-VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o
+VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
+                                          arm/synth_filter_vfp.o
 
 NEON-OBJS                              += arm/fmtconvert_neon.o
 
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 9ec620f..335bbec 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -32,6 +32,12 @@  void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input)
 
 void ff_rdft_calc_neon(struct RDFTContext *s, FFTSample *z);
 
+void ff_synth_filter_float_vfp(FFTContext *imdct,
+                               float *synth_buf_ptr, int *synth_buf_offset,
+                               float synth_buf2[32], const float window[512],
+                               float out[32], const float in[32],
+                               float scale);
+
 void ff_synth_filter_float_neon(FFTContext *imdct,
                                 float *synth_buf_ptr, int *synth_buf_offset,
                                 float synth_buf2[32], const float window[512],
@@ -69,6 +75,8 @@  av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_vfp(cpu_flags))
+        s->synth_filter_float = ff_synth_filter_float_vfp;
     if (have_neon(cpu_flags))
         s->synth_filter_float = ff_synth_filter_float_neon;
 }
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
new file mode 100644
index 0000000..5d8f8db
--- /dev/null
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -0,0 +1,244 @@ 
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ *
+ * Author: Ben Avison <bavison@riscosopen.org>
+ */
+
+#include "libavutil/arm/asm.S"
+
+IMDCT         .req    r0
+ORIG_P_SB     .req    r1
+P_SB_OFF      .req    r2
+I             .req    r0
+P_SB2_UP      .req    r1
+OLDFPSCR      .req    r2
+P_SB2_DN      .req    r3
+P_WIN_DN      .req    r4
+P_OUT_DN      .req    r5
+P_SB          .req    r6
+J_WRAP        .req    r7
+P_WIN_UP      .req    r12
+P_OUT_UP      .req    r14
+
+SCALE         .req    s0
+SBUF_DAT_REV0 .req    s4
+SBUF_DAT_REV1 .req    s5
+SBUF_DAT_REV2 .req    s6
+SBUF_DAT_REV3 .req    s7
+VA0           .req    s8
+VA3           .req    s11
+VB0           .req    s12
+VB3           .req    s15
+VC0           .req    s8
+VC3           .req    s11
+VD0           .req    s12
+VD3           .req    s15
+SBUF_DAT0     .req    s16
+SBUF_DAT1     .req    s17
+SBUF_DAT2     .req    s18
+SBUF_DAT3     .req    s19
+SBUF_DAT_ALT0 .req    s20
+SBUF_DAT_ALT1 .req    s21
+SBUF_DAT_ALT2 .req    s22
+SBUF_DAT_ALT3 .req    s23
+WIN_DN_DAT0   .req    s24
+WIN_UP_DAT0   .req    s28
+
+
+.macro inner_loop  half, tail, head
+ .if (OFFSET & (64*4)) == 0                @ even numbered call
+        SBUF_DAT_THIS0 .req SBUF_DAT0
+        SBUF_DAT_THIS1 .req SBUF_DAT1
+        SBUF_DAT_THIS2 .req SBUF_DAT2
+        SBUF_DAT_THIS3 .req SBUF_DAT3
+  .ifnc "\head",""
+        vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
+        vldr    d9, [P_SB, #OFFSET+8]
+  .endif
+ .else
+        SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
+        SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
+        SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
+        SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
+  .ifnc "\head",""
+        vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
+        vldr    d11, [P_SB, #OFFSET+8]
+  .endif
+ .endif
+ .ifnc "\tail",""
+  .ifc "\half","ab"
+        vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
+  .else
+        vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
+  .endif
+ .endif
+ .ifnc "\head",""
+        vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
+        vldr    d15, [P_WIN_UP, #OFFSET+8]
+        vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
+        vldr    d13, [P_WIN_DN, #OFFSET+8]
+        vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
+        vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
+        vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
+        vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
+  .ifc "\half","ab"
+        vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
+  .else
+        vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
+  .endif
+        teq     J_WRAP, #J
+        bne     2f             @ strongly predictable, so better than cond exec in this case
+        sub     P_SB, P_SB, #512*4
+2:
+  .set J, J - 64
+  .set OFFSET, OFFSET + 64*4
+ .endif
+        .unreq  SBUF_DAT_THIS0
+        .unreq  SBUF_DAT_THIS1
+        .unreq  SBUF_DAT_THIS2
+        .unreq  SBUF_DAT_THIS3
+.endm
+
+
+/* void ff_synth_filter_float_vfp(FFTContext *imdct,
+ *                                float *synth_buf_ptr, int *synth_buf_offset,
+ *                                float synth_buf2[32], const float window[512],
+ *                                float out[32], const float in[32], float scale)
+ */
+function ff_synth_filter_float_vfp, export=1
+        push    {r3-r7,lr}
+        vpush   {s16-s31}
+        ldr     lr, [P_SB_OFF]
+        add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
+        mov     P_SB, a2                  @ and keep a copy for ourselves
+        bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop
+        sub     lr, lr, #32
+        and     lr, lr, #512-32
+        str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
+        ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
+VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
+        bl      ff_imdct_half_c
+VFP     vmov    SCALE, s16
+
+        vmrs    OLDFPSCR, FPSCR
+        ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1
+        vmsr    FPSCR, lr
+        ldr     P_SB2_DN, [sp, #16*4]
+        ldr     P_WIN_DN, [sp, #(16+6+0)*4]
+        ldr     P_OUT_DN, [sp, #(16+6+1)*4]
+NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]
+
+#define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
+        add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
+        add     P_SB2_UP, P_SB2_DN, #16*4
+        add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
+        add     P_OUT_UP, P_OUT_DN, #16*4
+        add     P_SB2_DN, P_SB2_DN, #16*4
+        add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
+        add     P_OUT_DN, P_OUT_DN, #16*4
+        mov     I, #4
+1:
+        vldmia  P_SB2_UP!, {VB0-VB3}
+        vldmdb  P_SB2_DN!, {VA0-VA3}
+ .set J, 512 - 64
+ .set OFFSET, -IMM_OFF_SKEW
+        inner_loop  ab,, head
+ .rept 7
+        inner_loop  ab, tail, head
+ .endr
+        inner_loop  ab, tail
+        add     P_WIN_UP, P_WIN_UP, #4*4
+        sub     P_WIN_DN, P_WIN_DN, #4*4
+        vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar
+        add     P_SB, P_SB, #(512+4)*4
+        subs    I, I, #1
+        vmul.f  VA0, VA0, SCALE
+        vstmia  P_OUT_UP!, {VB0-VB3}
+        vstmdb  P_OUT_DN!, {VA0-VA3}
+        bne     1b
+
+        add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
+        sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
+        add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
+        mov     I, #4
+1:
+        vldr.d  d4, zero             @ d4 = VC0
+        vldr.d  d5, zero
+        vldr.d  d6, zero             @ d6 = VD0
+        vldr.d  d7, zero
+ .set J, 512 - 64
+ .set OFFSET, -IMM_OFF_SKEW
+        inner_loop  cd,, head
+ .rept 7
+        inner_loop  cd, tail, head
+ .endr
+        inner_loop  cd, tail
+        add     P_WIN_UP, P_WIN_UP, #4*4
+        sub     P_WIN_DN, P_WIN_DN, #4*4
+        add     P_SB, P_SB, #(512+4)*4
+        subs    I, I, #1
+        vstmia  P_SB2_UP!, {VC0-VC3}
+        vstmdb  P_SB2_DN!, {VD0-VD3}
+        bne     1b
+
+        vmsr    FPSCR, OLDFPSCR
+        vpop    {s16-s31}
+        pop     {r3-r7,pc}
+endfunc
+
+        .unreq  IMDCT
+        .unreq  ORIG_P_SB
+        .unreq  P_SB_OFF
+        .unreq  I
+        .unreq  P_SB2_UP
+        .unreq  OLDFPSCR
+        .unreq  P_SB2_DN
+        .unreq  P_WIN_DN
+        .unreq  P_OUT_DN
+        .unreq  P_SB
+        .unreq  J_WRAP
+        .unreq  P_WIN_UP
+        .unreq  P_OUT_UP
+
+        .unreq  SCALE
+        .unreq  SBUF_DAT_REV0
+        .unreq  SBUF_DAT_REV1
+        .unreq  SBUF_DAT_REV2
+        .unreq  SBUF_DAT_REV3
+        .unreq  VA0
+        .unreq  VA3
+        .unreq  VB0
+        .unreq  VB3
+        .unreq  VC0
+        .unreq  VC3
+        .unreq  VD0
+        .unreq  VD3
+        .unreq  SBUF_DAT0
+        .unreq  SBUF_DAT1
+        .unreq  SBUF_DAT2
+        .unreq  SBUF_DAT3
+        .unreq  SBUF_DAT_ALT0
+        .unreq  SBUF_DAT_ALT1
+        .unreq  SBUF_DAT_ALT2
+        .unreq  SBUF_DAT_ALT3
+        .unreq  WIN_DN_DAT0
+        .unreq  WIN_UP_DAT0
+
+        .align  3
+zero:   .word   0, 0