[06/10] arm: Add VFP-accelerated version of imdct_half

Message ID 1374085586-9721-6-git-send-email-martin@martin.st
State Superseded
Headers show

Commit Message

Martin Storsjö July 17, 2013, 6:26 p.m.
From: Ben Avison <bavison@riscosopen.org>

               Before           After
               Mean    StdDev   Mean    StdDev  Change
This function   2653.0  28.5     1108.8  51.4   +139.3%
Overall        17049.5 408.2    15973.0 223.2     +6.7%
---
 libavcodec/arm/Makefile           |    1 +
 libavcodec/arm/fft_init_arm.c     |   11 +-
 libavcodec/arm/mdct_vfp.S         |  206 +++++++++++++++++++++++++++++++++++++
 libavcodec/arm/synth_filter_vfp.S |    2 +-
 4 files changed, 218 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/arm/mdct_vfp.S

Comments

Martin Storsjö July 19, 2013, 8:30 a.m. | #1
On Wed, 17 Jul 2013, Martin Storsjö wrote:

> From: Ben Avison <bavison@riscosopen.org>
>
>               Before           After
>               Mean    StdDev   Mean    StdDev  Change
> This function   2653.0  28.5     1108.8  51.4   +139.3%
> Overall        17049.5 408.2    15973.0 223.2     +6.7%
> ---
> libavcodec/arm/Makefile           |    1 +
> libavcodec/arm/fft_init_arm.c     |   11 +-
> libavcodec/arm/mdct_vfp.S         |  206 +++++++++++++++++++++++++++++++++++++
> libavcodec/arm/synth_filter_vfp.S |    2 +-
> 4 files changed, 218 insertions(+), 2 deletions(-)
> create mode 100644 libavcodec/arm/mdct_vfp.S
>
> diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
> index 6c0ed71..ed94061 100644
> --- a/libavcodec/arm/Makefile
> +++ b/libavcodec/arm/Makefile
> @@ -53,6 +53,7 @@ ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
>                                           arm/vp8dsp_armv6.o
>
> VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
> +                                          arm/mdct_vfp.o                \
>                                           arm/synth_filter_vfp.o

I changed this to build the new assembly files conditionally based on 
which context/decoder they belong to (as for the existing files for neon), 
avoiding including them in builds that don't have the decoders enabled 
that need them. This also fixed builds on e.g. armv5/vfp build 
configurations (since the assembly files were earlier only built if 
targeting >= armv6).

I also changed vmsr/vmrs to fmxr/fmrx to make it work on some older 
assemblers. Even if using UAL syntax for the asm, we've kept fmxr/fmrx 
(see b0e8ce55ae for instance).

The patchset in itself is shaping up pretty well IMO now - do Justin or 
Janne have anything more to add, or do you think it's soon good to go?

// Martin

Patch

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 6c0ed71..ed94061 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -53,6 +53,7 @@  ARMV6-OBJS-$(CONFIG_VP8_DECODER)       += arm/vp8_armv6.o               \
                                           arm/vp8dsp_armv6.o
 
 VFP-OBJS-$(HAVE_ARMV6)                 += arm/fmtconvert_vfp.o          \
+                                          arm/mdct_vfp.o                \
                                           arm/synth_filter_vfp.o
 
 NEON-OBJS                              += arm/fmtconvert_neon.o
diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 335bbec..1c4568d 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -26,6 +26,8 @@ 
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
+void ff_imdct_half_vfp(FFTContext *s, FFTSample *output, const FFTSample *input);
+
 void ff_imdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_imdct_half_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
 void ff_mdct_calc_neon(FFTContext *s, FFTSample *output, const FFTSample *input);
@@ -48,6 +50,13 @@  av_cold void ff_fft_init_arm(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+    if (have_vfp(cpu_flags)) {
+#if CONFIG_MDCT
+        if (!have_vfpv3(cpu_flags))
+            s->imdct_half   = ff_imdct_half_vfp;
+#endif
+    }
+
     if (have_neon(cpu_flags)) {
         s->fft_permute  = ff_fft_permute_neon;
         s->fft_calc     = ff_fft_calc_neon;
@@ -75,7 +84,7 @@  av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_vfp(cpu_flags))
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags))
         s->synth_filter_float = ff_synth_filter_float_vfp;
     if (have_neon(cpu_flags))
         s->synth_filter_float = ff_synth_filter_float_neon;
diff --git a/libavcodec/arm/mdct_vfp.S b/libavcodec/arm/mdct_vfp.S
new file mode 100644
index 0000000..75e222f
--- /dev/null
+++ b/libavcodec/arm/mdct_vfp.S
@@ -0,0 +1,206 @@ 
+/*
+ * Copyright (c) 2013 RISC OS Open Ltd
+ * Author: Ben Avison <bavison@riscosopen.org>
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+
+CONTEXT .req    a1
+ORIGOUT .req    a2
+IN      .req    a3
+OUT     .req    v1
+REVTAB  .req    v2
+TCOS    .req    v3
+TSIN    .req    v4
+OLDFPSCR .req   v5
+J0      .req    a2
+J1      .req    a4
+J2      .req    ip
+J3      .req    lr
+
+.macro prerotation_innerloop
+ .set trig_lo, k
+ .set trig_hi, n4 - k - 2
+ .set in_lo, trig_lo * 2
+ .set in_hi, trig_hi * 2
+        vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17
+        vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19
+        vldr    s0, [IN, #in_hi*4 + 12]
+        vldr    s1, [IN, #in_hi*4 + 4]
+        vldr    s2, [IN, #in_lo*4 + 12]
+        vldr    s3, [IN, #in_lo*4 + 4]
+        vmul.f  s8, s0, s16                     @ vector operation
+        vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21
+        vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23
+        vldr    s4, [IN, #in_lo*4]
+        vldr    s5, [IN, #in_lo*4 + 8]
+        vldr    s6, [IN, #in_hi*4]
+        vldr    s7, [IN, #in_hi*4 + 8]
+        ldr     J0, [REVTAB, #trig_lo*2]
+        vmul.f  s12, s0, s20                    @ vector operation
+        ldr     J2, [REVTAB, #trig_hi*2]
+        mov     J1, J0, lsr #16
+        and     J0, J0, #255                    @ halfword value will be < n4
+        vmls.f  s8, s4, s20                     @ vector operation
+        mov     J3, J2, lsr #16
+        and     J2, J2, #255                    @ halfword value will be < n4
+        add     J0, OUT, J0, lsl #3
+        vmla.f  s12, s4, s16                    @ vector operation
+        add     J1, OUT, J1, lsl #3
+        add     J2, OUT, J2, lsl #3
+        add     J3, OUT, J3, lsl #3
+        vstr    s8, [J0]
+        vstr    s9, [J1]
+        vstr    s10, [J2]
+        vstr    s11, [J3]
+        vstr    s12, [J0, #4]
+        vstr    s13, [J1, #4]
+        vstr    s14, [J2, #4]
+        vstr    s15, [J3, #4]
+ .set k, k + 2
+.endm
+
+.macro postrotation_innerloop tail, head
+ .set trig_lo_head, n8 - k - 2
+ .set trig_hi_head, n8 + k
+ .set out_lo_head, trig_lo_head * 2
+ .set out_hi_head, trig_hi_head * 2
+ .set trig_lo_tail, n8 - (k - 2) - 2
+ .set trig_hi_tail, n8 + (k - 2)
+ .set out_lo_tail, trig_lo_tail * 2
+ .set out_hi_tail, trig_hi_tail * 2
+ .if (k & 2) == 0
+  TCOS_D0_HEAD .req d10 @ s20,s21
+  TCOS_D1_HEAD .req d11 @ s22,s23
+  TCOS_S0_TAIL .req s24
+ .else
+  TCOS_D0_HEAD .req d12 @ s24,s25
+  TCOS_D1_HEAD .req d13 @ s26,s27
+  TCOS_S0_TAIL .req s20
+ .endif
+ .ifnc "\tail",""
+        vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation
+ .endif
+ .ifnc "\head",""
+        vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
+        vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
+        vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
+ .endif
+ .ifnc "\tail",""
+        vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation
+ .endif
+ .ifnc "\head",""
+        vldr    s0, [OUT, #out_lo_head*4]
+        vldr    s1, [OUT, #out_lo_head*4 + 8]
+        vldr    s2, [OUT, #out_hi_head*4]
+        vldr    s3, [OUT, #out_hi_head*4 + 8]
+        vldr    s4, [OUT, #out_lo_head*4 + 4]
+        vldr    s5, [OUT, #out_lo_head*4 + 12]
+        vldr    s6, [OUT, #out_hi_head*4 + 4]
+        vldr    s7, [OUT, #out_hi_head*4 + 12]
+ .endif
+ .ifnc "\tail",""
+        vstr    s8, [OUT, #out_lo_tail*4]
+        vstr    s9, [OUT, #out_lo_tail*4 + 8]
+        vstr    s10, [OUT, #out_hi_tail*4]
+        vstr    s11, [OUT, #out_hi_tail*4 + 8]
+ .endif
+ .ifnc "\head",""
+        vmul.f  s8, s4, s16                 @ vector operation
+ .endif
+ .ifnc "\tail",""
+        vstr    s12, [OUT, #out_hi_tail*4 + 12]
+        vstr    s13, [OUT, #out_hi_tail*4 + 4]
+        vstr    s14, [OUT, #out_lo_tail*4 + 12]
+        vstr    s15, [OUT, #out_lo_tail*4 + 4]
+ .endif
+ .ifnc "\head",""
+        vmul.f  s12, s0, s16                @ vector operation
+        vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
+ .endif
+ .unreq TCOS_D0_HEAD
+ .unreq TCOS_D1_HEAD
+ .unreq TCOS_S0_TAIL
+ .ifnc "\head",""
+  .set k, k + 2
+ .endif
+.endm
+
+
+/* void ff_imdct_half_vfp(FFTContext *s,
+ *                        FFTSample *output,
+ *                        const FFTSample *input)
+ */
+function ff_imdct_half_vfp, export=1
+        ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
+        teq     ip, #6
+        it      ne
+        bne     ff_imdct_half_c             @ only case currently accelerated is the one used by DCA
+
+ .set n, 1<<6
+ .set n2, n/2
+ .set n4, n/4
+ .set n8, n/8
+
+        push    {v1-v5,lr}
+        vpush   {s16-s27}
+        vmrs    OLDFPSCR, FPSCR
+        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
+        vmsr    FPSCR, lr
+        mov     OUT, ORIGOUT
+        ldr     REVTAB, [CONTEXT, #2*4]
+        ldr     TCOS, [CONTEXT, #6*4]
+        ldr     TSIN, [CONTEXT, #7*4]
+
+ .set k, 0
+ .rept n8/2
+        prerotation_innerloop
+ .endr
+
+        vmsr    FPSCR, OLDFPSCR
+        mov     ORIGOUT, OUT
+        ldr     ip, [CONTEXT, #9*4]
+        blx     ip                          @ s->fft_calc(s, output)
+        ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
+        vmsr    FPSCR, lr
+
+ .set k, 0
+        postrotation_innerloop , head
+ .rept n8/2 - 1
+        postrotation_innerloop tail, head
+ .endr
+        postrotation_innerloop tail
+
+        vmsr    FPSCR, OLDFPSCR
+        vpop    {s16-s27}
+        pop     {v1-v5,pc}
+endfunc
+
+        .unreq  CONTEXT
+        .unreq  ORIGOUT
+        .unreq  IN
+        .unreq  OUT
+        .unreq  REVTAB
+        .unreq  TCOS
+        .unreq  TSIN
+        .unreq  OLDFPSCR
+        .unreq  J0
+        .unreq  J1
+        .unreq  J2
+        .unreq  J3
diff --git a/libavcodec/arm/synth_filter_vfp.S b/libavcodec/arm/synth_filter_vfp.S
index 98af555..3f19da2 100644
--- a/libavcodec/arm/synth_filter_vfp.S
+++ b/libavcodec/arm/synth_filter_vfp.S
@@ -132,7 +132,7 @@  function ff_synth_filter_float_vfp, export=1
         str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
         ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
 VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
-        bl      ff_imdct_half_c
+        bl      ff_imdct_half_vfp
 VFP     vmov    SCALE, s16
 
         vmrs    OLDFPSCR, FPSCR