[Updated,2/4] armv6: Accelerate ff_fft_calc for general case (nbits != 4)

Message ID: 1405074838-23873-1-git-send-email-bavison@riscosopen.org
State: New

Commit Message

Ben Avison July 11, 2014, 10:33 a.m.
The previous implementation targeted DTS Coherent Acoustics, which only
requires nbits == 4 (fft16()). This case was (and still is) linked directly
rather than being indirected through ff_fft_calc_vfp(), but now the full
range from radix-4 up to radix-65536 is available. This benefits other codecs
such as AAC and AC3.

The implementation is based upon the C version, with each routine larger than
radix-16 calling a hierarchy of smaller FFT functions, then performing a
post-processing pass. This pass benefits a lot from loop unrolling to
counter the long pipelines in the VFP. A relaxed calling standard also
reduces the overhead of the call hierarchy, and avoiding the excessive
inlining performed by GCC probably helps with I-cache utilisation too.
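
In outline, each composite routine looks like this (a sketch with n = 32
substituted into the fft_internal_vfp macro from the patch below; the
prologue/epilogue and the .set of the o1/o2/o3 offsets are omitted):

        mov     v1, a1                    @ z, 8 bytes per complex
        bl      fft16_internal_vfp        @ FFT16(z)
        add     a1, v1, #8*(32/4)*2
        bl      fft8_internal_vfp         @ FFT8(z + 16)
        ldr     v5, =ff_cos_32            @ twiddle factor table
        add     a1, v1, #8*(32/4)*3
        bl      fft8_internal_vfp         @ FFT8(z + 24)
        pass    (32/4/2), v1, v1, v1, v1  @ combining post-processing pass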

I benchmarked the result by measuring the number of gperftools samples that
hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
specifically in the FFT routines (fft4() to fft512() and pass()) for the
same sample AAC stream:

              Before          After
              Mean   StdDev   Mean   StdDev  Confidence  Change
Audio decode  2245.5 53.1     1599.6 43.8    100.0%      +40.4%
FFT routines  940.6  22.0     348.1  20.8    100.0%      +170.2%
---
 libavcodec/arm/fft_init_arm.c |    8 +-
 libavcodec/arm/fft_vfp.S      |  284 +++++++++++++++++++++++++++++++++++++++--
 2 files changed, 275 insertions(+), 17 deletions(-)

Comments

Martin Storsjö July 13, 2014, 9:10 a.m. | #1
On Fri, 11 Jul 2014, Ben Avison wrote:

> The previous implementation targeted DTS Coherent Acoustics, which only
> requires nbits == 4 (fft16()). This case was (and still is) linked directly
> rather than being indirected through ff_fft_calc_vfp(), but now the full
> range from radix-4 up to radix-65536 is available. This benefits other codecs
> such as AAC and AC3.
>
> The implementation is based upon the C version, with each routine larger than
> radix-16 calling a hierarchy of smaller FFT functions, then performing a
> post-processing pass. This pass benefits a lot from loop unrolling to
> counter the long pipelines in the VFP. A relaxed calling standard also
> reduces the overhead of the call hierarchy, and avoiding the excessive
> inlining performed by GCC probably helps with I-cache utilisation too.
>
> I benchmarked the result by measuring the number of gperftools samples that
> hit anywhere in the AAC decoder (starting from aac_decode_frame()) or
> specifically in the FFT routines (fft4() to fft512() and pass()) for the
> same sample AAC stream:
>
>              Before          After
>              Mean   StdDev   Mean   StdDev  Confidence  Change
> Audio decode  2245.5 53.1     1599.6 43.8    100.0%      +40.4%
> FFT routines  940.6  22.0     348.1  20.8    100.0%      +170.2%
> ---
> libavcodec/arm/fft_init_arm.c |    8 +-
> libavcodec/arm/fft_vfp.S      |  284 +++++++++++++++++++++++++++++++++++++++--
> 2 files changed, 275 insertions(+), 17 deletions(-)

I tried this code on most of our odd configurations, and ran into a bunch 
of issues, most of which I've been able to resolve in one way or another, 
but some parts of it are a bit ugly...

The other three patches don't seem to require any extra tweaks to 
build/run in these configurations though.


> diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
> index 3a3d1a7..bc143c1 100644
> --- a/libavcodec/arm/fft_init_arm.c
> +++ b/libavcodec/arm/fft_init_arm.c
> @@ -23,6 +23,8 @@
> #include "libavcodec/rdft.h"
> #include "libavcodec/synth_filter.h"
>
> +void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
> +
> void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
> void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
>
> @@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s)
> {
>     int cpu_flags = av_get_cpu_flags();
>
> -    if (have_vfp(cpu_flags)) {
> +    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
> +        s->fft_calc     = ff_fft_calc_vfp;
> #if CONFIG_MDCT
> -        if (!have_vfpv3(cpu_flags))
> -            s->imdct_half   = ff_imdct_half_vfp;
> +        s->imdct_half   = ff_imdct_half_vfp;
> #endif
>     }
>
> diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
> index 7845ebb..e7d710b 100644
> --- a/libavcodec/arm/fft_vfp.S
> +++ b/libavcodec/arm/fft_vfp.S
> @@ -21,8 +21,52 @@
>
> #include "libavutil/arm/asm.S"
>
> -@ TODO: * FFTs wider than 16
> -@       * dispatch code
> +@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
> +@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
> +@ all single-precision VFP registers may be corrupted on exit.
> +
> +function ff_fft_calc_vfp, export=1
> +        ldr     ip, [a1, #0]    @ nbits
> +        mov     a1, a2
> +A       ldr     ip, [pc, ip, lsl #2]
> +A       bx      ip
> +A       .word   0
> +A       .word   0
> +A       .word   fft4_vfp
> +A       .word   fft8_vfp
> +A       .word   ff_fft16_vfp    @ this one alone is exported

This reference needs to be wrapped in X()

> +A       .word   fft32_vfp
> +A       .word   fft64_vfp
> +A       .word   fft128_vfp
> +A       .word   fft256_vfp
> +A       .word   fft512_vfp
> +A       .word   fft1024_vfp
> +A       .word   fft2048_vfp
> +A       .word   fft4096_vfp
> +A       .word   fft8192_vfp
> +A       .word   fft16384_vfp
> +A       .word   fft32768_vfp
> +A       .word   fft65536_vfp
> +T       tbh     [pc, ip, lsl #1]
> +T 0:    .short  0
> +T       .short  0
> +T       .short  fft4_vfp - 0b
> +T       .short  fft4_vfp - 0b

You've got one entry duplicated here, and these entries need to be divided 
by 2 (just as they are in mlpdsp_armv5te.S).
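
I.e. something along these lines (untested sketch; tbh counts its
offsets in halfwords, hence the division by two):

T       tbh     [pc, ip, lsl #1]
T 0:    .short  0
T       .short  0
T       .short  (fft4_vfp   - 0b) / 2
T       .short  (fft8_vfp   - 0b) / 2
T       .short  (fft16_vfp  - 0b) / 2
T       @ ... and so on up to (fft65536_vfp - 0b) / 2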

> +T       .short  fft8_vfp - 0b
> +T       .short  fft16_vfp - 0b
> +T       .short  fft32_vfp - 0b
> +T       .short  fft64_vfp - 0b
> +T       .short  fft128_vfp - 0b
> +T       .short  fft256_vfp - 0b
> +T       .short  fft512_vfp - 0b
> +T       .short  fft1024_vfp - 0b
> +T       .short  fft2048_vfp - 0b
> +T       .short  fft4096_vfp - 0b
> +T       .short  fft8192_vfp - 0b
> +T       .short  fft16384_vfp - 0b
> +T       .short  fft32768_vfp - 0b
> +T       .short  fft65536_vfp - 0b
> +endfunc

This jump table (the thumb version, after adding the division by two) 
fails to assemble with Apple's tools (both the current clang-based ones and
the old gcc-based ones). They only treat these as assemble-time constants
as long as there are no non-local labels between the two labels being
subtracted.

To fix building, I renamed the fftX_vfp symbols into .LfftX_vfp, likewise 
for the fftX_internal_vfp and the cosXpiY constants. This also forced me 
to move the ff_fft16_vfp public function to the bottom of the file - so 
that there are no non-local labels between the jump table and all the
functions it points to.

This does seem a bit overly complicated. Would it make sense to just turn 
it into a const table, just like fft_tab_neon in fft_neon.S? I guess it
isn't significantly slower, although it generates more text relocations.
But I think that would be a fair tradeoff here, if it fixes the build
issues on Apple platforms (and probably also avoids the label difference
bug in armasm).
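
Roughly like this, following the shape of ff_fft_calc_neon (untested
sketch; the register choices and the const/endconst details are
approximations):

function ff_fft_calc_vfp, export=1
        ldr     ip, [a1, #0]            @ nbits
        mov     a1, a2
        movrel  a2, fft_tab_vfp
        ldr     ip, [a2, ip, lsl #2]    @ fetch the function pointer
        bx      ip
endfunc

const   fft_tab_vfp
        .word   0
        .word   0
        .word   fft4_vfp
        .word   fft8_vfp
        .word   X(ff_fft16_vfp)
        @ ... and so on up to fft65536_vfp
endconst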

> function fft4_vfp
>         vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
> @@ -131,18 +175,22 @@ endfunc
>              vstr    d9, [a1, #3 * 2*4]
> .endm
>
> +function fft8_internal_vfp
> +        macro_fft8_head
> +        macro_fft8_tail
> +        bx      lr
> +endfunc
> +
> function fft8_vfp
>         ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
>         fmrx    a2, FPSCR
>         fmxr    FPSCR, a3
>         vpush   {s16-s31}
> -
> -        macro_fft8_head
> -        macro_fft8_tail
> -
> +        mov     ip, lr
> +        bl      fft8_internal_vfp
>         vpop    {s16-s31}
>         fmxr    FPSCR, a2
> -        bx      lr
> +        bx      ip
> endfunc
>
> .align 3

Is there any particular reason why this is aligned to 8 bytes instead of 4 
- shouldn't 4 (aka .align 2) be enough for float constants? (Yes, I know 
this isn't added by this patch though.)

The extra alignment here also triggered a bug in MS armasm where the jump
table offsets were miscalculated. (I haven't reduced the bug to a
reportable testcase yet.)

> @@ -153,12 +201,7 @@ cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
> cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
>         .float  0.3826834261417388916015625
>
> -function ff_fft16_vfp, export=1
> -        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
> -        fmrx    a2, FPSCR
> -        fmxr    FPSCR, a3
> -        vpush   {s16-s31}
> -
> +function fft16_internal_vfp
>         macro_fft8_head
>         @ FFT4(z+8)
>         vldr    d10, [a1, #8 * 2*4]
> @@ -292,7 +335,220 @@ function ff_fft16_vfp, export=1
>               vstr    d8, [a1, #0 * 2*4]
>               vstr    d9, [a1, #4 * 2*4]
>
> +        bx      lr
> +endfunc
> +
> +function ff_fft16_vfp, export=1
> +T       b       fft16_vfp
> +T endfunc
> +T
> +T function fft16_vfp
> +        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
> +        fmrx    a2, FPSCR
> +        fmxr    FPSCR, a3
> +        vpush   {s16-s31}
> +        mov     ip, lr
> +        bl      fft16_internal_vfp
>         vpop    {s16-s31}
>         fmxr    FPSCR, a2
> -        bx      lr
> +        bx      ip
> +endfunc
> +
> +.macro pass n, z0, z1, z2, z3
> +        add     v6, v5, #4*2*\n
> +        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
> +            @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
> +                @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
> +                    @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
> +            vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
> +            vldmdb  v6!, {s2}
> +            vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
> +            vldmia  v5!, {s0,s1}                @ s0 is unused
> +        vldr    s7, [\z2, #8*o2]            @ t1
> +            vmul.f  s20, s16, s2                @ vector * scalar
> +        vldr    s0, [\z3, #8*o3]            @ t5
> +        vldr    s6, [\z2, #8*o2+4]          @ t2
> +        vldr    s3, [\z3, #8*o3+4]          @ t6
> +            vmul.f  s16, s16, s1                @ vector * scalar
> +        ldr     a4, =\n-1
> +1:      add     \z0, \z0, #8*2
> + .if \n*4*2 >= 512
> +        add     \z1, \z1, #8*2
> + .endif
> + .if \n*4*2 >= 256
> +        add     \z2, \z2, #8*2
> + .endif
> + .if \n*4*2 >= 512
> +        add     \z3, \z3, #8*2
> + .endif
> +        @ up to 2 stalls (VFP vector issuing / waiting for s0)
> +        @ depending upon whether this is the first iteration and
> +        @ how many add instructions are inserted above
> +        vadd.f  s4, s0, s7                  @ t5
> +        vadd.f  s5, s6, s3                  @ t6
> +        vsub.f  s6, s6, s3                  @ t4
> +        vsub.f  s7, s0, s7                  @ t3
> +        vldr    d6, [\z0, #8*0-8*2]         @ s12,s13
> +            vadd.f  s0, s16, s21                @ t1
> +        vldr    d7, [\z1, #8*o1-8*2]        @ s14,s15
> +            vsub.f  s1, s18, s23                @ t5
> +        vadd.f  s8, s4, s12                 @ vector + vector
> +        @ stall (VFP vector issuing)
> +        @ stall (VFP vector issuing)
> +        @ stall (VFP vector issuing)
> +        vsub.f  s4, s12, s4
> +        vsub.f  s5, s13, s5
> +        vsub.f  s6, s14, s6
> +        vsub.f  s7, s15, s7
> +            vsub.f  s2, s17, s20                @ t2
> +            vadd.f  s3, s19, s22                @ t6
> +        vstr    d4, [\z0, #8*0-8*2]         @ s8,s9
> +        vstr    d5, [\z1, #8*o1-8*2]        @ s10,s11
> +        @ stall (waiting for s5)
> +        vstr    d2, [\z2, #8*o2-8*2]        @ s4,s5
> +            vadd.f  s4, s1, s0                  @ t5
> +        vstr    d3, [\z3, #8*o3-8*2]        @ s6,s7
> +            vsub.f  s7, s1, s0                  @ t3
> +            vadd.f  s5, s2, s3                  @ t6
> +            vsub.f  s6, s2, s3                  @ t4
> +            vldr    d6, [\z0, #8*1-8*2]         @ s12,s13
> +            vldr    d7, [\z1, #8*(o1+1)-8*2]    @ s14,s15
> +                vldr    d4, [\z2, #8*o2]            @ s8,s9
> +                vldmdb  v6!, {s2,s3}
> +                vldr    d5, [\z3, #8*o3]            @ s10,s11
> +            vadd.f  s20, s4, s12                @ vector + vector
> +                vldmia  v5!, {s0,s1}
> +                    vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
> +            @ stall (VFP vector issuing)
> +            vsub.f  s4, s12, s4
> +            vsub.f  s5, s13, s5
> +            vsub.f  s6, s14, s6
> +            vsub.f  s7, s15, s7
> +                vmul.f  s12, s8, s3                 @ vector * scalar
> +            vstr    d10, [\z0, #8*1-8*2]        @ s20,s21
> +                    vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
> +            vstr    d11, [\z1, #8*(o1+1)-8*2]   @ s22,s23
> +                vmul.f  s8, s8, s0                  @ vector * scalar
> +            vstr    d2, [\z2, #8*(o2+1)-8*2]    @ s4,s5
> +            @ stall (waiting for s7)
> +            vstr    d3, [\z3, #8*(o3+1)-8*2]    @ s6,s7
> +                    vmul.f  s20, s16, s2                @ vector * scalar
> +                @ stall (VFP vector issuing)
> +                @ stall (VFP vector issuing)
> +                @ stall (VFP vector issuing)
> +                vadd.f  s7, s8, s13                 @ t1
> +                vsub.f  s6, s9, s12                 @ t2
> +                vsub.f  s0, s10, s15                @ t5
> +                vadd.f  s3, s11, s14                @ t6
> +                    vmul.f  s16, s16, s1                @ vector * scalar
> +        subs    a4, a4, #1
> +        bne     1b
> +        @ What remains is identical to the first two indentations of
> +        @ the above, but without the increment of z
> +        vadd.f  s4, s0, s7                  @ t5
> +        vadd.f  s5, s6, s3                  @ t6
> +        vsub.f  s6, s6, s3                  @ t4
> +        vsub.f  s7, s0, s7                  @ t3
> +        vldr    d6, [\z0, #8*0]             @ s12,s13
> +            vadd.f  s0, s16, s21                @ t1
> +        vldr    d7, [\z1, #8*o1]            @ s14,s15
> +            vsub.f  s1, s18, s23                @ t5
> +        vadd.f  s8, s4, s12                 @ vector + vector
> +        vsub.f  s4, s12, s4
> +        vsub.f  s5, s13, s5
> +        vsub.f  s6, s14, s6
> +        vsub.f  s7, s15, s7
> +            vsub.f  s2, s17, s20                @ t2
> +            vadd.f  s3, s19, s22                @ t6
> +        vstr    d4, [\z0, #8*0]             @ s8,s9
> +        vstr    d5, [\z1, #8*o1]            @ s10,s11
> +        vstr    d2, [\z2, #8*o2]            @ s4,s5
> +            vadd.f  s4, s1, s0                  @ t5
> +        vstr    d3, [\z3, #8*o3]            @ s6,s7
> +            vsub.f  s7, s1, s0                  @ t3
> +            vadd.f  s5, s2, s3                  @ t6
> +            vsub.f  s6, s2, s3                  @ t4
> +            vldr    d6, [\z0, #8*1]             @ s12,s13
> +            vldr    d7, [\z1, #8*(o1+1)]        @ s14,s15
> +            vadd.f  s20, s4, s12                @ vector + vector
> +            vsub.f  s4, s12, s4
> +            vsub.f  s5, s13, s5
> +            vsub.f  s6, s14, s6
> +            vsub.f  s7, s15, s7
> +            vstr    d10, [\z0, #8*1]            @ s20,s21
> +            vstr    d11, [\z1, #8*(o1+1)]       @ s22,s23
> +            vstr    d2, [\z2, #8*(o2+1)]        @ s4,s5
> +            vstr    d3, [\z3, #8*(o3+1)]        @ s6,s7
> +.endm
> +
> +.macro fft_internal_vfp name, name2, name4, costable, n
> +function \name
> + .if \n >= 512
> +        push    {v1-v6,lr}
> + .elseif \n >= 256
> +        push    {v1-v2,v5-v6,lr}
> + .else
> +        push    {v1,v5-v6,lr}
> + .endif
> +        mov     v1, a1
> +        bl      \name2
> +        add     a1, v1, #8*(\n/4)*2
> +        bl      \name4
> +        ldr     v5, =\costable

Should this perhaps use the movrelx macro?

> +        add     a1, v1, #8*(\n/4)*3
> +        bl      \name4
> + .if \n >= 512
> +  .set o1, 0*(\n/4/2)
> +  .set o2, 0*(\n/4/2)
> +  .set o3, 0*(\n/4/2)
> +        add     v2, v1, #8*2*(\n/4/2)
> +        add     v3, v1, #8*4*(\n/4/2)
> +        add     v4, v1, #8*6*(\n/4/2)
> +        pass    (\n/4/2), v1, v2, v3, v4
> +        pop     {v1-v6,pc}
> + .elseif \n >= 256
> +  .set o1, 2*(\n/4/2)
> +  .set o2, 0*(\n/4/2)
> +  .set o3, 2*(\n/4/2)
> +        add     v2, v1, #8*4*(\n/4/2)
> +        pass    (\n/4/2), v1, v1, v2, v2
> +        pop     {v1-v2,v5-v6,pc}
> + .else
> +  .set o1, 2*(\n/4/2)
> +  .set o2, 4*(\n/4/2)
> +  .set o3, 6*(\n/4/2)
> +        pass    (\n/4/2), v1, v1, v1, v1
> +        pop     {v1,v5-v6,pc}
> + .endif
> endfunc
> +.endm
> +
> +#define DECL_FFT(n,n2,n4)               \
> +fft_internal_vfp  fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\

You need to add X() around the ff_cos_N name here

> +                                       ;\
> +function fft##n##_vfp                  ;\
> +        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\
> +        fmrx    a2, FPSCR              ;\
> +        fmxr    FPSCR, a3              ;\
> +        vpush   {s16-s31}              ;\
> +        mov     ip, lr                 ;\
> +        bl      fft##n##_internal_vfp  ;\
> +        vpop    {s16-s31}              ;\
> +        fmxr    FPSCR, a2              ;\
> +        bx      ip                     ;\
> +endfunc                                ;\
> +                                       ;\
> +.ltorg

gas-preprocessor didn't previously support multiple instructions 
concatenated on one line separated with semicolons - I did write a patch 
for doing that now though.

Nevertheless, wouldn't this be more readable written just as a normal gas 
macro instead of as a cpp macro? That's how the corresponding macro is 
done in fft_neon.S.

// Martin
Martin Storsjö July 13, 2014, 6 p.m. | #2
On Sun, 13 Jul 2014, Martin Storsjö wrote:

> On Fri, 11 Jul 2014, Ben Avison wrote:
>
>> .align 3
>
> Is there any particular reason why this is aligned to 8 bytes instead of 4 - 
> shouldn't 4 (aka .align 2) be enough for float constants? (Yes, I know this 
> isn't added by this patch though.)
>
> The extra alignment here also triggered a bug in MS armasm where the jump
> table offsets were miscalculated. (I haven't reduced the bug to a
> reportable testcase yet.)

Actually, changing ".align 3" to ".align 2" doesn't really help much - 
armasm seems to be quite broken when it comes to calculating label 
differences - in this case it can be triggered by any sort of alignment
directive between the two labels.

So this is another argument for going with a plain pointer list (as in
fft_neon.S), as it would solve the issues with Apple's tools as well.

// Martin
Ben Avison July 16, 2014, 3 p.m. | #3
On Sun, 13 Jul 2014 10:10:00 +0100, Martin Storsjö <martin@martin.st> wrote:
> On Fri, 11 Jul 2014, Ben Avison wrote:
> I tried this code on most of our odd configurations, and ran into a
> bunch of issues, most of which I've been able to resolve in one way or
> another, but some parts of it are a bit ugly...

Thanks Martin. It might seem like it sometimes, but I don't deliberately
set out to break gas-preprocessor with every new patch series :)

It's a shame we have to worry about getting this code to assemble
correctly in Thumb mode, as it seems rather unlikely that it will ever
actually be used that way - most CPUs either don't support enough Thumb
instructions for it to assemble, or have NEON that will be used in
preference, or don't support short vector VFP mode, which the code
requires.

Nevertheless, I have made the changes you suggest - probably the same as
the ones you made locally. I'll repost this patch shortly.

>> +A       .word   ff_fft16_vfp    @ this one alone is exported
>
> This reference needs to be wrapped in X()

I've only ever seen X() resolve to a no-op, so I don't get this sort of
thing flagged during my own tests. When does it make any difference? I
was never clear on whether it only applied to labels from *other*
compilation units rather than the current one (made less clear by the
fact that it never seems to be used when a label is defined), or whether
it applied just to code labels, or both code and data. The name "X"
doesn't really give much of a hint!

>> .align 3
>
> Is there any particular reason why this is aligned to 8 bytes instead
> of 4 - shouldn't 4 (aka .align 2) be enough for float constants? (Yes,
> I know this isn't added by this patch though.)

It's because the ARM11's memory bus is actually 64 bits wide. cos1pi4 and
cos1pi8 are loaded in the same vldr instruction; if cos1pi4 is 64-bit
aligned then this takes 1 cycle, otherwise 2 cycles. This multiplies up,
so for something like FFT512, that's quite a few extra cycles.
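
To illustrate (a sketch of the constant block in fft_vfp.S):

.align  3                       @ put cos1pi4 on an 8-byte boundary
cos1pi4:    .float  0.70710678  @ cos(1*pi/4)
cos1pi8:    .float  0.92387953  @ cos(1*pi/8)
        @ ...
        vldr    d0, cos1pi4     @ s0,s1 = cos1pi4,cos1pi8 in a single
                                @ 64-bit bus access on the ARM11, but
                                @ only if the pair is 8-byte aligned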

>> +        ldr     v5, =\costable
>
> Should this perhaps use the movrelx macro?

Um, maybe. I must admit to not really understanding what the movrel and
movrelx macros are for and when to use which. Could I respectfully ask
for some comments in asm.S to explain this to those of us who aren't
familiar with the full range of supported ABIs?

> Nevertheless, wouldn't this be more readable written just as a normal
> gas macro instead of as a cpp macro? That's how the corresponding macro
> is done in fft_neon.S.

I couldn't figure out how to do string concatenations in gas - it's not
my favourite assembly language and I often find string manipulation is
one of its weaknesses. Nevertheless, the trick in fft_neon.S seems to
work, so thanks for the pointer.
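
For the record, the spelling that works (a sketch along fft_neon.S
lines; \() separates a macro argument from the text that follows it,
and the ff_cos_\n reference would still want the X() wrapping discussed
above):

.macro decl_fft n, n2, n4
fft_internal_vfp fft\n\()_internal_vfp, fft\n2\()_internal_vfp, fft\n4\()_internal_vfp, ff_cos_\n, \n
function fft\n\()_vfp
        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
        fmrx    a2, FPSCR
        fmxr    FPSCR, a3
        vpush   {s16-s31}
        mov     ip, lr
        bl      fft\n\()_internal_vfp
        vpop    {s16-s31}
        fmxr    FPSCR, a2
        bx      ip
endfunc
.ltorg
.endm

        decl_fft 32, 16, 8
        decl_fft 64, 32, 16
        @ ... and so on up to decl_fft 65536, 32768, 16384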

Ben
Martin Storsjö July 16, 2014, 4:33 p.m. | #4
Hi Ben,

On Wed, 16 Jul 2014, Ben Avison wrote:

> On Sun, 13 Jul 2014 10:10:00 +0100, Martin Storsjö <martin@martin.st> wrote:
>> On Fri, 11 Jul 2014, Ben Avison wrote:
>> I tried this code on most of our odd configurations, and ran into a
>> bunch of issues, most of which I've been able to resolve in one way or
>> another, but some parts of it is a bit ugly...
>
> Thanks Martin. It might seem like it sometimes, but I don't deliberately
> set out to break gas-preprocessor with every new patch series :)

Well, given that gas-preprocessor originally only supported the (at that 
time) small subset of gas features used by the existing arm and ppc 
assembly, it's quite understandable that new code finds issues - this only 
makes the tool better :-)

> It's a shame we have to worry about getting this code to assemble
> correctly in Thumb mode, as it seems rather unlikely that it will ever
> actually be used that way - most CPUs either don't support enough Thumb
> instructions for it to assemble, or have NEON that will be used in
> preference, or don't support short vector VFP mode, which the code
> requires.
>
> Nevertheless, I have made the changes you suggest - probably the same as
> the ones you made locally. I'll repost this patch shortly.

Thanks! Indeed, it is a bit of extra burden to make sure it works in those 
mostly irrelevant configurations, but it eases the maintenance of the
code if it is as portable as possible.

Btw, I saw that you renamed the local symbols to all use the .L prefix - 
sorry if I was unclear about this. This would only have been necessary if 
we kept the thumb offset table (and even then, we would have had to jump 
through some extra hoops, to make sure there were no non-local labels 
anywhere between the offset table and the functions it pointed to). Since 
this new patch only uses a normal function pointer table, this shouldn't 
be necessary any longer, so in case there's a need for yet another round 
of the patch you can do this whichever way you prefer. Keeping the symbol 
names visible would perhaps be useful for debugging?

If you prefer to do it that way I can also amend that before pushing (no 
need to resend just for that), in case there's nothing else that needs to 
be done.

>>> +A       .word   ff_fft16_vfp    @ this one alone is exported
>> 
>> This reference needs to be wrapped in X()
>
> I've only ever seen X() resolve to a no-op, so I don't get this sort of
> thing flagged during my own tests. When does it make any difference? I
> was never clear on whether it only applied to labels from *other*
> compilation units rather than the current one (made less clear by the
> fact that it never seems to be used when a label is defined), or whether
> it applied just to code labels, or both code and data. The name "X"
> doesn't really give much of a hint!

I think the name X() was chosen simply to be as short as possible. It's
used for mangling of external names - on arm it only matters on 
darwin/iOS, where external symbols are prefixed with an underscore. This 
underscore is automatically added when you declare functions with the 
function macro, but has to be added whenever you reference such symbols 
elsewhere. Given the naming scheme, you would use it when referencing any 
ff_ prefixed labels.
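
For reference, the definition is roughly this (simplified from
libavutil/arm/asm.S; treat the exact spelling as a sketch):

/* EXTERN_ASM is "_" on darwin/iOS and empty on ELF targets */
#define GLUE(a, b)  a ## b
#define JOIN(a, b)  GLUE(a, b)
#define X(s)        JOIN(EXTERN_ASM, s)

So in the jump table, X(ff_fft16_vfp) becomes _ff_fft16_vfp when
assembling for darwin/iOS and plain ff_fft16_vfp elsewhere.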

>>> .align 3
>> 
>> Is there any particular reason why this is aligned to 8 bytes instead
>> of 4 - shouldn't 4 (aka .align 2) be enough for float constants? (Yes,
>> I know this isn't added by this patch though.)
>
> It's because the ARM11's memory bus is actually 64 bits wide. cos1pi4 and
> cos1pi8 are loaded in the same vldr instruction; if cos1pi4 is 64-bit
> aligned then this takes 1 cycle, otherwise 2 cycles. This multiplies up,
> so for something like FFT512, that's quite a few extra cycles.

Ok, thanks for the clarification. This issue should have gone away now 
that you use a table with function pointers instead of manually 
calculating offsets.

>>> +        ldr     v5, =\costable
>> 
>> Should this perhaps use the movrelx macro?
>
> Um, maybe. I must admit to not really understanding what the movrel and
> movrelx macros are for and when to use which. Could I respectfully ask
> for some comments in asm.S to explain this to those of us who aren't
> familiar with the full range of supported ABIs?

Sure. I'm actually not all that familiar with it myself, hopefully Janne 
can fill in the details.

I noticed this since gas-preprocessor for MS armasm didn't handle ldr with 
imported symbols yet - because all the similar cases had been wrapped in 
the movrel/movrelx macros, which end up as movw/movt in the armasm case.

>> Nevertheless, wouldn't this be more readable written just as a normal
>> gas macro instead of as a cpp macro? That's how the corresponding macro
>> is done in fft_neon.S.
>
> I couldn't figure out how to do string concatenations in gas - it's not
> my favourite assembly language and I often find string manipulation is
> one of its weaknesses. Nevertheless, the trick in fft_neon.S seems to
> work, so thanks for the pointer.

Ok - thanks for the resend, I'll retest it in all these configurations 
now.

// Martin
Janne Grunau July 18, 2014, 8:48 a.m. | #5
On 2014-07-16 19:33:59 +0300, Martin Storsjö wrote:
> Hi Ben,
> 
> On Wed, 16 Jul 2014, Ben Avison wrote:
> 
> >>>+        ldr     v5, =\costable
> >>
> >>Should this perhaps use the movrelx macro?
> >
> >Um, maybe. I must admit to not really understanding what the movrel and
> >movrelx macros are for and when to use which. Could I respectfully ask
> >for some comments in asm.S to explain this to those of us who aren't
> >familiar with the full range of supported ABIs?
> 
> Sure. I'm actually not all that familiar with it myself, hopefully
> Janne can fill in the details.
> 
> I noticed this since gas-preprocessor for MS armasm didn't handle
> ldr with imported symbols yet - because all the similar cases had
> been wrapped in the movrel/movrelx macros, that end up as movw/movt
> in the armasm case.

movrel and movrelx are needed for position-independent or PC-relative
loads. External symbols need to be handled differently, so there is
movrelx for loading X(symbol).
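
In usage terms (a sketch):

        movrel  v5, cos1pi4        @ label in the same object file: a
                                   @ plain PC-relative load suffices
        movrelx v5, X(ff_cos_32)   @ external symbol: may need a GOT
                                   @ load or movw/movt depending on the
                                   @ target, plus the X() name mangling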

Janne

Patch

diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c
index 3a3d1a7..bc143c1 100644
--- a/libavcodec/arm/fft_init_arm.c
+++ b/libavcodec/arm/fft_init_arm.c
@@ -23,6 +23,8 @@ 
 #include "libavcodec/rdft.h"
 #include "libavcodec/synth_filter.h"
 
+void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z);
+
 void ff_fft_permute_neon(FFTContext *s, FFTComplex *z);
 void ff_fft_calc_neon(FFTContext *s, FFTComplex *z);
 
@@ -38,10 +40,10 @@  av_cold void ff_fft_init_arm(FFTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
-    if (have_vfp(cpu_flags)) {
+    if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) {
+        s->fft_calc     = ff_fft_calc_vfp;
 #if CONFIG_MDCT
-        if (!have_vfpv3(cpu_flags))
-            s->imdct_half   = ff_imdct_half_vfp;
+        s->imdct_half   = ff_imdct_half_vfp;
 #endif
     }
 
diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S
index 7845ebb..e7d710b 100644
--- a/libavcodec/arm/fft_vfp.S
+++ b/libavcodec/arm/fft_vfp.S
@@ -21,8 +21,52 @@ 
 
 #include "libavutil/arm/asm.S"
 
-@ TODO: * FFTs wider than 16
-@       * dispatch code
+@ The fftx_internal_vfp versions of the functions obey a modified AAPCS:
+@ VFP is in RunFast mode, vector length 4, stride 1 throughout, and
+@ all single-precision VFP registers may be corrupted on exit.
+
+function ff_fft_calc_vfp, export=1
+        ldr     ip, [a1, #0]    @ nbits
+        mov     a1, a2
+A       ldr     ip, [pc, ip, lsl #2]
+A       bx      ip
+A       .word   0
+A       .word   0
+A       .word   fft4_vfp
+A       .word   fft8_vfp
+A       .word   ff_fft16_vfp    @ this one alone is exported
+A       .word   fft32_vfp
+A       .word   fft64_vfp
+A       .word   fft128_vfp
+A       .word   fft256_vfp
+A       .word   fft512_vfp
+A       .word   fft1024_vfp
+A       .word   fft2048_vfp
+A       .word   fft4096_vfp
+A       .word   fft8192_vfp
+A       .word   fft16384_vfp
+A       .word   fft32768_vfp
+A       .word   fft65536_vfp
+T       tbh     [pc, ip, lsl #1]
+T 0:    .short  0
+T       .short  0
+T       .short  fft4_vfp - 0b
+T       .short  fft4_vfp - 0b
+T       .short  fft8_vfp - 0b
+T       .short  fft16_vfp - 0b
+T       .short  fft32_vfp - 0b
+T       .short  fft64_vfp - 0b
+T       .short  fft128_vfp - 0b
+T       .short  fft256_vfp - 0b
+T       .short  fft512_vfp - 0b
+T       .short  fft1024_vfp - 0b
+T       .short  fft2048_vfp - 0b
+T       .short  fft4096_vfp - 0b
+T       .short  fft8192_vfp - 0b
+T       .short  fft16384_vfp - 0b
+T       .short  fft32768_vfp - 0b
+T       .short  fft65536_vfp - 0b
+endfunc
 
 function fft4_vfp
         vldr    d0, [a1, #0*2*4]   @ s0,s1   = z[0]
@@ -131,18 +175,22 @@  endfunc
              vstr    d9, [a1, #3 * 2*4]
 .endm
 
+function fft8_internal_vfp
+        macro_fft8_head
+        macro_fft8_tail
+        bx      lr
+endfunc
+
 function fft8_vfp
         ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
         fmrx    a2, FPSCR
         fmxr    FPSCR, a3
         vpush   {s16-s31}
-
-        macro_fft8_head
-        macro_fft8_tail
-
+        mov     ip, lr
+        bl      fft8_internal_vfp
         vpop    {s16-s31}
         fmxr    FPSCR, a2
-        bx      lr
+        bx      ip
 endfunc
 
 .align 3
@@ -153,12 +201,7 @@  cos1pi8:    @ cos(1*pi/8) = sqrt(2+sqrt(2))/2
cos3pi8:    @ cos(3*pi/8) = sqrt(2-sqrt(2))/2
         .float  0.3826834261417388916015625
 
-function ff_fft16_vfp, export=1
-        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
-        fmrx    a2, FPSCR
-        fmxr    FPSCR, a3
-        vpush   {s16-s31}
-
+function fft16_internal_vfp
         macro_fft8_head
         @ FFT4(z+8)
         vldr    d10, [a1, #8 * 2*4]
@@ -292,7 +335,220 @@  function ff_fft16_vfp, export=1
               vstr    d8, [a1, #0 * 2*4]
               vstr    d9, [a1, #4 * 2*4]
 
+        bx      lr
+endfunc
+
+function ff_fft16_vfp, export=1
+T       b       fft16_vfp
+T endfunc
+T
+T function fft16_vfp
+        ldr     a3, =0x03030000     @ RunFast mode, vector length 4, stride 1
+        fmrx    a2, FPSCR
+        fmxr    FPSCR, a3
+        vpush   {s16-s31}
+        mov     ip, lr
+        bl      fft16_internal_vfp
         vpop    {s16-s31}
         fmxr    FPSCR, a2
-        bx      lr
+        bx      ip
+endfunc
+
+.macro pass n, z0, z1, z2, z3
+        add     v6, v5, #4*2*\n
+        @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3])
+            @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+                @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0])
+                    @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1])
+            vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
+            vldmdb  v6!, {s2}
+            vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
+            vldmia  v5!, {s0,s1}                @ s0 is unused
+        vldr    s7, [\z2, #8*o2]            @ t1
+            vmul.f  s20, s16, s2                @ vector * scalar
+        vldr    s0, [\z3, #8*o3]            @ t5
+        vldr    s6, [\z2, #8*o2+4]          @ t2
+        vldr    s3, [\z3, #8*o3+4]          @ t6
+            vmul.f  s16, s16, s1                @ vector * scalar
+        ldr     a4, =\n-1
+1:      add     \z0, \z0, #8*2
+ .if \n*4*2 >= 512
+        add     \z1, \z1, #8*2
+ .endif
+ .if \n*4*2 >= 256
+        add     \z2, \z2, #8*2
+ .endif
+ .if \n*4*2 >= 512
+        add     \z3, \z3, #8*2
+ .endif
+        @ up to 2 stalls (VFP vector issuing / waiting for s0)
+        @ depending upon whether this is the first iteration and
+        @ how many add instructions are inserted above
+        vadd.f  s4, s0, s7                  @ t5
+        vadd.f  s5, s6, s3                  @ t6
+        vsub.f  s6, s6, s3                  @ t4
+        vsub.f  s7, s0, s7                  @ t3
+        vldr    d6, [\z0, #8*0-8*2]         @ s12,s13
+            vadd.f  s0, s16, s21                @ t1
+        vldr    d7, [\z1, #8*o1-8*2]        @ s14,s15
+            vsub.f  s1, s18, s23                @ t5
+        vadd.f  s8, s4, s12                 @ vector + vector
+        @ stall (VFP vector issuing)
+        @ stall (VFP vector issuing)
+        @ stall (VFP vector issuing)
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+            vsub.f  s2, s17, s20                @ t2
+            vadd.f  s3, s19, s22                @ t6
+        vstr    d4, [\z0, #8*0-8*2]         @ s8,s9
+        vstr    d5, [\z1, #8*o1-8*2]        @ s10,s11
+        @ stall (waiting for s5)
+        vstr    d2, [\z2, #8*o2-8*2]        @ s4,s5
+            vadd.f  s4, s1, s0                  @ t5
+        vstr    d3, [\z3, #8*o3-8*2]        @ s6,s7
+            vsub.f  s7, s1, s0                  @ t3
+            vadd.f  s5, s2, s3                  @ t6
+            vsub.f  s6, s2, s3                  @ t4
+            vldr    d6, [\z0, #8*1-8*2]         @ s12,s13
+            vldr    d7, [\z1, #8*(o1+1)-8*2]    @ s14,s15
+                vldr    d4, [\z2, #8*o2]            @ s8,s9
+                vldmdb  v6!, {s2,s3}
+                vldr    d5, [\z3, #8*o3]            @ s10,s11
+            vadd.f  s20, s4, s12                @ vector + vector
+                vldmia  v5!, {s0,s1}
+                    vldr    d8, [\z2, #8*(o2+1)]        @ s16,s17
+            @ stall (VFP vector issuing)
+            vsub.f  s4, s12, s4
+            vsub.f  s5, s13, s5
+            vsub.f  s6, s14, s6
+            vsub.f  s7, s15, s7
+                vmul.f  s12, s8, s3                 @ vector * scalar
+            vstr    d10, [\z0, #8*1-8*2]        @ s20,s21
+                    vldr    d9, [\z3, #8*(o3+1)]        @ s18,s19
+            vstr    d11, [\z1, #8*(o1+1)-8*2]   @ s22,s23
+                vmul.f  s8, s8, s0                  @ vector * scalar
+            vstr    d2, [\z2, #8*(o2+1)-8*2]    @ s4,s5
+            @ stall (waiting for s7)
+            vstr    d3, [\z3, #8*(o3+1)-8*2]    @ s6,s7
+                    vmul.f  s20, s16, s2                @ vector * scalar
+                @ stall (VFP vector issuing)
+                @ stall (VFP vector issuing)
+                @ stall (VFP vector issuing)
+                vadd.f  s7, s8, s13                 @ t1
+                vsub.f  s6, s9, s12                 @ t2
+                vsub.f  s0, s10, s15                @ t5
+                vadd.f  s3, s11, s14                @ t6
+                    vmul.f  s16, s16, s1                @ vector * scalar
+        subs    a4, a4, #1
+        bne     1b
+        @ What remains is identical to the first two indentations of
+        @ the above, but without the increment of z
+        vadd.f  s4, s0, s7                  @ t5
+        vadd.f  s5, s6, s3                  @ t6
+        vsub.f  s6, s6, s3                  @ t4
+        vsub.f  s7, s0, s7                  @ t3
+        vldr    d6, [\z0, #8*0]             @ s12,s13
+            vadd.f  s0, s16, s21                @ t1
+        vldr    d7, [\z1, #8*o1]            @ s14,s15
+            vsub.f  s1, s18, s23                @ t5
+        vadd.f  s8, s4, s12                 @ vector + vector
+        vsub.f  s4, s12, s4
+        vsub.f  s5, s13, s5
+        vsub.f  s6, s14, s6
+        vsub.f  s7, s15, s7
+            vsub.f  s2, s17, s20                @ t2
+            vadd.f  s3, s19, s22                @ t6
+        vstr    d4, [\z0, #8*0]             @ s8,s9
+        vstr    d5, [\z1, #8*o1]            @ s10,s11
+        vstr    d2, [\z2, #8*o2]            @ s4,s5
+            vadd.f  s4, s1, s0                  @ t5
+        vstr    d3, [\z3, #8*o3]            @ s6,s7
+            vsub.f  s7, s1, s0                  @ t3
+            vadd.f  s5, s2, s3                  @ t6
+            vsub.f  s6, s2, s3                  @ t4
+            vldr    d6, [\z0, #8*1]             @ s12,s13
+            vldr    d7, [\z1, #8*(o1+1)]        @ s14,s15
+            vadd.f  s20, s4, s12                @ vector + vector
+            vsub.f  s4, s12, s4
+            vsub.f  s5, s13, s5
+            vsub.f  s6, s14, s6
+            vsub.f  s7, s15, s7
+            vstr    d10, [\z0, #8*1]            @ s20,s21
+            vstr    d11, [\z1, #8*(o1+1)]       @ s22,s23
+            vstr    d2, [\z2, #8*(o2+1)]        @ s4,s5
+            vstr    d3, [\z3, #8*(o3+1)]        @ s6,s7
+.endm
+
+.macro fft_internal_vfp name, name2, name4, costable, n
+function \name
+ .if \n >= 512
+        push    {v1-v6,lr}
+ .elseif \n >= 256
+        push    {v1-v2,v5-v6,lr}
+ .else
+        push    {v1,v5-v6,lr}
+ .endif
+        mov     v1, a1
+        bl      \name2
+        add     a1, v1, #8*(\n/4)*2
+        bl      \name4
+        ldr     v5, =\costable
+        add     a1, v1, #8*(\n/4)*3
+        bl      \name4
+ .if \n >= 512
+  .set o1, 0*(\n/4/2)
+  .set o2, 0*(\n/4/2)
+  .set o3, 0*(\n/4/2)
+        add     v2, v1, #8*2*(\n/4/2)
+        add     v3, v1, #8*4*(\n/4/2)
+        add     v4, v1, #8*6*(\n/4/2)
+        pass    (\n/4/2), v1, v2, v3, v4
+        pop     {v1-v6,pc}
+ .elseif \n >= 256
+  .set o1, 2*(\n/4/2)
+  .set o2, 0*(\n/4/2)
+  .set o3, 2*(\n/4/2)
+        add     v2, v1, #8*4*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v2, v2
+        pop     {v1-v2,v5-v6,pc}
+ .else
+  .set o1, 2*(\n/4/2)
+  .set o2, 4*(\n/4/2)
+  .set o3, 6*(\n/4/2)
+        pass    (\n/4/2), v1, v1, v1, v1
+        pop     {v1,v5-v6,pc}
+ .endif
 endfunc
+.endm
+
+#define DECL_FFT(n,n2,n4)               \
+fft_internal_vfp  fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\
+                                       ;\
+function fft##n##_vfp                  ;\
+        ldr     a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\
+        fmrx    a2, FPSCR              ;\
+        fmxr    FPSCR, a3              ;\
+        vpush   {s16-s31}              ;\
+        mov     ip, lr                 ;\
+        bl      fft##n##_internal_vfp  ;\
+        vpop    {s16-s31}              ;\
+        fmxr    FPSCR, a2              ;\
+        bx      ip                     ;\
+endfunc                                ;\
+                                       ;\
+.ltorg
+
+DECL_FFT(32,16,8)
+DECL_FFT(64,32,16)
+DECL_FFT(128,64,32)
+DECL_FFT(256,128,64)
+DECL_FFT(512,256,128)
+DECL_FFT(1024,512,256)
+DECL_FFT(2048,1024,512)
+DECL_FFT(4096,2048,1024)
+DECL_FFT(8192,4096,2048)
+DECL_FFT(16384,8192,4096)
+DECL_FFT(32768,16384,8192)
+DECL_FFT(65536,32768,16384)