Message ID | 1405074838-23873-1-git-send-email-bavison@riscosopen.org |
---|---|

State | New |

Headers | show |

On Fri, 11 Jul 2014, Ben Avison wrote: > The previous implementation targeted DTS Coherent Acoustics, which only > requires nbits == 4 (fft16()). This case was (and still is) linked directly > rather than being indirected through ff_fft_calc_vfp(), but now the full > range from radix-4 up to radix-65536 is available. This benefits other codecs > such as AAC and AC3. > > The implementation is based upon the C version, with each routine larger than > radix-16 calling a hierarchy of smaller FFT functions, then performing a > post-processing pass. This pass benefits a lot from loop unrolling to > counter the long pipelines in the VFP. A relaxed calling standard also > reduces the overhead of the call hierarchy, and avoiding the excessive > inlining performed by GCC probably helps with I-cache utilisation too. > > I benchmarked the result by measuring the number of gperftools samples that > hit anywhere in the AAC decoder (starting from aac_decode_frame()) or > specifically in the FFT routines (fft4() to fft512() and pass()) for the > same sample AAC stream: > > Before After > Mean StdDev Mean StdDev Confidence Change > Audio decode 2245.5 53.1 1599.6 43.8 100.0% +40.4% > FFT routines 940.6 22.0 348.1 20.8 100.0% +170.2% > --- > libavcodec/arm/fft_init_arm.c | 8 +- > libavcodec/arm/fft_vfp.S | 284 +++++++++++++++++++++++++++++++++++++++-- > 2 files changed, 275 insertions(+), 17 deletions(-) I tried this code on most of our odd configurations, and ran into a bunch of issues, most of which I've been able to resolve in one way or another, but some parts of it is a bit ugly... The other three patches don't seem to require any extra tweaks to build/run in these configurations though. 
> diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c > index 3a3d1a7..bc143c1 100644 > --- a/libavcodec/arm/fft_init_arm.c > +++ b/libavcodec/arm/fft_init_arm.c > @@ -23,6 +23,8 @@ > #include "libavcodec/rdft.h" > #include "libavcodec/synth_filter.h" > > +void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); > + > void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); > void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); > > @@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s) > { > int cpu_flags = av_get_cpu_flags(); > > - if (have_vfp(cpu_flags)) { > + if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) { > + s->fft_calc = ff_fft_calc_vfp; > #if CONFIG_MDCT > - if (!have_vfpv3(cpu_flags)) > - s->imdct_half = ff_imdct_half_vfp; > + s->imdct_half = ff_imdct_half_vfp; > #endif > } > > diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S > index 7845ebb..e7d710b 100644 > --- a/libavcodec/arm/fft_vfp.S > +++ b/libavcodec/arm/fft_vfp.S > @@ -21,8 +21,52 @@ > > #include "libavutil/arm/asm.S" > > -@ TODO: * FFTs wider than 16 > -@ * dispatch code > +@ The fftx_internal_vfp versions of the functions obey a modified AAPCS: > +@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and > +@ all single-precision VFP registers may be corrupted on exit. 
> + > +function ff_fft_calc_vfp, export=1 > + ldr ip, [a1, #0] @ nbits > + mov a1, a2 > +A ldr ip, [pc, ip, lsl #2] > +A bx ip > +A .word 0 > +A .word 0 > +A .word fft4_vfp > +A .word fft8_vfp > +A .word ff_fft16_vfp @ this one alone is exported This reference needs to be wrapped in X() > +A .word fft32_vfp > +A .word fft64_vfp > +A .word fft128_vfp > +A .word fft256_vfp > +A .word fft512_vfp > +A .word fft1024_vfp > +A .word fft2048_vfp > +A .word fft4096_vfp > +A .word fft8192_vfp > +A .word fft16384_vfp > +A .word fft32768_vfp > +A .word fft65536_vfp > +T tbh [pc, ip, lsl #1] > +T 0: .short 0 > +T .short 0 > +T .short fft4_vfp - 0b > +T .short fft4_vfp - 0b You've got one entry duplicated here, and these entries need to be divided by 2 (just as they are in mlpdsp_armv5te.S). > +T .short fft8_vfp - 0b > +T .short fft16_vfp - 0b > +T .short fft32_vfp - 0b > +T .short fft64_vfp - 0b > +T .short fft128_vfp - 0b > +T .short fft256_vfp - 0b > +T .short fft512_vfp - 0b > +T .short fft1024_vfp - 0b > +T .short fft2048_vfp - 0b > +T .short fft4096_vfp - 0b > +T .short fft8192_vfp - 0b > +T .short fft16384_vfp - 0b > +T .short fft32768_vfp - 0b > +T .short fft65536_vfp - 0b > +endfunc This jump table (the thumb version, after adding the division by two) fails to assemble with apple tools (both the current clang based ones, and the old gcc based ones). It only considers these assemble-time constants as long as there's no non-local labels between the two labels being subtracted. To fix building, I renamed the fftX_vfp symbols into .LfftX_vfp, likewise for the fftX_internal_vfp and the cosXpiY constants. This also forced me to move the ff_fft16_vfp public function to the bottom of the file - so that there's no non-local labels between the jump table and all the functions it points to. This does seem a bit overly complicated. Would it make sense to just turn it into a const table, just as fft_tab_neon in fft_neon.S? 
I guess it isn't significantly slower, although it generates more text relocations. But I think that would be a fair tradeoff here, if it fixes the build issues on apple platforms (and probably also avoids the label difference bug in armasm). > function fft4_vfp > vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] > @@ -131,18 +175,22 @@ endfunc > vstr d9, [a1, #3 * 2*4] > .endm > > +function fft8_internal_vfp > + macro_fft8_head > + macro_fft8_tail > + bx lr > +endfunc > + > function fft8_vfp > ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 > fmrx a2, FPSCR > fmxr FPSCR, a3 > vpush {s16-s31} > - > - macro_fft8_head > - macro_fft8_tail > - > + mov ip, lr > + bl fft8_internal_vfp > vpop {s16-s31} > fmxr FPSCR, a2 > - bx lr > + bx ip > endfunc > > .align 3 Is there any particular reason why this is aligned to 8 bytes instead of 4 - shouldn't 4 (aka .align 2) be enough for float constants? (Yes, I know this isn't added by this patch though.) The extra alignment here triggered a bug in MS armasm where the jump table offsets were miscalculated, which seemed to be triggered by this extra alignment here. (I haven't reduced the bug to a reportable testcase yet.) 
> @@ -153,12 +201,7 @@ cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 > cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 > .float 0.3826834261417388916015625 > > -function ff_fft16_vfp, export=1 > - ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 > - fmrx a2, FPSCR > - fmxr FPSCR, a3 > - vpush {s16-s31} > - > +function fft16_internal_vfp > macro_fft8_head > @ FFT4(z+8) > vldr d10, [a1, #8 * 2*4] > @@ -292,7 +335,220 @@ function ff_fft16_vfp, export=1 > vstr d8, [a1, #0 * 2*4] > vstr d9, [a1, #4 * 2*4] > > + bx lr > +endfunc > + > +function ff_fft16_vfp, export=1 > +T b fft16_vfp > +T endfunc > +T > +T function fft16_vfp > + ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 > + fmrx a2, FPSCR > + fmxr FPSCR, a3 > + vpush {s16-s31} > + mov ip, lr > + bl fft16_internal_vfp > vpop {s16-s31} > fmxr FPSCR, a2 > - bx lr > + bx ip > +endfunc > + > +.macro pass n, z0, z1, z2, z3 > + add v6, v5, #4*2*\n > + @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) > + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) > + @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) > + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) > + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 > + vldmdb v6!, {s2} > + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 > + vldmia v5!, {s0,s1} @ s0 is unused > + vldr s7, [\z2, #8*o2] @ t1 > + vmul.f s20, s16, s2 @ vector * scalar > + vldr s0, [\z3, #8*o3] @ t5 > + vldr s6, [\z2, #8*o2+4] @ t2 > + vldr s3, [\z3, #8*o3+4] @ t6 > + vmul.f s16, s16, s1 @ vector * scalar > + ldr a4, =\n-1 > +1: add \z0, \z0, #8*2 > + .if \n*4*2 >= 512 > + add \z1, \z1, #8*2 > + .endif > + .if \n*4*2 >= 256 > + add \z2, \z2, #8*2 > + .endif > + .if \n*4*2 >= 512 > + add \z3, \z3, #8*2 > + .endif > + @ up to 2 stalls (VFP vector issuing / waiting for s0) > + @ depending upon whether this is the first iteration and > + @ how many add instructions are inserted above > + vadd.f s4, s0, s7 @ t5 > + vadd.f s5, s6, s3 @ t6 > + vsub.f s6, s6, s3 @ t4 > + vsub.f s7, s0, s7 @ t3 > + vldr d6, 
[\z0, #8*0-8*2] @ s12,s13 > + vadd.f s0, s16, s21 @ t1 > + vldr d7, [\z1, #8*o1-8*2] @ s14,s15 > + vsub.f s1, s18, s23 @ t5 > + vadd.f s8, s4, s12 @ vector + vector > + @ stall (VFP vector issuing) > + @ stall (VFP vector issuing) > + @ stall (VFP vector issuing) > + vsub.f s4, s12, s4 > + vsub.f s5, s13, s5 > + vsub.f s6, s14, s6 > + vsub.f s7, s15, s7 > + vsub.f s2, s17, s20 @ t2 > + vadd.f s3, s19, s22 @ t6 > + vstr d4, [\z0, #8*0-8*2] @ s8,s9 > + vstr d5, [\z1, #8*o1-8*2] @ s10,s11 > + @ stall (waiting for s5) > + vstr d2, [\z2, #8*o2-8*2] @ s4,s5 > + vadd.f s4, s1, s0 @ t5 > + vstr d3, [\z3, #8*o3-8*2] @ s6,s7 > + vsub.f s7, s1, s0 @ t3 > + vadd.f s5, s2, s3 @ t6 > + vsub.f s6, s2, s3 @ t4 > + vldr d6, [\z0, #8*1-8*2] @ s12,s13 > + vldr d7, [\z1, #8*(o1+1)-8*2] @ s14,s15 > + vldr d4, [\z2, #8*o2] @ s8,s9 > + vldmdb v6!, {s2,s3} > + vldr d5, [\z3, #8*o3] @ s10,s11 > + vadd.f s20, s4, s12 @ vector + vector > + vldmia v5!, {s0,s1} > + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 > + @ stall (VFP vector issuing) > + vsub.f s4, s12, s4 > + vsub.f s5, s13, s5 > + vsub.f s6, s14, s6 > + vsub.f s7, s15, s7 > + vmul.f s12, s8, s3 @ vector * scalar > + vstr d10, [\z0, #8*1-8*2] @ s20,s21 > + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 > + vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 > + vmul.f s8, s8, s0 @ vector * scalar > + vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 > + @ stall (waiting for s7) > + vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 > + vmul.f s20, s16, s2 @ vector * scalar > + @ stall (VFP vector issuing) > + @ stall (VFP vector issuing) > + @ stall (VFP vector issuing) > + vadd.f s7, s8, s13 @ t1 > + vsub.f s6, s9, s12 @ t2 > + vsub.f s0, s10, s15 @ t5 > + vadd.f s3, s11, s14 @ t6 > + vmul.f s16, s16, s1 @ vector * scalar > + subs a4, a4, #1 > + bne 1b > + @ What remains is identical to the first two indentations of > + @ the above, but without the increment of z > + vadd.f s4, s0, s7 @ t5 > + vadd.f s5, s6, s3 @ t6 > + vsub.f s6, s6, s3 @ t4 > + vsub.f s7, s0, s7 @ t3 > + vldr d6, [\z0, 
#8*0] @ s12,s13 > + vadd.f s0, s16, s21 @ t1 > + vldr d7, [\z1, #8*o1] @ s14,s15 > + vsub.f s1, s18, s23 @ t5 > + vadd.f s8, s4, s12 @ vector + vector > + vsub.f s4, s12, s4 > + vsub.f s5, s13, s5 > + vsub.f s6, s14, s6 > + vsub.f s7, s15, s7 > + vsub.f s2, s17, s20 @ t2 > + vadd.f s3, s19, s22 @ t6 > + vstr d4, [\z0, #8*0] @ s8,s9 > + vstr d5, [\z1, #8*o1] @ s10,s11 > + vstr d2, [\z2, #8*o2] @ s4,s5 > + vadd.f s4, s1, s0 @ t5 > + vstr d3, [\z3, #8*o3] @ s6,s7 > + vsub.f s7, s1, s0 @ t3 > + vadd.f s5, s2, s3 @ t6 > + vsub.f s6, s2, s3 @ t4 > + vldr d6, [\z0, #8*1] @ s12,s13 > + vldr d7, [\z1, #8*(o1+1)] @ s14,s15 > + vadd.f s20, s4, s12 @ vector + vector > + vsub.f s4, s12, s4 > + vsub.f s5, s13, s5 > + vsub.f s6, s14, s6 > + vsub.f s7, s15, s7 > + vstr d10, [\z0, #8*1] @ s20,s21 > + vstr d11, [\z1, #8*(o1+1)] @ s22,s23 > + vstr d2, [\z2, #8*(o2+1)] @ s4,s5 > + vstr d3, [\z3, #8*(o3+1)] @ s6,s7 > +.endm > + > +.macro fft_internal_vfp name, name2, name4, costable, n > +function \name > + .if \n >= 512 > + push {v1-v6,lr} > + .elseif \n >= 256 > + push {v1-v2,v5-v6,lr} > + .else > + push {v1,v5-v6,lr} > + .endif > + mov v1, a1 > + bl \name2 > + add a1, v1, #8*(\n/4)*2 > + bl \name4 > + ldr v5, =\costable Should this perhaps use the movrelx macro? 
> + add a1, v1, #8*(\n/4)*3 > + bl \name4 > + .if \n >= 512 > + .set o1, 0*(\n/4/2) > + .set o2, 0*(\n/4/2) > + .set o3, 0*(\n/4/2) > + add v2, v1, #8*2*(\n/4/2) > + add v3, v1, #8*4*(\n/4/2) > + add v4, v1, #8*6*(\n/4/2) > + pass (\n/4/2), v1, v2, v3, v4 > + pop {v1-v6,pc} > + .elseif \n >= 256 > + .set o1, 2*(\n/4/2) > + .set o2, 0*(\n/4/2) > + .set o3, 2*(\n/4/2) > + add v2, v1, #8*4*(\n/4/2) > + pass (\n/4/2), v1, v1, v2, v2 > + pop {v1-v2,v5-v6,pc} > + .else > + .set o1, 2*(\n/4/2) > + .set o2, 4*(\n/4/2) > + .set o3, 6*(\n/4/2) > + pass (\n/4/2), v1, v1, v1, v1 > + pop {v1,v5-v6,pc} > + .endif > endfunc > +.endm > + > +#define DECL_FFT(n,n2,n4) \ > +fft_internal_vfp fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\ You need to add X() around the ff_cos_N name here > + ;\ > +function fft##n##_vfp ;\ > + ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\ > + fmrx a2, FPSCR ;\ > + fmxr FPSCR, a3 ;\ > + vpush {s16-s31} ;\ > + mov ip, lr ;\ > + bl fft##n##_internal_vfp ;\ > + vpop {s16-s31} ;\ > + fmxr FPSCR, a2 ;\ > + bx ip ;\ > +endfunc ;\ > + ;\ > +.ltorg gas-preprocessor didn't previously support multiple instructions concatenated on one line separated with semicolons - I did write a patch for doing that now though. Nevertheless, wouldn't this be more readable written just as a normal gas macro instead of as a cpp macro? That's how the corresponding macro is done in fft_neon.S. // Martin

On Sun, 13 Jul 2014, Martin Storsjö wrote: > On Fri, 11 Jul 2014, Ben Avison wrote: > >> .align 3 > > Is there any particular reason why this is aligned to 8 bytes instead of 4 - > shouldn't 4 (aka .align 2) be enough for float constants? (Yes, I know this > isn't added by this patch though.) > > The extra alignment here triggered a bug in MS armasm where the jump table > offsets were miscalculated, which seemed to be triggered by this extra > alignment here. (I haven't reduced the bug to a reportable testcase yet.) Actually, changing ".align 3" to ".align 2" doesn't really help much - armasm seems to be quite broken when it comes to calculating label differences - in this case it can be triggered when there's any sort of alignment directives between the two labels. So this is another argument for going with a plain pointer list (as in fft_neon.S), as it would solve issues with apple's tools as well. // Martin

On Sun, 13 Jul 2014 10:10:00 +0100, Martin Storsjö <martin@martin.st> wrote: > On Fri, 11 Jul 2014, Ben Avison wrote: > I tried this code on most of our odd configurations, and ran into a > bunch of issues, most of which I've been able to resolve in one way or > another, but some parts of it is a bit ugly... Thanks Martin. It might seem like it sometimes, but I don't deliberately set out to break gas-preprocessor with every new patch series :) It's a shame we have to worry about getting this code to assemble correctly in Thumb mode, as it seems rather unlikely that it will ever actually be used that way - most CPUs either don't support enough Thumb instructions for it to assemble, or have NEON that will be used in preference, or don't support short vector VFP mode, which the code requires. Nevertheless, I have made the changes you suggest - probably the same as the ones you made locally. I'll repost this patch shortly. >> +A .word ff_fft16_vfp @ this one alone is exported > > This reference needs to be wrapped in X() I've only ever seen X() resolve to a no-op, so I don't get this sort of thing flagged during my own tests. When does it make any difference? I was never clear on whether it only applied to labels from *other* compilation units rather than the current one (made less clear by the fact that it never seems to be used when a label is defined), or whether it applied just to code labels, or both code and data. The name "X" doesn't really give much of a hint! >> .align 3 > > Is there any particular reason why this is aligned to 8 bytes instead > of 4 - shouldn't 4 (aka .align 2) be enough for float constants? (Yes, > I know this isn't added by this patch though.) It's because the ARM11's memory bus is actually 64 bits wide. cos1pi4 and cos1pi8 are loaded in the same vldr instruction; if cos1pi4 is 64-bit aligned then this takes 1 cycle, otherwise 2 cycles. This multiplies up, so for something like FFT512, that's quite a few extra cycles. 
>> + ldr v5, =\costable > > Should this perhaps use the movrelx macro? Um, maybe. I must admit to not really understanding what the movrel and movrelx macros are for and when to use which. Could I respectfully ask for some comments in asm.S to explain this to those of us who aren't familiar with the full range of supported ABIs? > Nevertheless, wouldn't this be more readable written just as a normal > gas macro instead of as a cpp macro? That's how the corresponding macro > is done in fft_neon.S. I couldn't figure out how to do string concatenations in gas - it's not my favourite assembly language and I often find string manipulation is one of its weaknesses. Nevertheless, the trick in fft_neon.S seems to work, so thanks for the pointer. Ben

Hi Ben, On Wed, 16 Jul 2014, Ben Avison wrote: > On Sun, 13 Jul 2014 10:10:00 +0100, Martin Storsjö <martin@martin.st> wrote: >> On Fri, 11 Jul 2014, Ben Avison wrote: >> I tried this code on most of our odd configurations, and ran into a >> bunch of issues, most of which I've been able to resolve in one way or >> another, but some parts of it is a bit ugly... > > Thanks Martin. It might seem like it sometimes, but I don't deliberately > set out to break gas-preprocessor with every new patch series :) Well, given that gas-preprocessor originally only supported the (at that time) small subset of gas features used by the existing arm and ppc assembly, it's quite understandable that new code finds issues - this only makes the tool better :-) > It's a shame we have to worry about getting this code to assemble > correctly in Thumb mode, as it seems rather unlikely that it will ever > actually be used that way - most CPUs either don't support enough Thumb > instructions for it to assemble, or have NEON that will be used in > preference, or don't support short vector VFP mode, which the code > requires. > > Nevertheless, I have made the changes you suggest - probably the same as > the ones you made locally. I'll repost this patch shortly. Thanks! Indeed, it is a bit of extra burden to make sure it works in those mostly irrelevant configurations, but it eases the maintenance of the code if it is as portable as possible. Btw, I saw that you renamed the local symbols to all use the .L prefix - sorry if I was unclear about this. This would only have been necessary if we kept the thumb offset table (and even then, we would have had to jump through some extra hoops, to make sure there were no non-local labels anywhere between the offset table and the functions it pointed to). 
Since this new patch only uses a normal function pointer table, this shouldn't be necessary any longer, so in case there's a need for yet another round of the patch you can do this whichever way you prefer. Keeping the symbol names visible would perhaps be useful for debugging? If you prefer to do it that way I can also amend that before pushing (no need to resend just for that), in case there's nothing else that needs to be done. >>> +A .word ff_fft16_vfp @ this one alone is exported >> >> This reference needs to be wrapped in X() > > I've only ever seen X() resolve to a no-op, so I don't get this sort of > thing flagged during my own tests. When does it make any difference? I > was never clear on whether it only applied to labels from *other* > compilation units rather than the current one (made less clear by the > fact that it never seems to be used when a label is defined), or whether > it applied just to code labels, or both code and data. The name "X" > doesn't really give much of a hint! I think the name X() was chosen just as a macro as short as possible. It's used for mangling of external names - on arm it only matters on darwin/iOS, where external symbols are prefixed with an underscore. This underscore is automatically added when you declare functions with the function macro, but has to be added whenever you reference such symbols elsewhere. Given the naming scheme, you would use it when referencing any ff_ prefixed labels. >>> .align 3 >> >> Is there any particular reason why this is aligned to 8 bytes instead >> of 4 - shouldn't 4 (aka .align 2) be enough for float constants? (Yes, >> I know this isn't added by this patch though.) > > It's because the ARM11's memory bus is actually 64 bits wide. cos1pi4 and > cos1pi8 are loaded in the same vldr instruction; if cos1pi4 is 64-bit > aligned then this takes 1 cycle, otherwise 2 cycles. This multiplies up, > so for something like FFT512, that's quite a few extra cycles. Ok, thanks for the clarification. 
This issue should have gone away now that you use a table with function pointers instead of manually calculating offsets. >>> + ldr v5, =\costable >> >> Should this perhaps use the movrelx macro? > > Um, maybe. I must admit to not really understanding what the movrel and > movrelx macros are for and when to use which. Could I respectfully ask > for some comments in asm.S to explain this to those of us who aren't > familiar with the full range of supported ABIs? Sure. I'm actually not all that familiar with it myself, hopefully Janne can fill in the details. I noticed this since gas-preprocessor for MS armasm didn't handle ldr with imported symbols yet - because all the similar cases had been wrapped in the movrel/movrelx macros, that end up as movw/movt in the armasm case. >> Nevertheless, wouldn't this be more readable written just as a normal >> gas macro instead of as a cpp macro? That's how the corresponding macro >> is done in fft_neon.S. > > I couldn't figure out how to do string concatenations in gas - it's not > my favourite assembly language and I often find string manipulation is > one of its weaknesses. Nevertheless, the trick in fft_neon.S seems to > work, so thanks for the pointer. Ok - thanks for the resend, I'll retest it in all these configurations now. // Martin

On 2014-07-16 19:33:59 +0300, Martin Storsjö wrote: > Hi Ben, > > On Wed, 16 Jul 2014, Ben Avison wrote: > > >>>+ ldr v5, =\costable > >> > >>Should this perhaps use the movrelx macro? > > > >Um, maybe. I must admit to not really understanding what the movrel and > >movrelx macros are for and when to use which. Could I respectfully ask > >for some comments in asm.S to explain this to those of us who aren't > >familiar with the full range of supported ABIs? > > Sure. I'm actually not all that familiar with it myself, hopefully > Janne can fill in the details. > > I noticed this since gas-preprocessor for MS armasm didn't handle > ldr with imported symbols yet - because all the similar cases had > been wrapped in the movrel/movrelx macros, that end up as movw/movt > in the armasm case. movrel and movrelx are needed for position independent or PC relative loads. External symbols need to be handled differently so there is movrelx for loading X(symbol). Janne

diff --git a/libavcodec/arm/fft_init_arm.c b/libavcodec/arm/fft_init_arm.c index 3a3d1a7..bc143c1 100644 --- a/libavcodec/arm/fft_init_arm.c +++ b/libavcodec/arm/fft_init_arm.c @@ -23,6 +23,8 @@ #include "libavcodec/rdft.h" #include "libavcodec/synth_filter.h" +void ff_fft_calc_vfp(FFTContext *s, FFTComplex *z); + void ff_fft_permute_neon(FFTContext *s, FFTComplex *z); void ff_fft_calc_neon(FFTContext *s, FFTComplex *z); @@ -38,10 +40,10 @@ av_cold void ff_fft_init_arm(FFTContext *s) { int cpu_flags = av_get_cpu_flags(); - if (have_vfp(cpu_flags)) { + if (have_vfp(cpu_flags) && !have_vfpv3(cpu_flags)) { + s->fft_calc = ff_fft_calc_vfp; #if CONFIG_MDCT - if (!have_vfpv3(cpu_flags)) - s->imdct_half = ff_imdct_half_vfp; + s->imdct_half = ff_imdct_half_vfp; #endif } diff --git a/libavcodec/arm/fft_vfp.S b/libavcodec/arm/fft_vfp.S index 7845ebb..e7d710b 100644 --- a/libavcodec/arm/fft_vfp.S +++ b/libavcodec/arm/fft_vfp.S @@ -21,8 +21,52 @@ #include "libavutil/arm/asm.S" -@ TODO: * FFTs wider than 16 -@ * dispatch code +@ The fftx_internal_vfp versions of the functions obey a modified AAPCS: +@ VFP is in RunFast mode, vector length 4, stride 1 thoroughout, and +@ all single-precision VFP registers may be corrupted on exit. 
+ +function ff_fft_calc_vfp, export=1 + ldr ip, [a1, #0] @ nbits + mov a1, a2 +A ldr ip, [pc, ip, lsl #2] +A bx ip +A .word 0 +A .word 0 +A .word fft4_vfp +A .word fft8_vfp +A .word ff_fft16_vfp @ this one alone is exported +A .word fft32_vfp +A .word fft64_vfp +A .word fft128_vfp +A .word fft256_vfp +A .word fft512_vfp +A .word fft1024_vfp +A .word fft2048_vfp +A .word fft4096_vfp +A .word fft8192_vfp +A .word fft16384_vfp +A .word fft32768_vfp +A .word fft65536_vfp +T tbh [pc, ip, lsl #1] +T 0: .short 0 +T .short 0 +T .short fft4_vfp - 0b +T .short fft4_vfp - 0b +T .short fft8_vfp - 0b +T .short fft16_vfp - 0b +T .short fft32_vfp - 0b +T .short fft64_vfp - 0b +T .short fft128_vfp - 0b +T .short fft256_vfp - 0b +T .short fft512_vfp - 0b +T .short fft1024_vfp - 0b +T .short fft2048_vfp - 0b +T .short fft4096_vfp - 0b +T .short fft8192_vfp - 0b +T .short fft16384_vfp - 0b +T .short fft32768_vfp - 0b +T .short fft65536_vfp - 0b +endfunc function fft4_vfp vldr d0, [a1, #0*2*4] @ s0,s1 = z[0] @@ -131,18 +175,22 @@ endfunc vstr d9, [a1, #3 * 2*4] .endm +function fft8_internal_vfp + macro_fft8_head + macro_fft8_tail + bx lr +endfunc + function fft8_vfp ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 fmrx a2, FPSCR fmxr FPSCR, a3 vpush {s16-s31} - - macro_fft8_head - macro_fft8_tail - + mov ip, lr + bl fft8_internal_vfp vpop {s16-s31} fmxr FPSCR, a2 - bx lr + bx ip endfunc .align 3 @@ -153,12 +201,7 @@ cos1pi8: @ cos(1*pi/8) = sqrt(2+sqrt(2))/2 cos3pi8: @ cos(2*pi/8) = sqrt(2-sqrt(2))/2 .float 0.3826834261417388916015625 -function ff_fft16_vfp, export=1 - ldr a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 - fmrx a2, FPSCR - fmxr FPSCR, a3 - vpush {s16-s31} - +function fft16_internal_vfp macro_fft8_head @ FFT4(z+8) vldr d10, [a1, #8 * 2*4] @@ -292,7 +335,220 @@ function ff_fft16_vfp, export=1 vstr d8, [a1, #0 * 2*4] vstr d9, [a1, #4 * 2*4] + bx lr +endfunc + +function ff_fft16_vfp, export=1 +T b fft16_vfp +T endfunc +T +T function fft16_vfp + ldr 
a3, =0x03030000 @ RunFast mode, vector length 4, stride 1 + fmrx a2, FPSCR + fmxr FPSCR, a3 + vpush {s16-s31} + mov ip, lr + bl fft16_internal_vfp vpop {s16-s31} fmxr FPSCR, a2 - bx lr + bx ip +endfunc + +.macro pass n, z0, z1, z2, z3 + add v6, v5, #4*2*\n + @ TRANSFORM_ZERO(z[0],z[o1],z[o2],z[o3]) + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) + @ TRANSFORM(z[0],z[o1],z[o2],z[o3],wre[0],wim[0]) + @ TRANSFORM(z[1],z[o1+1],z[o2+1],z[o3+1],wre[1],wim[-1]) + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 + vldmdb v6!, {s2} + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 + vldmia v5!, {s0,s1} @ s0 is unused + vldr s7, [\z2, #8*o2] @ t1 + vmul.f s20, s16, s2 @ vector * scalar + vldr s0, [\z3, #8*o3] @ t5 + vldr s6, [\z2, #8*o2+4] @ t2 + vldr s3, [\z3, #8*o3+4] @ t6 + vmul.f s16, s16, s1 @ vector * scalar + ldr a4, =\n-1 +1: add \z0, \z0, #8*2 + .if \n*4*2 >= 512 + add \z1, \z1, #8*2 + .endif + .if \n*4*2 >= 256 + add \z2, \z2, #8*2 + .endif + .if \n*4*2 >= 512 + add \z3, \z3, #8*2 + .endif + @ up to 2 stalls (VFP vector issuing / waiting for s0) + @ depending upon whether this is the first iteration and + @ how many add instructions are inserted above + vadd.f s4, s0, s7 @ t5 + vadd.f s5, s6, s3 @ t6 + vsub.f s6, s6, s3 @ t4 + vsub.f s7, s0, s7 @ t3 + vldr d6, [\z0, #8*0-8*2] @ s12,s13 + vadd.f s0, s16, s21 @ t1 + vldr d7, [\z1, #8*o1-8*2] @ s14,s15 + vsub.f s1, s18, s23 @ t5 + vadd.f s8, s4, s12 @ vector + vector + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vsub.f s2, s17, s20 @ t2 + vadd.f s3, s19, s22 @ t6 + vstr d4, [\z0, #8*0-8*2] @ s8,s9 + vstr d5, [\z1, #8*o1-8*2] @ s10,s11 + @ stall (waiting for s5) + vstr d2, [\z2, #8*o2-8*2] @ s4,s5 + vadd.f s4, s1, s0 @ t5 + vstr d3, [\z3, #8*o3-8*2] @ s6,s7 + vsub.f s7, s1, s0 @ t3 + vadd.f s5, s2, s3 @ t6 + vsub.f s6, s2, s3 @ t4 + vldr d6, [\z0, #8*1-8*2] @ s12,s13 + vldr d7, [\z1, #8*(o1+1)-8*2] @ 
s14,s15 + vldr d4, [\z2, #8*o2] @ s8,s9 + vldmdb v6!, {s2,s3} + vldr d5, [\z3, #8*o3] @ s10,s11 + vadd.f s20, s4, s12 @ vector + vector + vldmia v5!, {s0,s1} + vldr d8, [\z2, #8*(o2+1)] @ s16,s17 + @ stall (VFP vector issuing) + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vmul.f s12, s8, s3 @ vector * scalar + vstr d10, [\z0, #8*1-8*2] @ s20,s21 + vldr d9, [\z3, #8*(o3+1)] @ s18,s19 + vstr d11, [\z1, #8*(o1+1)-8*2] @ s22,s23 + vmul.f s8, s8, s0 @ vector * scalar + vstr d2, [\z2, #8*(o2+1)-8*2] @ s4,s5 + @ stall (waiting for s7) + vstr d3, [\z3, #8*(o3+1)-8*2] @ s6,s7 + vmul.f s20, s16, s2 @ vector * scalar + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + @ stall (VFP vector issuing) + vadd.f s7, s8, s13 @ t1 + vsub.f s6, s9, s12 @ t2 + vsub.f s0, s10, s15 @ t5 + vadd.f s3, s11, s14 @ t6 + vmul.f s16, s16, s1 @ vector * scalar + subs a4, a4, #1 + bne 1b + @ What remains is identical to the first two indentations of + @ the above, but without the increment of z + vadd.f s4, s0, s7 @ t5 + vadd.f s5, s6, s3 @ t6 + vsub.f s6, s6, s3 @ t4 + vsub.f s7, s0, s7 @ t3 + vldr d6, [\z0, #8*0] @ s12,s13 + vadd.f s0, s16, s21 @ t1 + vldr d7, [\z1, #8*o1] @ s14,s15 + vsub.f s1, s18, s23 @ t5 + vadd.f s8, s4, s12 @ vector + vector + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vsub.f s2, s17, s20 @ t2 + vadd.f s3, s19, s22 @ t6 + vstr d4, [\z0, #8*0] @ s8,s9 + vstr d5, [\z1, #8*o1] @ s10,s11 + vstr d2, [\z2, #8*o2] @ s4,s5 + vadd.f s4, s1, s0 @ t5 + vstr d3, [\z3, #8*o3] @ s6,s7 + vsub.f s7, s1, s0 @ t3 + vadd.f s5, s2, s3 @ t6 + vsub.f s6, s2, s3 @ t4 + vldr d6, [\z0, #8*1] @ s12,s13 + vldr d7, [\z1, #8*(o1+1)] @ s14,s15 + vadd.f s20, s4, s12 @ vector + vector + vsub.f s4, s12, s4 + vsub.f s5, s13, s5 + vsub.f s6, s14, s6 + vsub.f s7, s15, s7 + vstr d10, [\z0, #8*1] @ s20,s21 + vstr d11, [\z1, #8*(o1+1)] @ s22,s23 + vstr d2, [\z2, #8*(o2+1)] @ s4,s5 + vstr d3, [\z3, #8*(o3+1)] @ s6,s7 
+.endm + +.macro fft_internal_vfp name, name2, name4, costable, n +function \name + .if \n >= 512 + push {v1-v6,lr} + .elseif \n >= 256 + push {v1-v2,v5-v6,lr} + .else + push {v1,v5-v6,lr} + .endif + mov v1, a1 + bl \name2 + add a1, v1, #8*(\n/4)*2 + bl \name4 + ldr v5, =\costable + add a1, v1, #8*(\n/4)*3 + bl \name4 + .if \n >= 512 + .set o1, 0*(\n/4/2) + .set o2, 0*(\n/4/2) + .set o3, 0*(\n/4/2) + add v2, v1, #8*2*(\n/4/2) + add v3, v1, #8*4*(\n/4/2) + add v4, v1, #8*6*(\n/4/2) + pass (\n/4/2), v1, v2, v3, v4 + pop {v1-v6,pc} + .elseif \n >= 256 + .set o1, 2*(\n/4/2) + .set o2, 0*(\n/4/2) + .set o3, 2*(\n/4/2) + add v2, v1, #8*4*(\n/4/2) + pass (\n/4/2), v1, v1, v2, v2 + pop {v1-v2,v5-v6,pc} + .else + .set o1, 2*(\n/4/2) + .set o2, 4*(\n/4/2) + .set o3, 6*(\n/4/2) + pass (\n/4/2), v1, v1, v1, v1 + pop {v1,v5-v6,pc} + .endif endfunc +.endm + +#define DECL_FFT(n,n2,n4) \ +fft_internal_vfp fft##n##_internal_vfp, fft##n2##_internal_vfp, fft##n4##_internal_vfp, ff_cos_##n, n ;\ + ;\ +function fft##n##_vfp ;\ + ldr a3, =0x03030000 /* RunFast mode, vector length 4, stride 1 */ ;\ + fmrx a2, FPSCR ;\ + fmxr FPSCR, a3 ;\ + vpush {s16-s31} ;\ + mov ip, lr ;\ + bl fft##n##_internal_vfp ;\ + vpop {s16-s31} ;\ + fmxr FPSCR, a2 ;\ + bx ip ;\ +endfunc ;\ + ;\ +.ltorg + +DECL_FFT(32,16,8) +DECL_FFT(64,32,16) +DECL_FFT(128,64,32) +DECL_FFT(256,128,64) +DECL_FFT(512,256,128) +DECL_FFT(1024,512,256) +DECL_FFT(2048,1024,512) +DECL_FFT(4096,2048,1024) +DECL_FFT(8192,4096,2048) +DECL_FFT(16384,8192,4096) +DECL_FFT(32768,16384,8192) +DECL_FFT(65536,32768,16384)