arm: vp9itxfm: Simplify the arm/thumb stack alignment code

Message ID 1479074502-14743-1-git-send-email-martin@martin.st
State Superseded
Headers show

Commit Message

Martin Storsjö Nov. 13, 2016, 10:01 p.m.
The subtraction directly on sp is one of the operations that are
allowed in thumb mode.
---
 libavcodec/arm/vp9itxfm_neon.S | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

Comments

Janne Grunau Nov. 17, 2016, 9:44 p.m. | #1
On 2016-11-14 00:01:42 +0200, Martin Storsjö wrote:
> The subtraction directly on sp is one of the operations that are
> allowed in thumb mode.
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 6 ++----
>  1 file changed, 2 insertions(+), 4 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index cdb43b5..46fd010 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -796,10 +796,9 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>          @ Align the stack, allocate a temp buffer
>  T       mov             r12, sp
>  T       bic             r12, r12, #15
> -T       sub             r12, r12, #512
>  T       mov             sp,  r12
>  A       bic             sp,  sp,  #15
> -A       sub             sp,  sp,  #512
> +        sub             sp,  sp,  #512

sub r12, sp, #512 works too and saves the first mov

or better

T mov r7, sp
T bic r7, r7, #15
A bic r7, sp, #15
  add r7, r7, #512
  sub sp, r7

and restore the stack with
add sp, r7

Janne
Martin Storsjö Nov. 17, 2016, 10:13 p.m. | #2
On Thu, 17 Nov 2016, Janne Grunau wrote:

> On 2016-11-14 00:01:42 +0200, Martin Storsjö wrote:
>> The subtraction directly on sp is one of the operations that are
>> allowed in thumb mode.
>> ---
>>  libavcodec/arm/vp9itxfm_neon.S | 6 ++----
>>  1 file changed, 2 insertions(+), 4 deletions(-)
>>
>> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
>> index cdb43b5..46fd010 100644
>> --- a/libavcodec/arm/vp9itxfm_neon.S
>> +++ b/libavcodec/arm/vp9itxfm_neon.S
>> @@ -796,10 +796,9 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>>          @ Align the stack, allocate a temp buffer
>>  T       mov             r12, sp
>>  T       bic             r12, r12, #15
>> -T       sub             r12, r12, #512
>>  T       mov             sp,  r12
>>  A       bic             sp,  sp,  #15
>> -A       sub             sp,  sp,  #512
>> +        sub             sp,  sp,  #512
>
> sub r12, sp, #512 works too and saves the first mov

Oh, right

> or better
>
> T mov r7, sp
> T bic r7, r7, #15
> A bic r7, sp, #15

I guess that should be "and r7, r7/sp, #15" then

>  add r7, r7, #512
>  sub sp, r7
>
> and restore the stack with
> add sp, r7

Hmm, that ends up as 4 instructions for thumb and 3 for arm, for aligning 
and subtracting. Isn't that equal to, not better than, what you achieve 
with your suggestion with "sub r12, sp, #512" for thumb?

// Martin
Janne Grunau Nov. 17, 2016, 10:43 p.m. | #3
On 2016-11-18 00:13:51 +0200, Martin Storsjö wrote:
> On Thu, 17 Nov 2016, Janne Grunau wrote:
> 
> >On 2016-11-14 00:01:42 +0200, Martin Storsjö wrote:
> >>The subtraction directly on sp is one of the operations that are
> >>allowed in thumb mode.
> >>---
> >> libavcodec/arm/vp9itxfm_neon.S | 6 ++----
> >> 1 file changed, 2 insertions(+), 4 deletions(-)
> >>
> >>diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> >>index cdb43b5..46fd010 100644
> >>--- a/libavcodec/arm/vp9itxfm_neon.S
> >>+++ b/libavcodec/arm/vp9itxfm_neon.S
> >>@@ -796,10 +796,9 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
> >>         @ Align the stack, allocate a temp buffer
> >> T       mov             r12, sp
> >> T       bic             r12, r12, #15
> >>-T       sub             r12, r12, #512
> >> T       mov             sp,  r12
> >> A       bic             sp,  sp,  #15
> >>-A       sub             sp,  sp,  #512
> >>+        sub             sp,  sp,  #512
> >
> >sub r12, sp, #512 works too and saves the first mov
> 
> Oh, right
> 
> >or better
> >
> >T mov r7, sp
> >T bic r7, r7, #15
> >A bic r7, sp, #15
> 
> I guess that should be "and r7, r7/sp, #15" then

err, yes
> 
> > add r7, r7, #512
> > sub sp, r7
> >
> >and restore the stack with
> >add sp, r7
> 
> Hmm, that ends up as 4 instructions for thumb and 3 for arm, for aligning
> and subtracting. Isn't that equal to, not better than, what you achieve with
> your suggestion with "sub r12, sp, #512" for thumb?

yes, better only in the sense that arm and thumb share more one 
instruction more

Janne

Patch

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index cdb43b5..46fd010 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -796,10 +796,9 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         @ Align the stack, allocate a temp buffer
 T       mov             r12, sp
 T       bic             r12, r12, #15
-T       sub             r12, r12, #512
 T       mov             sp,  r12
 A       bic             sp,  sp,  #15
-A       sub             sp,  sp,  #512
+        sub             sp,  sp,  #512
 
         mov             r4,  r0
         mov             r5,  r1
@@ -1122,10 +1121,9 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
         @ Align the stack, allocate a temp buffer
 T       mov             r12, sp
 T       bic             r12, r12, #15
-T       sub             r12, r12, #2048
 T       mov             sp,  r12
 A       bic             sp,  sp,  #15
-A       sub             sp,  sp,  #2048
+        sub             sp,  sp,  #2048
 
         mov             r4,  r0
         mov             r5,  r1