arm: vp9itxfm: Simplify the stack alignment code

Message ID 1479454935-18170-1-git-send-email-martin@martin.st
State Committed
Headers show

Commit Message

Martin Storsjö Nov. 18, 2016, 7:42 a.m.
From: Janne Grunau <janne-libav@jannau.net>

This is one instruction fewer for Thumb, and leaves only one ARM-specific
and two Thumb-specific instructions in each stack alignment sequence.
---
 libavcodec/arm/vp9itxfm_neon.S | 28 ++++++++++++----------------
 1 file changed, 12 insertions(+), 16 deletions(-)

Comments

Janne Grunau Nov. 18, 2016, 7:14 p.m. | #1
On 2016-11-18 09:42:15 +0200, Martin Storsjö wrote:
> From: Janne Grunau <janne-libav@jannau.net>
> 
> This is one instruction fewer for Thumb, and leaves only one ARM-specific
> and two Thumb-specific instructions in each stack alignment sequence.
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 28 ++++++++++++----------------
>  1 file changed, 12 insertions(+), 16 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index cdb43b5..46d91b7 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -791,15 +791,13 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpush           {q4-q7}
>  .endif
> -        mov             r7,  sp
>  
>          @ Align the stack, allocate a temp buffer
> -T       mov             r12, sp
> -T       bic             r12, r12, #15
> -T       sub             r12, r12, #512
> -T       mov             sp,  r12
> -A       bic             sp,  sp,  #15
> -A       sub             sp,  sp,  #512
> +T       mov             r7,  sp
> +T       and             r7,  r7,  #15
> +A       and             r7,  sp,  #15
> +        add             r7,  r7,  #512
> +        sub             sp,  sp,  r7
>  
>          mov             r4,  r0
>          mov             r5,  r1
> @@ -828,7 +826,7 @@ A       sub             sp,  sp,  #512
>          bl              \txfm2\()16_1d_4x16_pass2_neon
>  .endr
>  
> -        mov             sp,  r7
> +        add             sp,  r7
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpop            {q4-q7}
>  .endif
> @@ -1117,15 +1115,13 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>          beq             idct32x32_dc_add_neon
>          push            {r4-r7,lr}
>          vpush           {q4-q7}
> -        mov             r7,  sp
>  
>          @ Align the stack, allocate a temp buffer
> -T       mov             r12, sp
> -T       bic             r12, r12, #15
> -T       sub             r12, r12, #2048
> -T       mov             sp,  r12
> -A       bic             sp,  sp,  #15
> -A       sub             sp,  sp,  #2048
> +T       mov             r7,  sp
> +T       and             r7,  r7,  #15
> +A       and             r7,  sp,  #15
> +        add             r7,  r7,  #2048
> +        sub             sp,  sp,  r7
>  
>          mov             r4,  r0
>          mov             r5,  r1
> @@ -1143,7 +1139,7 @@ A       sub             sp,  sp,  #2048
>          bl              idct32_1d_4x32_pass2_neon
>  .endr
>  
> -        mov             sp,  r7
> +        add             sp,  r7
>          vpop            {q4-q7}
>          pop             {r4-r7,pc}
>  endfunc

ok

Janne

Patch

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index cdb43b5..46d91b7 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -791,15 +791,13 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpush           {q4-q7}
 .endif
-        mov             r7,  sp
 
         @ Align the stack, allocate a temp buffer
-T       mov             r12, sp
-T       bic             r12, r12, #15
-T       sub             r12, r12, #512
-T       mov             sp,  r12
-A       bic             sp,  sp,  #15
-A       sub             sp,  sp,  #512
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #512
+        sub             sp,  sp,  r7
 
         mov             r4,  r0
         mov             r5,  r1
@@ -828,7 +826,7 @@  A       sub             sp,  sp,  #512
         bl              \txfm2\()16_1d_4x16_pass2_neon
 .endr
 
-        mov             sp,  r7
+        add             sp,  r7
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpop            {q4-q7}
 .endif
@@ -1117,15 +1115,13 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
         beq             idct32x32_dc_add_neon
         push            {r4-r7,lr}
         vpush           {q4-q7}
-        mov             r7,  sp
 
         @ Align the stack, allocate a temp buffer
-T       mov             r12, sp
-T       bic             r12, r12, #15
-T       sub             r12, r12, #2048
-T       mov             sp,  r12
-A       bic             sp,  sp,  #15
-A       sub             sp,  sp,  #2048
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #2048
+        sub             sp,  sp,  r7
 
         mov             r4,  r0
         mov             r5,  r1
@@ -1143,7 +1139,7 @@  A       sub             sp,  sp,  #2048
         bl              idct32_1d_4x32_pass2_neon
 .endr
 
-        mov             sp,  r7
+        add             sp,  r7
         vpop            {q4-q7}
         pop             {r4-r7,pc}
 endfunc