[PATCHv2] aarch64: vp9itxfm: Don't repeatedly set x9 when nothing overwrites it

Message ID: 1479938975-3562-1-git-send-email-martin@martin.st
State: Committed
Commit: 2f99117f6ff24ce5be2abb9e014cb8b86c2aa0e0
Headers show

Commit Message

Martin Storsjö Nov. 23, 2016, 10:09 p.m.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 26 +++++++++++++++-----------
 1 file changed, 15 insertions(+), 11 deletions(-)

Comments

Janne Grunau Nov. 24, 2016, 6:59 a.m. | #1
On 2016-11-24 00:09:35 +0200, Martin Storsjö wrote:
> ---
>  libavcodec/aarch64/vp9itxfm_neon.S | 26 +++++++++++++++-----------
>  1 file changed, 15 insertions(+), 11 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
> index 2dc6b75..f4194a6 100644
> --- a/libavcodec/aarch64/vp9itxfm_neon.S
> +++ b/libavcodec/aarch64/vp9itxfm_neon.S
> @@ -599,9 +599,9 @@ endfunc
>  // x1 = unused
>  // x2 = src
>  // x3 = slice offset
> +// x9 = input stride
>  .macro itxfm16_1d_funcs txfm
>  function \txfm\()16_1d_8x16_pass1_neon
> -        mov             x9, #32
>          movi            v2.8h, #0
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          load_clear      \i,  x2,  x9
> @@ -649,8 +649,8 @@ endfunc
>  // x1 = dst stride
>  // x2 = src (temp buffer)
>  // x3 = slice offset
> +// x9 = temp buffer stride
>  function \txfm\()16_1d_8x16_pass2_neon
> -        mov             x9, #32
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23
>          load            \i,  x2,  x9
>  .endr
> @@ -747,6 +747,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>  .ifc \txfm1,idct
>          ld1             {v0.8h,v1.8h}, [x10]
>  .endif
> +        mov             x9, #32
>  
>  .irp i, 0, 8
>          add             x0,  sp,  #(\i*32)
> @@ -882,13 +883,12 @@ endfunc
>  // x0 = dst (temp buffer)
>  // x1 = unused
>  // x2 = src
> +// x9 = double input stride
>  // x10 = idct_coeffs
>  // x11 = idct_coeffs + 32
>  function idct32_1d_8x32_pass1_neon
>          ld1             {v0.8h,v1.8h}, [x10]
>  
> -        // Double stride of the input, since we only read every other line
> -        mov             x9,  #128
>          movi            v4.8h, #0
>  
>          // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
> @@ -987,12 +987,13 @@ endfunc
>  // x0 = dst
>  // x1 = dst stride
>  // x2 = src (temp buffer)
> +// x7 = negative double temp buffer stride
> +// x9 = double temp buffer stride
>  // x10 = idct_coeffs
>  // x11 = idct_coeffs + 32
>  function idct32_1d_8x32_pass2_neon
>          ld1             {v0.8h,v1.8h}, [x10]
>  
> -        mov             x9, #128
>          // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          ld1             {v\i\().8h}, [x2], x9
> @@ -1001,7 +1002,6 @@ function idct32_1d_8x32_pass2_neon
>  
>          idct16
>  
> -        mov             x9,  #128
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          st1             {v\i\().8h}, [x2], x9
>  .endr
> @@ -1018,11 +1018,10 @@ function idct32_1d_8x32_pass2_neon
>  
>          idct32_odd
>  
> -        mov             x9,  #128
>  .macro load_acc_store a, b, c, d, neg=0
> +.if \neg == 0
>          ld1             {v4.8h},  [x2], x9
>          ld1             {v5.8h},  [x2], x9
> -.if \neg == 0
>          add             v4.8h, v4.8h, v\a\().8h
>          ld1             {v6.8h},  [x2], x9
>          add             v5.8h, v5.8h, v\b\().8h
> @@ -1030,10 +1029,12 @@ function idct32_1d_8x32_pass2_neon
>          add             v6.8h, v6.8h, v\c\().8h
>          add             v7.8h, v7.8h, v\d\().8h
>  .else
> +        ld1             {v4.8h},  [x2], x7
> +        ld1             {v5.8h},  [x2], x7
>          sub             v4.8h, v4.8h, v\a\().8h
> -        ld1             {v6.8h},  [x2], x9
> +        ld1             {v6.8h},  [x2], x7
>          sub             v5.8h, v5.8h, v\b\().8h
> -        ld1             {v7.8h},  [x2], x9
> +        ld1             {v7.8h},  [x2], x7
>          sub             v6.8h, v6.8h, v\c\().8h
>          sub             v7.8h, v7.8h, v\d\().8h
>  .endif
> @@ -1064,7 +1065,6 @@ function idct32_1d_8x32_pass2_neon
>          load_acc_store  23, 22, 21, 20
>          load_acc_store  19, 18, 17, 16
>          sub             x2,  x2,  x9
> -        neg             x9,  x9
>          load_acc_store  16, 17, 18, 19, 1
>          load_acc_store  20, 21, 22, 23, 1
>          load_acc_store  24, 25, 26, 27, 1
> @@ -1093,6 +1093,10 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>          mov             x5,  x1
>          mov             x6,  x2
>  
> +        // Double stride of the input, since we only read every other line
> +        mov             x9,  #128
> +        neg             x7,  x9
> +
>  .irp i, 0, 8, 16, 24
>          add             x0,  sp,  #(\i*64)
>          add             x2,  x6,  #(\i*2)

ok

Janne

Patch

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 2dc6b75..f4194a6 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -599,9 +599,9 @@  endfunc
 // x1 = unused
 // x2 = src
 // x3 = slice offset
+// x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
-        mov             x9, #32
         movi            v2.8h, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         load_clear      \i,  x2,  x9
@@ -649,8 +649,8 @@  endfunc
 // x1 = dst stride
 // x2 = src (temp buffer)
 // x3 = slice offset
+// x9 = temp buffer stride
 function \txfm\()16_1d_8x16_pass2_neon
-        mov             x9, #32
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         load            \i,  x2,  x9
 .endr
@@ -747,6 +747,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
         ld1             {v0.8h,v1.8h}, [x10]
 .endif
+        mov             x9, #32
 
 .irp i, 0, 8
         add             x0,  sp,  #(\i*32)
@@ -882,13 +883,12 @@  endfunc
 // x0 = dst (temp buffer)
 // x1 = unused
 // x2 = src
+// x9 = double input stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass1_neon
         ld1             {v0.8h,v1.8h}, [x10]
 
-        // Double stride of the input, since we only read every other line
-        mov             x9,  #128
         movi            v4.8h, #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
@@ -987,12 +987,13 @@  endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
+// x7 = negative double temp buffer stride
+// x9 = double temp buffer stride
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
 function idct32_1d_8x32_pass2_neon
         ld1             {v0.8h,v1.8h}, [x10]
 
-        mov             x9, #128
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2], x9
@@ -1001,7 +1002,6 @@  function idct32_1d_8x32_pass2_neon
 
         idct16
 
-        mov             x9,  #128
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().8h}, [x2], x9
 .endr
@@ -1018,11 +1018,10 @@  function idct32_1d_8x32_pass2_neon
 
         idct32_odd
 
-        mov             x9,  #128
 .macro load_acc_store a, b, c, d, neg=0
+.if \neg == 0
         ld1             {v4.8h},  [x2], x9
         ld1             {v5.8h},  [x2], x9
-.if \neg == 0
         add             v4.8h, v4.8h, v\a\().8h
         ld1             {v6.8h},  [x2], x9
         add             v5.8h, v5.8h, v\b\().8h
@@ -1030,10 +1029,12 @@  function idct32_1d_8x32_pass2_neon
         add             v6.8h, v6.8h, v\c\().8h
         add             v7.8h, v7.8h, v\d\().8h
 .else
+        ld1             {v4.8h},  [x2], x7
+        ld1             {v5.8h},  [x2], x7
         sub             v4.8h, v4.8h, v\a\().8h
-        ld1             {v6.8h},  [x2], x9
+        ld1             {v6.8h},  [x2], x7
         sub             v5.8h, v5.8h, v\b\().8h
-        ld1             {v7.8h},  [x2], x9
+        ld1             {v7.8h},  [x2], x7
         sub             v6.8h, v6.8h, v\c\().8h
         sub             v7.8h, v7.8h, v\d\().8h
 .endif
@@ -1064,7 +1065,6 @@  function idct32_1d_8x32_pass2_neon
         load_acc_store  23, 22, 21, 20
         load_acc_store  19, 18, 17, 16
         sub             x2,  x2,  x9
-        neg             x9,  x9
         load_acc_store  16, 17, 18, 19, 1
         load_acc_store  20, 21, 22, 23, 1
         load_acc_store  24, 25, 26, 27, 1
@@ -1093,6 +1093,10 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
         mov             x5,  x1
         mov             x6,  x2
 
+        // Double stride of the input, since we only read every other line
+        mov             x9,  #128
+        neg             x7,  x9
+
 .irp i, 0, 8, 16, 24
         add             x0,  sp,  #(\i*64)
         add             x2,  x6,  #(\i*2)