[4/5] aarch64: vp9itxfm: Restructure the idct32 store macros

Message ID 1480584422-24237-5-git-send-email-martin@martin.st
State Committed
Headers show

Commit Message

Martin Storsjö Dec. 1, 2016, 9:27 a.m.
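This avoids concatenating macro arguments into register names (e.g.
v\a\().8h), which can't be used if the whole macro is wrapped within
another macro; the full register operand (e.g. v16.8h) is passed instead.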
---
 libavcodec/aarch64/vp9itxfm_neon.S | 80 +++++++++++++++++++-------------------
 1 file changed, 40 insertions(+), 40 deletions(-)

Comments

Janne Grunau Feb. 4, 2017, 4:48 p.m. | #1
On 2016-12-01 11:27:00 +0200, Martin Storsjö wrote:
> This avoids concatenation, which can't be used if the whole macro
> is wrapped within another macro.
> ---
>  libavcodec/aarch64/vp9itxfm_neon.S | 80 +++++++++++++++++++-------------------
>  1 file changed, 40 insertions(+), 40 deletions(-)

ok, I think it's also more readable.

Janne

Patch

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index 5a080a4..be9643e 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -942,23 +942,23 @@  function idct32_1d_8x32_pass1_neon
 .macro store_rev a, b
         // There's no rev128 instruction, but we reverse each 64 bit
         // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v1.8h, v\b\().8h
-        st1             {v\a\().8h},  [x0], #16
-        rev64           v0.8h, v\a\().8h
+        rev64           v1.8h, \b
+        st1             {\a},  [x0], #16
+        rev64           v0.8h, \a
         ext             v1.16b, v1.16b, v1.16b, #8
-        st1             {v\b\().8h},  [x0], #16
+        st1             {\b},  [x0], #16
         ext             v0.16b, v0.16b, v0.16b, #8
         st1             {v1.8h},  [x0], #16
         st1             {v0.8h},  [x0], #16
 .endm
-        store_rev       16, 24
-        store_rev       17, 25
-        store_rev       18, 26
-        store_rev       19, 27
-        store_rev       20, 28
-        store_rev       21, 29
-        store_rev       22, 30
-        store_rev       23, 31
+        store_rev       v16.8h, v24.8h
+        store_rev       v17.8h, v25.8h
+        store_rev       v18.8h, v26.8h
+        store_rev       v19.8h, v27.8h
+        store_rev       v20.8h, v28.8h
+        store_rev       v21.8h, v29.8h
+        store_rev       v22.8h, v30.8h
+        store_rev       v23.8h, v31.8h
         sub             x0,  x0,  #512
 .purgem store_rev
 
@@ -984,14 +984,14 @@  function idct32_1d_8x32_pass1_neon
         // subtracted from the output.
 .macro store_rev a, b
         ld1             {v4.8h},  [x0]
-        rev64           v1.8h, v\b\().8h
-        add             v4.8h, v4.8h, v\a\().8h
-        rev64           v0.8h, v\a\().8h
+        rev64           v1.8h, \b
+        add             v4.8h, v4.8h, \a
+        rev64           v0.8h, \a
         st1             {v4.8h},  [x0], #16
         ext             v1.16b, v1.16b, v1.16b, #8
         ld1             {v5.8h},  [x0]
         ext             v0.16b, v0.16b, v0.16b, #8
-        add             v5.8h, v5.8h, v\b\().8h
+        add             v5.8h, v5.8h, \b
         st1             {v5.8h},  [x0], #16
         ld1             {v6.8h},  [x0]
         sub             v6.8h, v6.8h, v1.8h
@@ -1001,14 +1001,14 @@  function idct32_1d_8x32_pass1_neon
         st1             {v7.8h},  [x0], #16
 .endm
 
-        store_rev       31, 23
-        store_rev       30, 22
-        store_rev       29, 21
-        store_rev       28, 20
-        store_rev       27, 19
-        store_rev       26, 18
-        store_rev       25, 17
-        store_rev       24, 16
+        store_rev       v31.8h, v23.8h
+        store_rev       v30.8h, v22.8h
+        store_rev       v29.8h, v21.8h
+        store_rev       v28.8h, v20.8h
+        store_rev       v27.8h, v19.8h
+        store_rev       v26.8h, v18.8h
+        store_rev       v25.8h, v17.8h
+        store_rev       v24.8h, v16.8h
 .purgem store_rev
         br              x14
 endfunc
@@ -1055,21 +1055,21 @@  function idct32_1d_8x32_pass2_neon
 .if \neg == 0
         ld1             {v4.8h},  [x2], x9
         ld1             {v5.8h},  [x2], x9
-        add             v4.8h, v4.8h, v\a\().8h
+        add             v4.8h, v4.8h, \a
         ld1             {v6.8h},  [x2], x9
-        add             v5.8h, v5.8h, v\b\().8h
+        add             v5.8h, v5.8h, \b
         ld1             {v7.8h},  [x2], x9
-        add             v6.8h, v6.8h, v\c\().8h
-        add             v7.8h, v7.8h, v\d\().8h
+        add             v6.8h, v6.8h, \c
+        add             v7.8h, v7.8h, \d
 .else
         ld1             {v4.8h},  [x2], x7
         ld1             {v5.8h},  [x2], x7
-        sub             v4.8h, v4.8h, v\a\().8h
+        sub             v4.8h, v4.8h, \a
         ld1             {v6.8h},  [x2], x7
-        sub             v5.8h, v5.8h, v\b\().8h
+        sub             v5.8h, v5.8h, \b
         ld1             {v7.8h},  [x2], x7
-        sub             v6.8h, v6.8h, v\c\().8h
-        sub             v7.8h, v7.8h, v\d\().8h
+        sub             v6.8h, v6.8h, \c
+        sub             v7.8h, v7.8h, \d
 .endif
         ld1             {v0.8b}, [x0], x1
         ld1             {v1.8b}, [x0], x1
@@ -1093,15 +1093,15 @@  function idct32_1d_8x32_pass2_neon
         st1             {v6.8b}, [x0], x1
         st1             {v7.8b}, [x0], x1
 .endm
-        load_acc_store  31, 30, 29, 28
-        load_acc_store  27, 26, 25, 24
-        load_acc_store  23, 22, 21, 20
-        load_acc_store  19, 18, 17, 16
+        load_acc_store  v31.8h, v30.8h, v29.8h, v28.8h
+        load_acc_store  v27.8h, v26.8h, v25.8h, v24.8h
+        load_acc_store  v23.8h, v22.8h, v21.8h, v20.8h
+        load_acc_store  v19.8h, v18.8h, v17.8h, v16.8h
         sub             x2,  x2,  x9
-        load_acc_store  16, 17, 18, 19, 1
-        load_acc_store  20, 21, 22, 23, 1
-        load_acc_store  24, 25, 26, 27, 1
-        load_acc_store  28, 29, 30, 31, 1
+        load_acc_store  v16.8h, v17.8h, v18.8h, v19.8h, 1
+        load_acc_store  v20.8h, v21.8h, v22.8h, v23.8h, 1
+        load_acc_store  v24.8h, v25.8h, v26.8h, v27.8h, 1
+        load_acc_store  v28.8h, v29.8h, v30.8h, v31.8h, 1
 .purgem load_acc_store
         br              x14
 endfunc