[6/6] aarch64: vp9mc: Calculate less unused data in the 4 pixel wide horizontal filter

Message ID 1483359476-4641-6-git-send-email-martin@martin.st
State Committed
Commit 388e0d2515bc6bbc9d0c9af1d230bd16cf945fe7
Headers show

Commit Message

Martin Storsjö Jan. 2, 2017, 12:17 p.m.
No measured speedup on an Cortex A53, but other cores might benefit.
---
 libavcodec/aarch64/vp9mc_neon.S | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

Comments

Diego Biurrun Jan. 2, 2017, 2:06 p.m. | #1
On Mon, Jan 02, 2017 at 02:17:56PM +0200, Martin Storsjö wrote:
> No measured speedup on an Cortex A53, but other cores might benefit.

nit: on a Cortex

Diego
Janne Grunau Feb. 10, 2017, 7:56 p.m. | #2
On 2017-01-02 14:17:56 +0200, Martin Storsjö wrote:
> No measured speedup on an Cortex A53, but other cores might benefit.

A little surprised that it didn't make a difference on the Cortex-A53,
since certain sites reported that its NEON unit isn't fully 128 bits wide. So
it is unlikely that it makes a difference on other cores.

> ---
>  libavcodec/aarch64/vp9mc_neon.S | 15 +++++++++++++--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
> index 99f1809..95ed26c 100644
> --- a/libavcodec/aarch64/vp9mc_neon.S
> +++ b/libavcodec/aarch64/vp9mc_neon.S
> @@ -202,9 +202,12 @@ endfunc
>          ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
>          mla             \dst2\().8h, v21.8h, v0.h[\offset]
>          mla             \dst4\().8h, v23.8h, v0.h[\offset]
> -.else
> +.elseif \size == 8
>          mla             \dst1\().8h, v20.8h, v0.h[\offset]
>          mla             \dst3\().8h, v22.8h, v0.h[\offset]
> +.else
> +        mla             \dst1\().4h, v20.4h, v0.h[\offset]
> +        mla             \dst3\().4h, v22.4h, v0.h[\offset]
>  .endif
>  .endm
>  // The same as above, but don't accumulate straight into the
> @@ -219,16 +222,24 @@ endfunc
>          ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
>          mul             v21.8h, v21.8h, v0.h[\offset]
>          mul             v23.8h, v23.8h, v0.h[\offset]
> -.else
> +.elseif \size == 8
>          mul             v20.8h, v20.8h, v0.h[\offset]
>          mul             v22.8h, v22.8h, v0.h[\offset]
> +.else
> +        mul             v20.4h, v20.4h, v0.h[\offset]
> +        mul             v22.4h, v22.4h, v0.h[\offset]
>  .endif
> +.if \size == 4
> +        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
> +        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
> +.else
>          sqadd           \dst1\().8h, \dst1\().8h, v20.8h
>          sqadd           \dst3\().8h, \dst3\().8h, v22.8h
>  .if \size >= 16
>          sqadd           \dst2\().8h, \dst2\().8h, v21.8h
>          sqadd           \dst4\().8h, \dst4\().8h, v23.8h
>  .endif
> +.endif
>  .endm

patch ok

Janne

Patch

diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
index 99f1809..95ed26c 100644
--- a/libavcodec/aarch64/vp9mc_neon.S
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -202,9 +202,12 @@  endfunc
         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
         mla             \dst2\().8h, v21.8h, v0.h[\offset]
         mla             \dst4\().8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
         mla             \dst1\().8h, v20.8h, v0.h[\offset]
         mla             \dst3\().8h, v22.8h, v0.h[\offset]
+.else
+        mla             \dst1\().4h, v20.4h, v0.h[\offset]
+        mla             \dst3\().4h, v22.4h, v0.h[\offset]
 .endif
 .endm
 // The same as above, but don't accumulate straight into the
@@ -219,16 +222,24 @@  endfunc
         ext             v23.16b, \src5\().16b, \src6\().16b, #(2*\offset)
         mul             v21.8h, v21.8h, v0.h[\offset]
         mul             v23.8h, v23.8h, v0.h[\offset]
-.else
+.elseif \size == 8
         mul             v20.8h, v20.8h, v0.h[\offset]
         mul             v22.8h, v22.8h, v0.h[\offset]
+.else
+        mul             v20.4h, v20.4h, v0.h[\offset]
+        mul             v22.4h, v22.4h, v0.h[\offset]
 .endif
+.if \size == 4
+        sqadd           \dst1\().4h, \dst1\().4h, v20.4h
+        sqadd           \dst3\().4h, \dst3\().4h, v22.4h
+.else
         sqadd           \dst1\().8h, \dst1\().8h, v20.8h
         sqadd           \dst3\().8h, \dst3\().8h, v22.8h
 .if \size >= 16
         sqadd           \dst2\().8h, \dst2\().8h, v21.8h
         sqadd           \dst4\().8h, \dst4\().8h, v23.8h
 .endif
+.endif
 .endm