[5/6] aarch64: vp9lpf: Interleave the start of flat8in into the calculation above

Message ID 1484513752-7472-5-git-send-email-martin@martin.st
State Committed
Headers show

Commit Message

Martin Storsjö Jan. 15, 2017, 8:55 p.m.
---
 libavcodec/aarch64/vp9lpf_neon.S | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

Comments

Janne Grunau Feb. 10, 2017, 8:44 p.m. | #1
On 2017-01-15 22:55:51 +0200, Martin Storsjö wrote:
> ---
>  libavcodec/aarch64/vp9lpf_neon.S | 16 +++++++++++++---
>  1 file changed, 13 insertions(+), 3 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
> index 4553173..3894307 100644
> --- a/libavcodec/aarch64/vp9lpf_neon.S
> +++ b/libavcodec/aarch64/vp9lpf_neon.S
> @@ -316,20 +316,30 @@
>  
>          uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
>          uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
> +.if \wd >= 8
> +        mov             x5,  v6.d[0]
> +.endif
>          saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
>          ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
> +.if \wd >= 8
> +.ifc \sz, .16b
> +        mov             x6,  v6.d[1]
> +.endif
> +.endif

Is it helpful to have this mov here? It would look a little less ugly if
you merged this .if with the one above.

>          sqxtun_sz       v0,  v0.8h,  v1.8h, \sz     // out p1
>          sqxtun_sz       v2,  v2.8h,  v3.8h, \sz     // out q1
> +.if \wd >= 8
> +.ifc \sz, .16b
> +        adds            x5,  x5,  x6
> +.endif
> +.endif
>          bit             v22\sz, v0\sz,  v5\sz       // if (!hev && fm && !flat8in)
>          bit             v25\sz, v2\sz,  v5\sz
>  
>          // If no pixels need flat8in, jump to flat8out
>          // (or to a writeout of the inner 4 pixels, for wd=8)
>  .if \wd >= 8
> -        mov             x5,  v6.d[0]
>  .ifc \sz, .16b
> -        mov             x6,  v6.d[1]
> -        adds            x5,  x5,  x6
>          b.eq            6f
>  .else
>          cbz             x5,  6f

otherwise ok

Janne
Martin Storsjö Feb. 11, 2017, 8:38 p.m. | #2
On Fri, 10 Feb 2017, Janne Grunau wrote:

> On 2017-01-15 22:55:51 +0200, Martin Storsjö wrote:
>> ---
>>  libavcodec/aarch64/vp9lpf_neon.S | 16 +++++++++++++---
>>  1 file changed, 13 insertions(+), 3 deletions(-)
>> 
>> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
>> index 4553173..3894307 100644
>> --- a/libavcodec/aarch64/vp9lpf_neon.S
>> +++ b/libavcodec/aarch64/vp9lpf_neon.S
>> @@ -316,20 +316,30 @@
>>
>>          uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
>>          uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
>> +.if \wd >= 8
>> +        mov             x5,  v6.d[0]
>> +.endif
>>          saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
>>          ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
>> +.if \wd >= 8
>> +.ifc \sz, .16b
>> +        mov             x6,  v6.d[1]
>> +.endif
>> +.endif
>
> Is it helpful to have this mov here? It would look a little less ugly if
> you merged this .if with the one above.

It doesn't seem to slow things down to move them next to each other, so I
will push it in that form.

// Martin

Patch

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 4553173..3894307 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -316,20 +316,30 @@ 
 
         uxtl_sz         v0.8h,  v1.8h,  v22, \sz    // p1
         uxtl_sz         v2.8h,  v3.8h,  v25, \sz    // q1
+.if \wd >= 8
+        mov             x5,  v6.d[0]
+.endif
         saddw_sz        v0.8h,  v1.8h,  v0.8h,  v1.8h,  \tmp3, \sz // p1 + f
         ssubw_sz        v2.8h,  v3.8h,  v2.8h,  v3.8h,  \tmp3, \sz // q1 - f
+.if \wd >= 8
+.ifc \sz, .16b
+        mov             x6,  v6.d[1]
+.endif
+.endif
         sqxtun_sz       v0,  v0.8h,  v1.8h, \sz     // out p1
         sqxtun_sz       v2,  v2.8h,  v3.8h, \sz     // out q1
+.if \wd >= 8
+.ifc \sz, .16b
+        adds            x5,  x5,  x6
+.endif
+.endif
         bit             v22\sz, v0\sz,  v5\sz       // if (!hev && fm && !flat8in)
         bit             v25\sz, v2\sz,  v5\sz
 
         // If no pixels need flat8in, jump to flat8out
         // (or to a writeout of the inner 4 pixels, for wd=8)
 .if \wd >= 8
-        mov             x5,  v6.d[0]
 .ifc \sz, .16b
-        mov             x6,  v6.d[1]
-        adds            x5,  x5,  x6
         b.eq            6f
 .else
         cbz             x5,  6f