[2/6] arm/aarch64: vp9lpf: Keep the comparison to E within 8 bit

Message ID 1484513752-7472-2-git-send-email-martin@martin.st
State Committed
Commit c582cb8537367721bb399a5d01b652c20142b756

Commit Message

Martin Storsjö Jan. 15, 2017, 8:55 p.m.
The theoretical maximum value of E is 193, so we can just
saturate the addition to 255.

Before:                     Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
After:
vp9_loop_filter_v_4_8_neon:     136.0   125.7   112.6    84.0         83.0
vp9_loop_filter_v_8_8_neon:     234.0   195.5   171.5   136.0        133.7
vp9_loop_filter_v_16_8_neon:    490.0   417.5   377.7   289.0        271.0
vp9_loop_filter_v_16_16_neon:   951.2   814.7   732.3   571.0        446.7
---
 libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++++-------------------------------
 libavcodec/arm/vp9lpf_neon.S     | 11 +++++------
 2 files changed, 14 insertions(+), 37 deletions(-)
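
For reference, a scalar sketch of the argument (an illustration with
made-up names, not FFmpeg code): the exact sum abs(p0 - q0) * 2 +
(abs(p1 - q1) >> 1) needs up to 10 bits, but with E <= 193 any value that
the 8-bit saturating adds clamp to 255 is already above E, so the <= E
comparison in the filter mask comes out the same either way:

    #include <stdint.h>
    #include <stdlib.h>

    /* What uqadd/vqadd.u8 compute per byte lane. */
    static uint8_t sat_add_u8(uint8_t a, uint8_t b)
    {
        unsigned sum = a + b;
        return sum > 255 ? 255 : (uint8_t) sum;
    }

    /* The E part of the fm condition, done with saturating 8 bit adds:
     * if the true sum overflows 8 bits, it exceeds E whether or not it
     * is clamped to 255, since E <= 193. */
    static int edge_within_E(uint8_t p1, uint8_t p0,
                             uint8_t q0, uint8_t q1, uint8_t E)
    {
        uint8_t d = (uint8_t) abs(p0 - q0);                /* uabd  */
        uint8_t s = sat_add_u8(d, d);                      /* * 2   */
        s = sat_add_u8(s, (uint8_t) (abs(p1 - q1) >> 1));  /* + ... */
        return s <= E;                                     /* cmhs  */
    }

The same bound is what makes the vqadd.u8 form in the arm version safe.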

Comments

Janne Grunau Feb. 10, 2017, 8:37 p.m. | #1
On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote:
> The theoretical maximum value of E is 193, so we can just
> saturate the addition to 255.
> 
> Before:                     Cortex A7      A8      A9     A53  A53/AArch64
> vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
> vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
> After:
> vp9_loop_filter_v_4_8_neon:     136.0   125.7   112.6    84.0         83.0
> vp9_loop_filter_v_8_8_neon:     234.0   195.5   171.5   136.0        133.7
> vp9_loop_filter_v_16_8_neon:    490.0   417.5   377.7   289.0        271.0
> vp9_loop_filter_v_16_16_neon:   951.2   814.7   732.3   571.0        446.7
> ---
>  libavcodec/aarch64/vp9lpf_neon.S | 40 +++++++++-------------------------------
>  libavcodec/arm/vp9lpf_neon.S     | 11 +++++------
>  2 files changed, 14 insertions(+), 37 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
> index 3b8e6eb..4553173 100644
> --- a/libavcodec/aarch64/vp9lpf_neon.S
> +++ b/libavcodec/aarch64/vp9lpf_neon.S
> @@ -51,13 +51,6 @@
>  // see the arm version instead.
>  
>  
> -.macro uabdl_sz dst1, dst2, in1, in2, sz
> -        uabdl           \dst1,  \in1\().8b,  \in2\().8b
> -.ifc \sz, .16b
> -        uabdl2          \dst2,  \in1\().16b, \in2\().16b
> -.endif
> -.endm
> -
>  .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
>          add             \dst1,  \in1,  \in3
>  .ifc \sz, .16b
> @@ -86,20 +79,6 @@
>  .endif
>  .endm
>  
> -.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
> -        cmhs            \dst1,  \in1,  \in3
> -.ifc \sz, .16b
> -        cmhs            \dst2,  \in2,  \in4
> -.endif
> -.endm
> -
> -.macro xtn_sz dst, in1, in2, sz
> -        xtn             \dst\().8b,  \in1
> -.ifc \sz, .16b
> -        xtn2            \dst\().16b, \in2
> -.endif
> -.endm
> -
>  .macro usubl_sz dst1, dst2, in1, in2, sz
>          usubl           \dst1,  \in1\().8b,  \in2\().8b
>  .ifc \sz, .16b
> @@ -179,20 +158,20 @@
>  // tmpq2 == tmp3 + tmp4, etc.
>  .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
>  .if \mix == 0
> -        dup             v0.8h,  w2        // E
> -        dup             v1.8h,  w2        // E
> +        dup             v0\sz,  w2        // E
>          dup             v2\sz,  w3        // I
>          dup             v3\sz,  w4        // H
>  .else
> -        dup             v0.8h,  w2        // E
> +        dup             v0.8b,  w2        // E
>          dup             v2.8b,  w3        // I
>          dup             v3.8b,  w4        // H
> +        lsr             w5,     w2,  #8
>          lsr             w6,     w3,  #8
>          lsr             w7,     w4,  #8
> -        ushr            v1.8h,  v0.8h, #8 // E
> +        dup             v1.8b,  w5        // E
>          dup             v4.8b,  w6        // I
> -        bic             v0.8h,  #255, lsl 8 // E
>          dup             v5.8b,  w7        // H
> +        trn1            v0.2d,  v0.2d,  v1.2d

isn't this equivalent to

dup  v0.8h, w2
uzp1 v0.16b, v0.16b, v0.16b

on little endian?

>          trn1            v2.2d,  v2.2d,  v4.2d
>          trn1            v3.2d,  v3.2d,  v5.2d
>  .endif
> @@ -206,16 +185,15 @@
>          umax            v4\sz,  v4\sz,  v5\sz
>          umax            v5\sz,  v6\sz,  v7\sz
>          umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
> -        uabdl_sz        v6.8h,  v7.8h,  v23, v24, \sz // abs(p0 - q0)
> +        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
>          umax            v4\sz,  v4\sz,  v5\sz
> -        add_sz          v6.8h,  v7.8h,  v6.8h,  v7.8h,  v6.8h,  v7.8h, \sz // abs(p0 - q0) * 2
> +        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
>          uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
>          umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
>          ushr            v5\sz,  v5\sz,  #1
>          cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
> -        uaddw_sz        v6.8h,  v7.8h,  v6.8h,  v7.8h,  v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> -        cmhs_sz         v6.8h,  v7.8h,  v0.8h,  v1.8h,  v6.8h,  v7.8h, \sz
> -        xtn_sz          v5,     v6.8h,  v7.8h,  \sz
> +        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> +        cmhs            v5\sz,  v0\sz,  v6\sz
>          and             v4\sz,  v4\sz,  v5\sz         // fm
>  
>          // If no pixels need filtering, just exit as soon as possible
> diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
> index c57c0e9..5e154f6 100644
> --- a/libavcodec/arm/vp9lpf_neon.S
> +++ b/libavcodec/arm/vp9lpf_neon.S
> @@ -51,7 +51,7 @@
>  @ and d28-d31 as temp registers, or d8-d15.
>  @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
>  .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
> -        vdup.u16        q0,  r2 @ E
> +        vdup.u8         d0,  r2 @ E
>          vdup.u8         d2,  r3 @ I
>          ldr             r3,  [sp]
>  
> @@ -64,16 +64,15 @@
>          vmax.u8         d4,  d4,  d5
>          vmax.u8         d5,  d6,  d7
>          vmax.u8         \tmp1,  \tmp1,  \tmp2
> -        vabdl.u8        q3,  d23, d24    @ abs(p0 - q0)
> +        vabd.u8         d6,  d23, d24    @ abs(p0 - q0)
>          vmax.u8         d4,  d4,  d5
> -        vadd.u16        q3,  q3,  q3     @ abs(p0 - q0) * 2
> +        vqadd.u8        d6,  d6,  d6     @ abs(p0 - q0) * 2
>          vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
>          vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
>          vshr.u8         d5,  d5,  #1
>          vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
> -        vaddw.u8        q3,  q3,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> -        vcle.u16        q3,  q3,  q0
> -        vmovn.u16       d5,  q3
> +        vqadd.u8        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
> +        vcle.u8         d5,  d6,  d0
>          vand            d4,  d4,  d5     @ fm
>  
>          vdup.u8         d3,  r3          @ H

otherwise ok

Janne
Martin Storsjö Feb. 11, 2017, 8:19 p.m. | #2
On Fri, 10 Feb 2017, Janne Grunau wrote:

> On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote:
>> [...]
>> +        trn1            v0.2d,  v0.2d,  v1.2d
>
> isn't this equivalent to
>
> dup  v0.8h, w2
> uzp1 v0.16b, v0.16b, v0.16b
>
> on little endian?

Nice idea, but it isn't quite as straightforward on aarch64 - on arm it 
would have been.

uzp1 only outputs the even-numbered bytes, so you need uzp2 as well for 
the odd ones.

So instead of this as we have now:

     dup  v0.8b, w2
     lsr  w5, w2, #8
     dup  v1.8b, w5
     trn1 v0.2d, v0.2d, v1.2d

We could do:

     dup  v0.8h, w2
     uzp2 v1.16b, v0.16b, v0.16b
     uzp1 v0.16b, v0.16b, v0.16b
     trn1 v0.2d, v0.2d, v1.2d
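
(On little endian the dup leaves v0.16b = E0,E1,E0,E1,...; uzp2 picks the
odd bytes, E1 x16, uzp1 the even bytes, E0 x16, and trn1 then forms
E0 x8 | E1 x8.)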

And I'm not sure if that's any more straightforward.

In arm mode, one could have done this though:

     vdup.s16 q0, r2
     vuzp.8  d0, d1
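
(vuzp.8 deinterleaves the register pair in place: d0 gets the even bytes,
E0 x8, and d1 the odd bytes, E1 x8, so q0 ends up as E0 x8 | E1 x8
directly.)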

(We don't have a corresponding routine on arm though since we don't have 
enough register space.)

// Martin
Janne Grunau Feb. 23, 2017, 8:26 p.m. | #3
On 2017-02-11 22:19:02 +0200, Martin Storsjö wrote:
> On Fri, 10 Feb 2017, Janne Grunau wrote:
> 
> >On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote:
> >>[...]
> >>+        trn1            v0.2d,  v0.2d,  v1.2d
> >
> >isn't this equivalent to
> >
> >dup  v0.8h, w2
> >uzp1 v0.16b, v0.16b, v0.16b
> >
> >on little endian?
> 
> Nice idea, but it isn't quite as straightforward on aarch64 - on arm it
> would have been.

gah, yes.

> uzp1 only outputs the even-numbered bytes, so you need uzp2 as well for 
> the odd ones.
> 
> So instead of this as we have now:
> 
>     dup  v0.8b, w2
>     lsr  w5, w2, #8
>     dup  v1.8b, w5
>     trn1 v0.2d, v0.2d, v1.2d
> 
> We could do:
> 
>     dup  v0.8h, w2
>     uzp2 v1.16b, v0.16b, v0.16b
>     uzp1 v0.16b, v0.16b, v0.16b
>     trn1 v0.2d, v0.2d, v1.2d

rev16 v1.16b, v0.16b // or ext ..x or any other instruction
uzp1  v0.16b, v0.16b, v1.16b

is one instruction less, but also not straightforward
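
(With v0.16b = E0,E1,E0,E1,... from the dup, rev16 gives v1.16b =
E1,E0,E1,E0,..., and uzp1 then picks E0 x8 from v0 followed by E1 x8 from
v1, so no trn1 is needed.)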

ok as is

Janne
Martin Storsjö Feb. 23, 2017, 9:37 p.m. | #4
On Thu, 23 Feb 2017, Janne Grunau wrote:

> On 2017-02-11 22:19:02 +0200, Martin Storsjö wrote:
>> On Fri, 10 Feb 2017, Janne Grunau wrote:
>> 
>> >On 2017-01-15 22:55:48 +0200, Martin Storsjö wrote:
>> >>[...]
>> >>+        trn1            v0.2d,  v0.2d,  v1.2d
>> >
>> >isn't this equivalent to
>> >
>> >dup  v0.8h, w2
>> >uzp1 v0.16b, v0.16b, v0.16b
>> >
>> >on little endian?
>> 
>> Nice idea, but it isn't quite as straightforward on aarch64 - on arm it
>> would have been.
>
> gah, yes.
>
>> uzp1 only outputs the even-numbered bytes, so you need uzp2 as well for 
>> the odd ones.
>> 
>> So instead of this as we have now:
>>
>>     dup  v0.8b, w2
>>     lsr  w5, w2, #8
>>     dup  v1.8b, w5
>>     trn1 v0.2d, v0.2d, v1.2d
>> 
>> We could do:
>>
>>     dup  v0.8h, w2
>>     uzp2 v1.16b, v0.16b, v0.16b
>>     uzp1 v0.16b, v0.16b, v0.16b
>>     trn1 v0.2d, v0.2d, v1.2d
>
> rev16 v1.16b, v0.16b // or ext ..x or any other instruction
> uzp1  v0.16b, v0.16b, v1.16b
>
> is one instruction less, but also not straightforward

Neat, thanks! This turns out to be one cycle faster in total, and three 
instructions fewer. I'll push that as a separate patch since it changes the 
existing instructions quite a bit as well, not just the registers touched by 
this patch.

// Martin

Patch

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index 3b8e6eb..4553173 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -51,13 +51,6 @@ 
 // see the arm version instead.
 
 
-.macro uabdl_sz dst1, dst2, in1, in2, sz
-        uabdl           \dst1,  \in1\().8b,  \in2\().8b
-.ifc \sz, .16b
-        uabdl2          \dst2,  \in1\().16b, \in2\().16b
-.endif
-.endm
-
 .macro add_sz dst1, dst2, in1, in2, in3, in4, sz
         add             \dst1,  \in1,  \in3
 .ifc \sz, .16b
@@ -86,20 +79,6 @@ 
 .endif
 .endm
 
-.macro cmhs_sz dst1, dst2, in1, in2, in3, in4, sz
-        cmhs            \dst1,  \in1,  \in3
-.ifc \sz, .16b
-        cmhs            \dst2,  \in2,  \in4
-.endif
-.endm
-
-.macro xtn_sz dst, in1, in2, sz
-        xtn             \dst\().8b,  \in1
-.ifc \sz, .16b
-        xtn2            \dst\().16b, \in2
-.endif
-.endm
-
 .macro usubl_sz dst1, dst2, in1, in2, sz
         usubl           \dst1,  \in1\().8b,  \in2\().8b
 .ifc \sz, .16b
@@ -179,20 +158,20 @@ 
 // tmpq2 == tmp3 + tmp4, etc.
 .macro loop_filter wd, sz, mix, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8
 .if \mix == 0
-        dup             v0.8h,  w2        // E
-        dup             v1.8h,  w2        // E
+        dup             v0\sz,  w2        // E
         dup             v2\sz,  w3        // I
         dup             v3\sz,  w4        // H
 .else
-        dup             v0.8h,  w2        // E
+        dup             v0.8b,  w2        // E
         dup             v2.8b,  w3        // I
         dup             v3.8b,  w4        // H
+        lsr             w5,     w2,  #8
         lsr             w6,     w3,  #8
         lsr             w7,     w4,  #8
-        ushr            v1.8h,  v0.8h, #8 // E
+        dup             v1.8b,  w5        // E
         dup             v4.8b,  w6        // I
-        bic             v0.8h,  #255, lsl 8 // E
         dup             v5.8b,  w7        // H
+        trn1            v0.2d,  v0.2d,  v1.2d
         trn1            v2.2d,  v2.2d,  v4.2d
         trn1            v3.2d,  v3.2d,  v5.2d
 .endif
@@ -206,16 +185,15 @@ 
         umax            v4\sz,  v4\sz,  v5\sz
         umax            v5\sz,  v6\sz,  v7\sz
         umax            \tmp1\sz, \tmp1\sz, \tmp2\sz
-        uabdl_sz        v6.8h,  v7.8h,  v23, v24, \sz // abs(p0 - q0)
+        uabd            v6\sz,  v23\sz, v24\sz        // abs(p0 - q0)
         umax            v4\sz,  v4\sz,  v5\sz
-        add_sz          v6.8h,  v7.8h,  v6.8h,  v7.8h,  v6.8h,  v7.8h, \sz // abs(p0 - q0) * 2
+        uqadd           v6\sz,  v6\sz,  v6\sz         // abs(p0 - q0) * 2
         uabd            v5\sz,  v22\sz, v25\sz        // abs(p1 - q1)
         umax            v4\sz,  v4\sz,  \tmp1\sz      // max(abs(p3 - p2), ..., abs(q2 - q3))
         ushr            v5\sz,  v5\sz,  #1
         cmhs            v4\sz,  v2\sz,  v4\sz         // max(abs()) <= I
-        uaddw_sz        v6.8h,  v7.8h,  v6.8h,  v7.8h,  v5, \sz // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
-        cmhs_sz         v6.8h,  v7.8h,  v0.8h,  v1.8h,  v6.8h,  v7.8h, \sz
-        xtn_sz          v5,     v6.8h,  v7.8h,  \sz
+        uqadd           v6\sz,  v6\sz,  v5\sz         // abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        cmhs            v5\sz,  v0\sz,  v6\sz
         and             v4\sz,  v4\sz,  v5\sz         // fm
 
         // If no pixels need filtering, just exit as soon as possible
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index c57c0e9..5e154f6 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -51,7 +51,7 @@ 
 @ and d28-d31 as temp registers, or d8-d15.
 @ tmp1,tmp2 = tmpq1, tmp3,tmp4 = tmpq2, tmp5,tmp6 = tmpq3, tmp7,tmp8 = tmpq4
 .macro loop_filter wd, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmpq1, tmpq2, tmpq3, tmpq4
-        vdup.u16        q0,  r2 @ E
+        vdup.u8         d0,  r2 @ E
         vdup.u8         d2,  r3 @ I
         ldr             r3,  [sp]
 
@@ -64,16 +64,15 @@ 
         vmax.u8         d4,  d4,  d5
         vmax.u8         d5,  d6,  d7
         vmax.u8         \tmp1,  \tmp1,  \tmp2
-        vabdl.u8        q3,  d23, d24    @ abs(p0 - q0)
+        vabd.u8         d6,  d23, d24    @ abs(p0 - q0)
         vmax.u8         d4,  d4,  d5
-        vadd.u16        q3,  q3,  q3     @ abs(p0 - q0) * 2
+        vqadd.u8        d6,  d6,  d6     @ abs(p0 - q0) * 2
         vabd.u8         d5,  d22, d25    @ abs(p1 - q1)
         vmax.u8         d4,  d4,  \tmp1  @ max(abs(p3 - p2), ..., abs(q2 - q3))
         vshr.u8         d5,  d5,  #1
         vcle.u8         d4,  d4,  d2     @ max(abs()) <= I
-        vaddw.u8        q3,  q3,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
-        vcle.u16        q3,  q3,  q0
-        vmovn.u16       d5,  q3
+        vqadd.u8        d6,  d6,  d5     @ abs(p0 - q0) * 2 + abs(p1 - q1) >> 1
+        vcle.u8         d5,  d6,  d0
         vand            d4,  d4,  d5     @ fm
 
         vdup.u8         d3,  r3          @ H