[PATCHv2] aarch64: vp9: Add NEON optimizations of VP9 MC functions

Message ID 1478091521-23200-1-git-send-email-martin@martin.st
State Superseded

Commit Message

Martin Storsjö Nov. 2, 2016, 12:58 p.m.
This work is sponsored by, and copyright, Google.

These are ported from the ARM version; it is essentially a 1:1
port with no extra added features, but with some hand tuning
(especially for the plain copy/avg functions). The ARM version
isn't very register starved to begin with, so there's not much
to be gained from having more spare registers here - we only
avoid having to clobber callee-saved registers.

Examples of runtimes vs the 32 bit version, on a Cortex A53:
                                     ARM   AArch64
vp9_avg4_neon:                      32.2      23.7
vp9_avg8_neon:                      57.5      53.7
vp9_avg16_neon:                    168.6     165.4
vp9_avg32_neon:                    586.7     585.2
vp9_avg64_neon:                   2458.6    2325.9
vp9_avg_8tap_smooth_4h_neon:       130.7     124.0
vp9_avg_8tap_smooth_4hv_neon:      478.8     440.3
vp9_avg_8tap_smooth_4v_neon:       118.0      96.2
vp9_avg_8tap_smooth_8h_neon:       239.7     232.0
vp9_avg_8tap_smooth_8hv_neon:      691.3     649.9
vp9_avg_8tap_smooth_8v_neon:       238.0     214.5
vp9_avg_8tap_smooth_64h_neon:    11512.9   11492.8
vp9_avg_8tap_smooth_64hv_neon:   23322.1   23255.1
vp9_avg_8tap_smooth_64v_neon:    11556.2   11554.5
vp9_put4_neon:                      18.0      16.5
vp9_put8_neon:                      40.2      37.7
vp9_put16_neon:                     99.4      95.2
vp9_put32_neon:                    348.8     307.4
vp9_put64_neon:                   1321.3    1109.8
vp9_put_8tap_smooth_4h_neon:       124.7     117.3
vp9_put_8tap_smooth_4hv_neon:      465.8     425.3
vp9_put_8tap_smooth_4v_neon:       105.0      82.5
vp9_put_8tap_smooth_8h_neon:       227.7     218.2
vp9_put_8tap_smooth_8hv_neon:      661.4     620.1
vp9_put_8tap_smooth_8v_neon:       208.0     187.2
vp9_put_8tap_smooth_64h_neon:    10864.6   10873.9
vp9_put_8tap_smooth_64hv_neon:   21359.4   21295.7
vp9_put_8tap_smooth_64v_neon:     9629.1    9639.4

These are generally about as fast as the corresponding ARM
routines on the same CPU (at least on the A53), in most cases
marginally faster.

The speedup vs C code is pretty much the same as for the 32 bit
case; on the A53 it's around 6-13x for the larger 8tap filters.
The exact speedup varies a little, since the C versions generally
don't end up exactly as slow/fast as on 32 bit.
---
v2: Updated according to the comments on the 32 bit version.
---
 libavcodec/aarch64/Makefile              |   2 +
 libavcodec/aarch64/vp9dsp_init_aarch64.c | 139 ++++++
 libavcodec/aarch64/vp9mc_neon.S          | 733 +++++++++++++++++++++++++++++++
 libavcodec/vp9.h                         |   1 +
 libavcodec/vp9dsp.c                      |   2 +
 5 files changed, 877 insertions(+)
 create mode 100644 libavcodec/aarch64/vp9dsp_init_aarch64.c
 create mode 100644 libavcodec/aarch64/vp9mc_neon.S

Comments

Martin Storsjö Nov. 2, 2016, 1:23 p.m. | #1
On Wed, 2 Nov 2016, Martin Storsjö wrote:

> This work is sponsored by, and copyright, Google.
>
> These are ported from the ARM version; it is essentially a 1:1
> port with no extra added features, but with some hand tuning
> (especially for the plain copy/avg functions). The ARM version
> isn't very register starved to begin with, so there's not much
> to be gained from having more spare registers here - we only
> avoid having to clobber callee-saved registers.
>
> Examples of runtimes vs the 32 bit version, on a Cortex A53:
>                                     ARM   AArch64
> vp9_avg4_neon:                      32.2      23.7
> vp9_avg8_neon:                      57.5      53.7
> vp9_avg16_neon:                    168.6     165.4
> vp9_avg32_neon:                    586.7     585.2
> vp9_avg64_neon:                   2458.6    2325.9
> vp9_avg_8tap_smooth_4h_neon:       130.7     124.0
> vp9_avg_8tap_smooth_4hv_neon:      478.8     440.3
> vp9_avg_8tap_smooth_4v_neon:       118.0      96.2
> vp9_avg_8tap_smooth_8h_neon:       239.7     232.0
> vp9_avg_8tap_smooth_8hv_neon:      691.3     649.9
> vp9_avg_8tap_smooth_8v_neon:       238.0     214.5
> vp9_avg_8tap_smooth_64h_neon:    11512.9   11492.8
> vp9_avg_8tap_smooth_64hv_neon:   23322.1   23255.1
> vp9_avg_8tap_smooth_64v_neon:    11556.2   11554.5
> vp9_put4_neon:                      18.0      16.5
> vp9_put8_neon:                      40.2      37.7
> vp9_put16_neon:                     99.4      95.2
> vp9_put32_neon:                    348.8     307.4
> vp9_put64_neon:                   1321.3    1109.8
> vp9_put_8tap_smooth_4h_neon:       124.7     117.3
> vp9_put_8tap_smooth_4hv_neon:      465.8     425.3
> vp9_put_8tap_smooth_4v_neon:       105.0      82.5
> vp9_put_8tap_smooth_8h_neon:       227.7     218.2
> vp9_put_8tap_smooth_8hv_neon:      661.4     620.1
> vp9_put_8tap_smooth_8v_neon:       208.0     187.2
> vp9_put_8tap_smooth_64h_neon:    10864.6   10873.9
> vp9_put_8tap_smooth_64hv_neon:   21359.4   21295.7
> vp9_put_8tap_smooth_64v_neon:     9629.1    9639.4
>
> These are generally about as fast as the corresponding ARM
> routines on the same CPU (at least on the A53), in most cases
> marginally faster.
>
> The speedup vs C code is pretty much the same as for the 32 bit
> case; on the A53 it's around 6-13x for the larger 8tap filters.
> The exact speedup varies a little, since the C versions generally
> don't end up exactly as slow/fast as on 32 bit.
> ---
> v2: Updated according to the comments on the 32 bit version.
> ---
> libavcodec/aarch64/Makefile              |   2 +
> libavcodec/aarch64/vp9dsp_init_aarch64.c | 139 ++++++
> libavcodec/aarch64/vp9mc_neon.S          | 733 +++++++++++++++++++++++++++++++
> libavcodec/vp9.h                         |   1 +
> libavcodec/vp9dsp.c                      |   2 +
> 5 files changed, 877 insertions(+)
> create mode 100644 libavcodec/aarch64/vp9dsp_init_aarch64.c
> create mode 100644 libavcodec/aarch64/vp9mc_neon.S

> +function ff_vp9_copy64_neon, export=1
> +1:
> +        ldp             x5,  x6,  [x2]
> +        stp             x5,  x6,  [x0]
> +        ldp             x5,  x6,  [x2, #16]
> +        stp             x5,  x6,  [x0, #16]
> +        subs            w4,  w4,  #1
> +        ldp             x5,  x6,  [x2, #32]
> +        stp             x5,  x6,  [x0, #32]
> +        ldp             x5,  x6,  [x2, #48]
> +        stp             x5,  x6,  [x0, #48]
> +        add             x2,  x2,  x3
> +        add             x0,  x0,  x1
> +        b.ne            1b
> +        ret
> +endfunc

I forgot to mention it anywhere, but the copy32 and copy64 functions don't 
actually use any vector registers at all, only plain aarch64 ldp/stp. 
When implemented with neon loads/stores, they ended up significantly 
slower than the C version on my dragonboard.

Currently copy64 runs at around 1100 cycles, while a trivial neon version 
(that loads all 64 bytes at once with a ld1 {v0,v1,v2,v3}) runs at around 
1600 cycles. One could of course play with all the different combinations of 
loading 16, 32 or 64 bytes per ld1 and scheduling them differently (IIRC I 
did try some of those combinations at least), but I never got down to what 
the C version did unless I used ldp/stp.
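
For reference, the trivial neon variant is essentially just the following 
loop (modulo scheduling; not the exact code I benchmarked):

1:
        // loads/stores a full 64 byte row per iteration; ~1600 cycles here
        // vs ~1100 for the ldp/stp version
        ld1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x2], x3
        subs            w4,  w4,  #1
        st1             {v0.16b, v1.16b, v2.16b, v3.16b}, [x0], x1
        b.ne            1b
        ret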

Technically, having a _neon prefix for them is wrong, but anything else 
(omitting these two while hooking up avg32/avg64 separately) adds more 
complication - although I'm open to suggestions on how best to handle it.

// Martin
Diego Biurrun Nov. 2, 2016, 1:57 p.m. | #2
On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> On Wed, 2 Nov 2016, Martin Storsjö wrote:
> 
> Technically, having a _neon prefix for them is wrong, but anything else
> (omitting these two while hooking up avg32/avg64 separately) is more
> complication - although I'm open for suggestions on how to handle it best.

Where exactly is the complication? In the way you assign the function
pointers in the init file?

Diego
Martin Storsjö Nov. 2, 2016, 2 p.m. | #3
On Wed, 2 Nov 2016, Diego Biurrun wrote:

> On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
>> On Wed, 2 Nov 2016, Martin Storsjö wrote:
>> 
>> Technically, having a _neon prefix for them is wrong, but anything else
>> (omitting these two while hooking up avg32/avg64 separately) is more
>> complication - although I'm open for suggestions on how to handle it best.
>
> Where exactly is the complication? In the way you assign the function
> pointers in the init file?

Yes, it'd require splitting up those macros a bit; either for assigning 
the same function pointers, but with a different simd instruction set 
suffix, or for only assigning the avg function pointer for those sizes.

// Martin
Luca Barbato Nov. 2, 2016, 2:11 p.m. | #4
On 02/11/2016 14:23, Martin Storsjö wrote:
> Technically, having a _neon prefix for them is wrong

If we are sure that there won't be anything neon but without those
instructions I wouldn't be much concerned.
Diego Biurrun Nov. 2, 2016, 2:29 p.m. | #5
On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> On Wed, 2 Nov 2016, Diego Biurrun wrote:
> >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> >>Technically, having a _neon prefix for them is wrong, but anything else
> >>(omitting these two while hooking up avg32/avg64 separately) is more
> >>complication - although I'm open for suggestions on how to handle it best.
> >
> >Where exactly is the complication? In the way you assign the function
> >pointers in the init file?
> 
> Yes, it'd require splitting up those macros a bit; either for assigning the
> same function pointers, but with a different simd instruction set suffix, or
> for only assigning the avg function pointer for those sizes.

Try something like

#define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
#define ff_vp9_copy64_neon ff_vp9_copy64_aarch64

before the assignment macros. You don't have to somehow drop some of the
assignments in the macros then. There's precedent for this in some of the
x86 init files.
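
For illustration, assuming an assignment macro in the usual style that
pastes the instruction set suffix onto the function names (the exact macro
and dsp fields in this patch may differ), the defines make the existing
macro resolve to the aarch64 symbols without touching the macro itself:

  #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
  #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64

  /* hypothetical macro, modeled on the existing init files */
  #define init_copy_avg(idx, sz)                                          \
      dsp->mc[idx][FILTER_8TAP_SMOOTH][0][0][0] = ff_vp9_copy##sz##_neon; \
      dsp->mc[idx][FILTER_8TAP_SMOOTH][1][0][0] = ff_vp9_avg##sz##_neon

  init_copy_avg(0, 64); /* the copy slot now points at ff_vp9_copy64_aarch64 */
  init_copy_avg(1, 32);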

Diego
Janne Grunau Nov. 2, 2016, 6:24 p.m. | #6
On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
> On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> > On Wed, 2 Nov 2016, Diego Biurrun wrote:
> > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> > >>Technically, having a _neon prefix for them is wrong, but anything else
> > >>(omitting these two while hooking up avg32/avg64 separately) is more
> > >>complication - although I'm open for suggestions on how to handle it best.
> > >
> > >Where exactly is the complication? In the way you assign the function
> > >pointers in the init file?
> > 
> > Yes, it'd require splitting up those macros a bit; either for assigning the
> > same function pointers, but with a different simd instruction set suffix, or
> > for only assigning the avg function pointer for those sizes.
> 
> Try something like
> 
> #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
> 
> before the assignment macros. You don't have to somehow drop some of the
> assignments in the macros then. There's precedent for this in some of the
> x86 init files.

That only fixes the function names but not the cpu_flag based assignment 
if I understand it correctly. And it is a little bit silly since neon is 
not really optional on aarch64. At least in the Application profile.

Janne
Diego Biurrun Nov. 3, 2016, 7:38 p.m. | #7
On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
> On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
> > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
> > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> > > >>Technically, having a _neon prefix for them is wrong, but anything else
> > > >>(omitting these two while hooking up avg32/avg64 separately) is more
> > > >>complication - although I'm open for suggestions on how to handle it best.
> > > >
> > > >Where exactly is the complication? In the way you assign the function
> > > >pointers in the init file?
> > > 
> > > Yes, it'd require splitting up those macros a bit; either for assigning the
> > > same function pointers, but with a different simd instruction set suffix, or
> > > for only assigning the avg function pointer for those sizes.
> > 
> > Try something like
> > 
> > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
> > 
> > before the assignment macros. You don't have to somehow drop some of the
> > assignments in the macros then. There's precedent for this in some of the
> > x86 init files.
> 
> That only fixes the function names but not the cpu_flag based assignment 
> if I understand it correctly. And it is a little bit silly since neon is 
> not really optional on aarch64. At least in the Application profile.

I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
should be assigned to the function pointer unconditionally before
checking for neon support.

Diego
Janne Grunau Nov. 3, 2016, 8:42 p.m. | #8
On 2016-11-03 20:38:11 +0100, Diego Biurrun wrote:
> On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
> > On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
> > > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> > > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
> > > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> > > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> > > > >>Technically, having a _neon prefix for them is wrong, but anything else
> > > > >>(omitting these two while hooking up avg32/avg64 separately) is more
> > > > >>complication - although I'm open for suggestions on how to handle it best.
> > > > >
> > > > >Where exactly is the complication? In the way you assign the function
> > > > >pointers in the init file?
> > > > 
> > > > Yes, it'd require splitting up those macros a bit; either for assigning the
> > > > same function pointers, but with a different simd instruction set suffix, or
> > > > for only assigning the avg function pointer for those sizes.
> > > 
> > > Try something like
> > > 
> > > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> > > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
> > > 
> > > before the assignment macros. You don't have to somehow drop some of the
> > > assignments in the macros then. There's precedent for this in some of the
> > > x86 init files.
> > 
> > That only fixes the function names but not the cpu_flag based assignment 
> > if I understand it correctly. And it is a little bit silly since neon is 
> > not really optional on aarch64. At least in the Application profile.
> 
> I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
> should be assigned to the function pointer unconditionally before
> checking for neon support.

unconditionally is not nice since we want to compare them to C in 
checkasm. That's the reason why we have the otherwise pointless 
AV_CPU_FLAG_ARMV8

Janne
Diego Biurrun Nov. 3, 2016, 8:53 p.m. | #9
On Thu, Nov 03, 2016 at 09:42:44PM +0100, Janne Grunau wrote:
> On 2016-11-03 20:38:11 +0100, Diego Biurrun wrote:
> > On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
> > > On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
> > > > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> > > > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
> > > > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> > > > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> > > > > >>Technically, having a _neon prefix for them is wrong, but anything else
> > > > > >>(omitting these two while hooking up avg32/avg64 separately) is more
> > > > > >>complication - although I'm open for suggestions on how to handle it best.
> > > > > >
> > > > > >Where exactly is the complication? In the way you assign the function
> > > > > >pointers in the init file?
> > > > > 
> > > > > Yes, it'd require splitting up those macros a bit; either for assigning the
> > > > > same function pointers, but with a different simd instruction set suffix, or
> > > > > for only assigning the avg function pointer for those sizes.
> > > > 
> > > > Try something like
> > > > 
> > > > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> > > > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
> > > > 
> > > > before the assignment macros. You don't have to somehow drop some of the
> > > > assignments in the macros then. There's precedent for this in some of the
> > > > x86 init files.
> > > 
> > > That only fixes the function names but not the cpu_flag based assignment 
> > > if I understand it correctly. And it is a little bit silly since neon is 
> > > not really optional on aarch64. At least in the Application profile.
> > 
> > I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
> > should be assigned to the function pointer unconditionally before
> > checking for neon support.
> 
> unconditionally is not nice since we want to compare them to C in 
> checkasm. That's the reason why we have the otherwise pointless 
> AV_CPU_FLAG_ARMV8

Unconditional in the aarch64 init function is not unconditional:

libavcodec/foodsp.c:
  if (ARCH_AARCH64)
      init_aarch64(..);

libavcodec/aarch64/foodsp_init.c:
  init_aarch64(..)
  {
      function.pointer = ff_vp9_copy32_aarch64;

      if (have_neon())
          something.else = ff_whatever;

Anyway, you get the idea. Quite possibly I'm overlooking something
and it does not work as I envision...

Diego
Martin Storsjö Nov. 3, 2016, 9:02 p.m. | #10
On Thu, 3 Nov 2016, Diego Biurrun wrote:

> On Thu, Nov 03, 2016 at 09:42:44PM +0100, Janne Grunau wrote:
>> On 2016-11-03 20:38:11 +0100, Diego Biurrun wrote:
>> > On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
>> > > On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
>> > > > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
>> > > > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
>> > > > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
>> > > > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
>> > > > > >>Technically, having a _neon prefix for them is wrong, but anything else
>> > > > > >>(omitting these two while hooking up avg32/avg64 separately) is more
>> > > > > >>complication - although I'm open for suggestions on how to handle it best.
>> > > > > >
>> > > > > >Where exactly is the complication? In the way you assign the function
>> > > > > >pointers in the init file?
>> > > > > 
>> > > > > Yes, it'd require splitting up those macros a bit; either for assigning the
>> > > > > same function pointers, but with a different simd instruction set suffix, or
>> > > > > for only assigning the avg function pointer for those sizes.
>> > > > 
>> > > > Try something like
>> > > > 
>> > > > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
>> > > > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
>> > > > 
>> > > > before the assignment macros. You don't have to somehow drop some of the
>> > > > assignments in the macros then. There's precedent for this in some of the
>> > > > x86 init files.
>> > > 
>> > > That only fixes the function names but not the cpu_flag based assignment 
>> > > if I understand it correctly. And it is a little bit silly since neon is 
>> > > not really optional on aarch64. At least in the Application profile.
>> > 
>> > I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
>> > should be assigned to the function pointer unconditionally before
>> > checking for neon support.
>> 
>> unconditionally is not nice since we want to compare them to C in 
>> checkasm. That's the reason why we have the otherwise pointless 
>> AV_CPU_FLAG_ARMV8
>
> Unconditional in the aarch64 init function is not unconditional:
>
> libavcodec/foodsp.c:
>  if (ARCH_AARCH64)
>      init_aarch64(..);
>
> libavcodec/aarch64/foodsp_init.c:
>  init_aarch64(..)
>  {
>      function.pointer = ff_vp9_copy32_aarch64;
>
>      if (have_neon())
>          something.else = ff_whatever;
>
> Anyway, you get the idea. Quite possibly I'm overlooking something
> and it does not work as I envision...

Yes, because as Janne said, in that case, you can't via cpuflags choose 
not to use this function, and you can't benchmark against the C version in 
checkasm, since the C version always gets overridden by this function, 
regardless of cpuflags.

That's why we've introduced the flag AV_CPU_FLAG_ARMV8, and one should 
wrap such functions within have_armv8(cpuflags) instead of 
have_neon(cpuflags).

// Martin
Diego Biurrun Nov. 3, 2016, 9:14 p.m. | #11
On Thu, Nov 03, 2016 at 11:02:17PM +0200, Martin Storsjö wrote:
> On Thu, 3 Nov 2016, Diego Biurrun wrote:
> >On Thu, Nov 03, 2016 at 09:42:44PM +0100, Janne Grunau wrote:
> >>On 2016-11-03 20:38:11 +0100, Diego Biurrun wrote:
> >>> On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
> >>> > On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
> >>> > > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> >>> > > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
> >>> > > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> >>> > > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> >>> > > > >>Technically, having a _neon prefix for them is wrong, but anything else
> >>> > > > >>(omitting these two while hooking up avg32/avg64 separately) is more
> >>> > > > >>complication - although I'm open for suggestions on how to handle it best.
> >>> > > > >
> >>> > > > >Where exactly is the complication? In the way you assign the function
> >>> > > > >pointers in the init file?
> >>> > > > > > > > Yes, it'd require splitting up those macros a bit;
> >>either for assigning the
> >>> > > > same function pointers, but with a different simd instruction set suffix, or
> >>> > > > for only assigning the avg function pointer for those sizes.
> >>> > > > > > Try something like
> >>> > > > > > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> >>> > > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
> >>> > > > > > before the assignment macros. You don't have to somehow drop
> >>some of the
> >>> > > assignments in the macros then. There's precedent for this in some of the
> >>> > > x86 init files.
> >>> > > > That only fixes the function names but not the cpu_flag based
> >>assignment > > if I understand it correctly. And it is a little bit
> >>silly since neon is > > not really optional on aarch64. At least in the
> >>Application profile.
> >>> > I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
> >>> should be assigned to the function pointer unconditionally before
> >>> checking for neon support.
> >>
> >>unconditionally is not nice since we want to compare them to C in
> >>checkasm. That's the reason why we have the otherwise pointless
> >>AV_CPU_FLAG_ARMV8
> >
> >Unconditional in the aarch64 init function is not unconditional:
> >
> >libavcodec/foodsp.c:
> > if (ARCH_AARCH64)
> >     init_aarch64(..);
> >
> >libavcodec/aarch64/foodsp_init.c:
> > init_aarch64(..)
> > {
> >     function.pointer = ff_vp9_copy32_aarch64;
> >
> >     if (have_neon())
> >         something.else = ff_whatever;
> >
> >Anyway, you get the idea. Quite possibly I'm overlooking something
> >and it does not work as I envision...
> 
> Yes, because as Janne said, in that case, you can't via cpuflags choose not
> to use this function, and you can't benchmark against the C version in
> checkasm, since the C version always gets overridden by this function,
> regardless of cpuflags.
> 
> That's why we've introduced the flag AV_CPU_FLAG_ARMV8, and one should wrap
> such functions within have_armv8(cpuflags) instead of have_neon(cpuflags).

OK, last try then :)

  #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64

  init_aarch64(..)
  {
       if (have_armv8())
           function.pointer = ff_vp9_copy32_aarch64;

       if (have_neon())
           [neon macro trickery
           function.pointer = ff_vp9_copy32_neon;
            neon macro trickery]

Diego
Martin Storsjö Nov. 3, 2016, 9:36 p.m. | #12
On Thu, 3 Nov 2016, Diego Biurrun wrote:

> On Thu, Nov 03, 2016 at 11:02:17PM +0200, Martin Storsjö wrote:
>> On Thu, 3 Nov 2016, Diego Biurrun wrote:
>> >On Thu, Nov 03, 2016 at 09:42:44PM +0100, Janne Grunau wrote:
>> >>On 2016-11-03 20:38:11 +0100, Diego Biurrun wrote:
>> >>> On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
>> >>> > On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
>> >>> > > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
>> >>> > > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
>> >>> > > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
>> >>> > > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
>> >>> > > > >>Technically, having a _neon prefix for them is wrong, but anything else
>> >>> > > > >>(omitting these two while hooking up avg32/avg64 separately) is more
>> >>> > > > >>complication - although I'm open for suggestions on how to handle it best.
>> >>> > > > >
>> >>> > > > >Where exactly is the complication? In the way you assign the function
>> >>> > > > >pointers in the init file?
>> >>> > > > > > > > Yes, it'd require splitting up those macros a bit;
>> >>either for assigning the
>> >>> > > > same function pointers, but with a different simd instruction set suffix, or
>> >>> > > > for only assigning the avg function pointer for those sizes.
>> >>> > > > > > Try something like
>> >>> > > > > > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
>> >>> > > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
>> >>> > > > > > before the assignment macros. You don't have to somehow drop
>> >>some of the
>> >>> > > assignments in the macros then. There's precedent for this in some of the
>> >>> > > x86 init files.
>> >>> > > > That only fixes the function names but not the cpu_flag based
>> >>assignment > > if I understand it correctly. And it is a little bit
>> >>silly since neon is > > not really optional on aarch64. At least in the
>> >>Application profile.
>> >>> > I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
>> >>> should be assigned to the function pointer unconditionally before
>> >>> checking for neon support.
>> >>
>> >>unconditionally is not nice since we want to compare them to C in
>> >>checkasm. That's the reason why we have the otherwise pointless
>> >>AV_CPU_FLAG_ARMV8
>> >
>> >Unconditional in the aarch64 init function is not unconditional:
>> >
>> >libavcodec/foodsp.c:
>> > if (ARCH_AARCH64)
>> >     init_aarch64(..);
>> >
>> >libavcodec/aarch64/foodsp_init.c:
>> > init_aarch64(..)
>> > {
>> >     function.pointer = ff_vp9_copy32_aarch64;
>> >
>> >     if (have_neon())
>> >         something.else = ff_whatever;
>> >
>> >Anyway, you get the idea. Quite possibly I'm overlooking something
>> >and it does not work as I envision...
>> 
>> Yes, because as Janne said, in that case, you can't via cpuflags choose not
>> to use this function, and you can't benchmark against the C version in
>> checkasm, since the C version always gets overridden by this function,
>> regardless of cpuflags.
>> 
>> That's why we've introduced the flag AV_CPU_FLAG_ARMV8, and one should wrap
>> such functions within have_armv8(cpuflags) instead of have_neon(cpuflags).
>
> OK, last try then :)
>
>  #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
>
>  init_aarch64(..)
>  {
>       if (have_armv8())
>           function.pointer = ff_vp9_copy32_aarch64;
>
>       if (have_neon())
>           [neon macro trickery
>           function.pointer = ff_vp9_copy32_neon;
>            neon macro trickery]

Sure, something like that would work. Except if we go this way, I wouldn't 
assign it at all within the neon section, since it already is hooked up 
within have_armv8.

Either of those requires splitting up the macros for setting the function 
pointers (since it's not just one pointer but a bunch of them). It's not hard, 
but it makes the already pretty unwieldy code even more unwieldy.

In any case, yes, it's doable. But I'm waiting for Janne's opinion. :P

// Martin
Janne Grunau Nov. 3, 2016, 10:29 p.m. | #13
On 2016-11-02 14:58:41 +0200, Martin Storsjö wrote:
> ---
>  libavcodec/aarch64/Makefile              |   2 +
>  libavcodec/aarch64/vp9dsp_init_aarch64.c | 139 ++++++
>  libavcodec/aarch64/vp9mc_neon.S          | 733 +++++++++++++++++++++++++++++++
>  libavcodec/vp9.h                         |   1 +
>  libavcodec/vp9dsp.c                      |   2 +
>  5 files changed, 877 insertions(+)
>  create mode 100644 libavcodec/aarch64/vp9dsp_init_aarch64.c
>  create mode 100644 libavcodec/aarch64/vp9mc_neon.S
> 
> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
> index 764bedc..6f1227a 100644
> --- a/libavcodec/aarch64/Makefile
> +++ b/libavcodec/aarch64/Makefile
> @@ -17,6 +17,7 @@ OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
>  OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
>  OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
>  OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
> +OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_aarch64.o
>  
>  # ARMv8 optimizations
>  
> @@ -43,3 +44,4 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
>  NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
>                                             aarch64/synth_filter_neon.o
>  NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
> +NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9mc_neon.o
> diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
> new file mode 100644
> index 0000000..3d414af
> --- /dev/null
> +++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c

> +    LOCAL_ALIGNED_16(uint8_t, temp, [((sz < 64 ? 2 * sz : 64) + 8) * sz]);        \

((1 + (sz < 64)) * sz + 8) * sz

nit, looks nicer imo. no need to change it

> diff --git a/libavcodec/aarch64/vp9mc_neon.S 
> b/libavcodec/aarch64/vp9mc_neon.S
> new file mode 100644
> index 0000000..9ca2a32
> --- /dev/null
> +++ b/libavcodec/aarch64/vp9mc_neon.S
> @@ -0,0 +1,733 @@
> +/*
> + * Copyright (c) 2016 Google Inc.
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/aarch64/asm.S"
> +
> +const regular_filter, align=4

can't we use the constants from C? the single sign extend will be lost 
in the noise. same applies to the arm version

> +// All public functions in this file have the following signature:
> +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
> +//                            const uint8_t *ref, ptrdiff_t ref_stride,
> +//                            int h, int mx, int my);
> +
> +function ff_vp9_copy64_neon, export=1
> +1:
> +        ldp             x5,  x6,  [x2]
> +        stp             x5,  x6,  [x0]
> +        ldp             x5,  x6,  [x2, #16]

use more registers, we have enough; probably not noticeable
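
e.g. something like this (untested, register choice just illustrative):

        ldp             x5,  x6,  [x2]
        ldp             x7,  x8,  [x2, #16]
        ldp             x9,  x10, [x2, #32]
        ldp             x11, x12, [x2, #48]
        stp             x5,  x6,  [x0]
        stp             x7,  x8,  [x0, #16]
        stp             x9,  x10, [x0, #32]
        stp             x11, x12, [x0, #48]
        // rest of the loop (subs/add/b.ne) as before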

> +        stp             x5,  x6,  [x0, #16]
> +        subs            w4,  w4,  #1
> +        ldp             x5,  x6,  [x2, #32]
> +        stp             x5,  x6,  [x0, #32]
> +        ldp             x5,  x6,  [x2, #48]
> +        stp             x5,  x6,  [x0, #48]
> +        add             x2,  x2,  x3
> +        add             x0,  x0,  x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_vp9_avg64_neon, export=1
> +        mov             x5,  x0
> +1:
> +        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
> +        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
> +        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
> +        urhadd          v0.16b,  v0.16b,  v4.16b
> +        urhadd          v1.16b,  v1.16b,  v5.16b
> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
> +        urhadd          v2.16b,  v2.16b,  v6.16b
> +        urhadd          v3.16b,  v3.16b,  v7.16b
> +        subs            w4,  w4,  #2
> +        urhadd          v16.16b, v16.16b, v20.16b
> +        urhadd          v17.16b, v17.16b, v21.16b
> +        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
> +        urhadd          v18.16b, v18.16b, v22.16b
> +        urhadd          v19.16b, v19.16b, v23.16b
> +        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_vp9_copy32_neon, export=1
> +1:
> +        ldp             x5,  x6,  [x2]
> +        stp             x5,  x6,  [x0]
> +        subs            w4,  w4,  #1
> +        ldp             x5,  x6,  [x2, #16]

see 64

> +        stp             x5,  x6,  [x0, #16]
> +        add             x2,  x2,  x3
> +        add             x0,  x0,  x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_vp9_avg32_neon, export=1
> +1:
> +        ld1             {v2.16b, v3.16b},  [x2], x3
> +        ld1             {v0.16b, v1.16b},  [x0]
> +        urhadd          v0.16b,  v0.16b,  v2.16b
> +        urhadd          v1.16b,  v1.16b,  v3.16b
> +        subs            w4,  w4,  #1
> +        st1             {v0.16b, v1.16b},  [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_vp9_copy16_neon, export=1
> +        add             x5,  x0,  x1
> +        add             x1,  x1,  x1

I think lsl 'x1,  x1,  #1' would be a little clearer

> +        add             x6,  x2,  x3
> +        add             x3,  x3,  x3
> +1:
> +        ld1             {v0.16b},  [x2], x3
> +        ld1             {v1.16b},  [x6], x3
> +        ld1             {v2.16b},  [x2], x3
> +        ld1             {v3.16b},  [x6], x3
> +        subs            w4,  w4,  #4
> +        st1             {v0.16b},  [x0], x1
> +        st1             {v1.16b},  [x5], x1
> +        st1             {v2.16b},  [x0], x1
> +        st1             {v3.16b},  [x5], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_vp9_avg16_neon, export=1
> +1:
> +        ld1             {v2.16b},  [x2], x3
> +        ld1             {v0.16b},  [x0], x1
> +        ld1             {v3.16b},  [x2], x3
> +        urhadd          v0.16b,  v0.16b,  v2.16b
> +        ld1             {v1.16b},  [x0]
> +        sub             x0,  x0,  x1

use another register for st1
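
i.e. roughly (untested):

        mov             x5,  x0          // separate store pointer, avoids the sub
1:
        ld1             {v2.16b},  [x2], x3
        ld1             {v0.16b},  [x0], x1
        ld1             {v3.16b},  [x2], x3
        urhadd          v0.16b,  v0.16b,  v2.16b
        ld1             {v1.16b},  [x0], x1
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #2
        st1             {v0.16b},  [x5], x1
        st1             {v1.16b},  [x5], x1
        b.ne            1b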

> +        urhadd          v1.16b,  v1.16b,  v3.16b
> +        subs            w4,  w4,  #2
> +        st1             {v0.16b},  [x0], x1
> +        st1             {v1.16b},  [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc

...

> +function ff_vp9_copy4_neon, export=1
> +1:
> +        ld1             {v0.s}[0], [x2], x3
> +        ld1             {v1.s}[0], [x2], x3
> +        st1             {v0.s}[0], [x0], x1
> +        ld1             {v2.s}[0], [x2], x3
> +        st1             {v1.s}[0], [x0], x1
> +        ld1             {v3.s}[0], [x2], x3
> +        subs            w4,  w4,  #4
> +        st1             {v2.s}[0], [x0], x1
> +        st1             {v3.s}[0], [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +function ff_vp9_avg4_neon, export=1
> +1:
> +        ld1r            {v4.2s}, [x2], x3

is there a difference between ld1r and ld1 {}[0]? either way we should 
just use one variant for loading 4 bytes unless we specifically need 
one of them. IIRC the cortex-a8 is the only core where a load to all 
lanes is faster than a load to a single lane.
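
For reference, the two forms in question (both valid with register
post-increment):

        ld1r            {v4.2s},   [x2], x3    // load 4 bytes, replicate into both .s lanes
        ld1             {v4.s}[0], [x2], x3    // load 4 bytes into lane 0 only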

> +        ld1r            {v1.2s}, [x0], x1
> +        ld1r            {v6.2s}, [x2], x3
> +        ld1r            {v2.2s}, [x0], x1
> +        ld1r            {v7.2s}, [x2], x3
> +        ld1r            {v3.2s}, [x0], x1
> +        sub             x0,  x0,  x1, lsl #2
> +        subs            w4,  w4,  #4
> +        urhadd          v0.8b,  v0.8b,  v4.8b
> +        urhadd          v1.8b,  v1.8b,  v5.8b
> +        urhadd          v2.8b,  v2.8b,  v6.8b
> +        urhadd          v3.8b,  v3.8b,  v7.8b

in theory a single urhadd instruction is enough, but I doubt it's faster; 
doing 2 with 8 values each might be, though.

> +        st1             {v0.s}[0], [x0], x1
> +        st1             {v1.s}[0], [x0], x1
> +        st1             {v2.s}[0], [x0], x1
> +        st1             {v3.s}[0], [x0], x1
> +        b.ne            1b
> +        ret
> +endfunc
> +
> +
> +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
> +// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
> +// dst1-dst2 and dst3-dst4 for size >= 16)
> +.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
> +        ext             v20.16b, \src1, \src2, #(2*\offset)
> +        ext             v22.16b, \src4, \src5, #(2*\offset)
> +.if \size >= 16
> +        mla             \dst1, v20.8h, v0.h[\offset]
> +        ext             v21.16b, \src2, \src3, #(2*\offset)
> +        mla             \dst3, v22.8h, v0.h[\offset]
> +        ext             v23.16b, \src5, \src6, #(2*\offset)
> +        mla             \dst2, v21.8h, v0.h[\offset]
> +        mla             \dst4, v23.8h, v0.h[\offset]
> +.else
> +        mla             \dst1, v20.8h, v0.h[\offset]
> +        mla             \dst3, v22.8h, v0.h[\offset]
> +.endif
> +.endm
> +// The same as above, but don't accumulate straight into the
> +// destination, but use a temp register and accumulate with saturation.
> +.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
> +        ext             v20.16b, \src1, \src2, #(2*\offset)
> +        ext             v22.16b, \src4, \src5, #(2*\offset)
> +.if \size >= 16
> +        mul             v20.8h, v20.8h, v0.h[\offset]
> +        ext             v21.16b, \src2, \src3, #(2*\offset)
> +        mul             v22.8h, v22.8h, v0.h[\offset]
> +        ext             v23.16b, \src5, \src6, #(2*\offset)
> +        mul             v21.8h, v21.8h, v0.h[\offset]
> +        mul             v23.8h, v23.8h, v0.h[\offset]
> +.else
> +        mul             v20.8h, v20.8h, v0.h[\offset]
> +        mul             v22.8h, v22.8h, v0.h[\offset]
> +.endif
> +        sqadd           \dst1, \dst1, v20.8h
> +        sqadd           \dst3, \dst3, v22.8h
> +.if \size >= 16
> +        sqadd           \dst2, \dst2, v21.8h
> +        sqadd           \dst4, \dst4, v23.8h
> +.endif
> +.endm
> +
> +
> +// Instantiate a horizontal filter function for the given size.
> +// This can work on 4, 8 or 16 pixels in parallel; for larger
> +// widths it will do 16 pixels at a time and loop horizontally.
> +// The actual width is passed in x5, the height in w4 and the
> +// filter coefficients in x9. idx2 is the index of the largest
> +// filter coefficient (3 or 4) and idx1 is the other one of them.
> +.macro do_8tap_h type, size, idx1, idx2
> +function \type\()_8tap_\size\()h_\idx1\idx2
> +        sub             x2,  x2,  #3
> +        add             x6,  x0,  x1
> +        add             x7,  x2,  x3
> +        add             x1,  x1,  x1
> +        add             x3,  x3,  x3
> +        // Only size >= 16 loops horizontally and needs
> +        // reduced dst stride
> +.if \size >= 16
> +        sub             x1,  x1,  x5
> +.endif
> +        // size >= 16 loads two qwords and increments r2,
> +        // for size 4/8 it's enough with one qword and no
> +        // postincrement
> +.if \size >= 16
> +        sub             x3,  x3,  x5
> +        sub             x3,  x3,  #8
> +.endif
> +        // Load the filter vector
> +        ld1             {v0.8h},  [x9]
> +1:
> +.if \size >= 16
> +        mov             x9,  x5
> +.endif
> +        // Load src
> +.if \size >= 16
> +        ld1             {v4.16b},  [x2], #16
> +        ld1             {v16.16b}, [x7], #16
> +        ld1             {v6.8b},   [x2], #8
> +        ld1             {v18.8b},  [x7], #8

ld1 {v4,  v5,  v6}, ...
ld1 {v16, v17, v18}, ...

and adapt the rest

> +.else
> +        ld1             {v4.16b},  [x2]
> +        ld1             {v16.16b}, [x7]
> +.endif
> +        uxtl2           v5.8h,  v4.16b
> +        uxtl            v4.8h,  v4.8b
> +        uxtl2           v17.8h, v16.16b
> +        uxtl            v16.8h, v16.8b
> +.if \size >= 16
> +        uxtl            v6.8h,  v6.8b
> +        uxtl            v18.8h, v18.8b
> +.endif
> +2:
> +
> +        // Accumulate, adding idx2 last with a separate
> +        // saturating add. The positive filter coefficients
> +        // for all indices except idx2 must add up to less
> +        // than 127 for this not to overflow.
> +        mul             v1.8h,  v4.8h,  v0.h[0]
> +        mul             v24.8h, v16.8h, v0.h[0]
> +.if \size >= 16
> +        mul             v2.8h,  v5.8h,  v0.h[0]
> +        mul             v25.8h, v17.8h, v0.h[0]
> +.endif
> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
> +        extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
> +
> +        // Round, shift and saturate
> +        sqrshrun        v1.8b,   v1.8h,  #7
> +        sqrshrun        v24.8b,  v24.8h, #7
> +.if \size >= 16
> +        sqrshrun2       v1.16b,  v2.8h,  #7
> +        sqrshrun2       v24.16b, v25.8h, #7
> +.endif
> +        // Average
> +.ifc \type,avg
> +.if \size >= 16
> +        ld1             {v2.16b}, [x0]
> +        ld1             {v3.16b}, [x6]
> +        urhadd          v1.16b,  v1.16b,  v2.16b
> +        urhadd          v24.16b, v24.16b, v3.16b
> +.elseif \size == 8
> +        ld1             {v2.8b},  [x0]
> +        ld1             {v3.8b},  [x6]
> +        urhadd          v1.8b,  v1.8b,  v2.8b
> +        urhadd          v24.8b, v24.8b, v3.8b
> +.else
> +        ld1r            {v2.2s},  [x0]
> +        ld1r            {v3.2s},  [x6]
> +        urhadd          v1.8b,  v1.8b,  v2.8b
> +        urhadd          v24.8b, v24.8b, v3.8b
> +.endif
> +.endif
> +        // Store and loop horizontally (for size >= 16)
> +.if \size >= 16
> +        st1             {v1.16b},  [x0], #16
> +        st1             {v24.16b}, [x6], #16
> +        mov             v4.16b,  v6.16b
> +        mov             v16.16b, v18.16b
> +        subs            x9,  x9,  #16
> +        beq             3f

you can branch before the 2 movs

> +        ld1             {v6.16b},  [x2], #16
> +        ld1             {v18.16b}, [x7], #16
> +        uxtl            v5.8h,  v6.8b
> +        uxtl2           v6.8h,  v6.16b
> +        uxtl            v17.8h, v18.8b
> +        uxtl2           v18.8h, v18.16b
> +        b               2b
> +.elseif \size == 8
> +        st1             {v1.8b},    [x0]
> +        st1             {v24.8b},   [x6]
> +.else // \size == 4
> +        st1             {v1.s}[0],  [x0]
> +        st1             {v24.s}[0], [x6]
> +.endif
> +3:
> +        // Loop vertically
> +        add             x0,  x0,  x1
> +        add             x6,  x6,  x1
> +        add             x2,  x2,  x3
> +        add             x7,  x7,  x3
> +        subs            w4,  w4,  #2
> +        b.ne            1b
> +        ret
> +endfunc
> +.endm
> +
> +.macro do_8tap_h_size size
> +do_8tap_h put, \size, 3, 4
> +do_8tap_h avg, \size, 3, 4
> +do_8tap_h put, \size, 4, 3
> +do_8tap_h avg, \size, 4, 3
> +.endm
> +
> +do_8tap_h_size 4
> +do_8tap_h_size 8
> +do_8tap_h_size 16
> +
> +.macro do_8tap_h_func type, filter, size
> +function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
> +        movrel          x6,  \filter\()_filter-16
> +        cmp             w5,  #8
> +        add             x9,  x6,  w5, uxtw #4
> +        mov             x5,  #\size
> +.if \size >= 16
> +        bge             \type\()_8tap_16h_34
> +        b               \type\()_8tap_16h_43
> +.else
> +        bge             \type\()_8tap_\size\()h_34
> +        b               \type\()_8tap_\size\()h_43
> +.endif
> +endfunc
> +.endm
> +
> +.macro do_8tap_h_filters size
> +do_8tap_h_func put, regular, \size
> +do_8tap_h_func avg, regular, \size
> +do_8tap_h_func put, sharp,   \size
> +do_8tap_h_func avg, sharp,   \size
> +do_8tap_h_func put, smooth,  \size
> +do_8tap_h_func avg, smooth,  \size
> +.endm
> +
> +do_8tap_h_filters 64
> +do_8tap_h_filters 32
> +do_8tap_h_filters 16
> +do_8tap_h_filters 8
> +do_8tap_h_filters 4
> +
> +
> +// Vertical filters
> +
> +// Round, shift and saturate and store reg1-reg2 over 4 lines
> +.macro do_store4 reg1, reg2, tmp1, tmp2, type
> +        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
> +        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
> +.ifc \type,avg
> +        ld1r            {\tmp1\().2s},     [x0], x1
> +        ld1r            {\tmp2\().2s},     [x0], x1
> +        ld1             {\tmp1\().s}[1],  [x0], x1
> +        ld1             {\tmp2\().s}[1],  [x0], x1

use a copy of x0 for loading in avg

> +        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
> +        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
> +        sub             x0,  x0,  x1, lsl #2
> +.endif
> +        st1             {\reg1\().s}[0],  [x0], x1
> +        st1             {\reg2\().s}[0],  [x0], x1
> +        st1             {\reg1\().s}[1],  [x0], x1
> +        st1             {\reg2\().s}[1],  [x0], x1
> +.endm
> +
> +// Round, shift and saturate and store reg1-4
> +.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
> +        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
> +        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
> +        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
> +        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
> +.ifc \type,avg
> +        ld1             {\tmp1\().8b},  [x0], x1
> +        ld1             {\tmp2\().8b},  [x0], x1
> +        ld1             {\tmp3\().8b},  [x0], x1
> +        ld1             {\tmp4\().8b},  [x0], x1

use a copy of x0

> +        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
> +        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
> +        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
> +        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
> +        sub             x0,  x0,  x1, lsl #2
> +.endif
> +        st1             {\reg1\().8b},  [x0], x1
> +        st1             {\reg2\().8b},  [x0], x1
> +        st1             {\reg3\().8b},  [x0], x1
> +        st1             {\reg4\().8b},  [x0], x1
> +.endm

...

> +// Instantiate a vertical filter function for filtering a 4 pixels wide
> +// slice. The first half of the registers contain one row, while the second
> +// half of a register contains the second-next row (also stored in the first
> +// half of the register two steps ahead). The convolution does two outputs
> +// at a time; the output of v17-v24 into one, and v18-v25 into another one.
> +// The first half of first output is the first output row, the first half
> +// of the other output is the second output row. The second halves of the
> +// registers are rows 3 and 4.
> +// This only is designed to work for 4 or 8 output lines.
> +.macro do_8tap_4v type, idx1, idx2
> +function \type\()_8tap_4v_\idx1\idx2
> +        sub             x2,  x2,  x3, lsl #1
> +        sub             x2,  x2,  x3
> +        ld1             {v0.8h},  [x6]
> +
> +        ld1r            {v1.2s},    [x2], x3
> +        ld1r            {v2.2s},    [x2], x3
> +        ld1r            {v3.2s},    [x2], x3
> +        ld1r            {v4.2s},    [x2], x3
> +        ld1r            {v5.2s},    [x2], x3
> +        ld1r            {v6.2s},    [x2], x3
> +        ext             v1.8b,  v1.8b,  v3.8b,  #4

I think '{zip,trn}1 v1.2s, v1.2s, v3.2s' is the clearer pattern on 
64-bit. it also works with ld1 {v0.s}[0], in case there is no difference 
between single-lane ld1 and ld1r.
 
> +        ld1r            {v7.2s},    [x2], x3
> +        ext             v2.8b,  v2.8b,  v4.8b,  #4
> +        ld1r            {v30.2s},   [x2], x3
> +        uxtl            v17.8h, v1.8b
> +        ext             v3.8b,  v3.8b,  v5.8b,  #4
> +        ld1r            {v31.2s},   [x2], x3
> +        uxtl            v18.8h, v2.8b
> +        ext             v4.8b,  v4.8b,  v6.8b,  #4
> +        uxtl            v19.8h, v3.8b
> +        ext             v5.8b,  v5.8b,  v7.8b,  #4
> +        ld1             {v30.s}[1], [x2], x3

same applies as for the 32-bit version. we should do the load and 
combine pattern for all but the last two loads. small cost for height == 
4, larger gain for height == 8

I can give numbers for cortex-a57 and probably Kryo. I don't think I can 
enable the cycle counter on my cortex-a72 device.

The rest is ok for me

Janne
Janne Grunau Nov. 3, 2016, 10:46 p.m. | #14
On 2016-11-03 23:36:08 +0200, Martin Storsjö wrote:
> On Thu, 3 Nov 2016, Diego Biurrun wrote:
> 
> >On Thu, Nov 03, 2016 at 11:02:17PM +0200, Martin Storsjö wrote:
> >>On Thu, 3 Nov 2016, Diego Biurrun wrote:
> >>>On Thu, Nov 03, 2016 at 09:42:44PM +0100, Janne Grunau wrote:
> >>>>On 2016-11-03 20:38:11 +0100, Diego Biurrun wrote:
> >>>>> On Wed, Nov 02, 2016 at 07:24:03PM +0100, Janne Grunau wrote:
> >>>>> > On 2016-11-02 15:29:34 +0100, Diego Biurrun wrote:
> >>>>> > > On Wed, Nov 02, 2016 at 04:00:38PM +0200, Martin Storsjö wrote:
> >>>>> > > > On Wed, 2 Nov 2016, Diego Biurrun wrote:
> >>>>> > > > >On Wed, Nov 02, 2016 at 03:23:14PM +0200, Martin Storsjö wrote:
> >>>>> > > > >>On Wed, 2 Nov 2016, Martin Storsjö wrote:
> >>>>> > > > >>Technically, having a _neon prefix for them is wrong, but anything else
> >>>>> > > > >>(omitting these two while hooking up avg32/avg64 separately) is more
> >>>>> > > > >>complication - although I'm open for suggestions on how to handle it best.
> >>>>> > > > >
> >>>>> > > > >Where exactly is the complication? In the way you assign the function
> >>>>> > > > >pointers in the init file?
> >>>>> > > > > > > > Yes, it'd require splitting up those macros a bit;
> >>>>either for assigning the
> >>>>> > > > same function pointers, but with a different simd instruction set suffix, or
> >>>>> > > > for only assigning the avg function pointer for those sizes.
> >>>>> > > > > > Try something like
> >>>>> > > > > > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> >>>>> > > #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64
> >>>>> > > > > > before the assignment macros. You don't have to somehow drop
> >>>>some of the
> >>>>> > > assignments in the macros then. There's precedent for this in some of the
> >>>>> > > x86 init files.
> >>>>> > > > That only fixes the function names but not the cpu_flag based
> >>>>assignment > > if I understand it correctly. And it is a little bit
> >>>>silly since neon is > > not really optional on aarch64. At least in the
> >>>>Application profile.
> >>>>> > I forgot to say that ff_vp9_copy32_aarch64 and ff_vp9_copy64_aarch64
> >>>>> should be assigned to the function pointer unconditionally before
> >>>>> checking for neon support.
> >>>>
> >>>>unconditionally is not nice since we want to compare them to C in
> >>>>checkasm. That's the reason why we have the otherwise pointless
> >>>>AV_CPU_FLAG_ARMV8
> >>>
> >>>Unconditional in the aarch64 init function is not unconditional:
> >>>
> >>>libavcodec/foodsp.c:
> >>> if (ARCH_AARCH64)
> >>>     init_aarch64(..);
> >>>
> >>>libavcodec/aarch64/foodsp_init.c:
> >>> init_aarch64(..)
> >>> {
> >>>     function.pointer = ff_vp9_copy32_aarch64;
> >>>
> >>>     if (have_neon())
> >>>         something.else = ff_whatever;
> >>>
> >>>Anyway, you get the idea. Quite possibly I'm overlooking something
> >>>and it does not work as I envision...
> >>
> >>Yes, because as Janne said, in that case, you can't via cpuflags choose not
> >>to use this function, and you can't benchmark against the C version in
> >>checkasm, since the C version always gets overridden by this function,
> >>regardless of cpuflags.
> >>
> >>That's why we've introduced the flag AV_CPU_FLAG_ARMV8, and one should wrap
> >>such functions within have_armv8(cpuflags) instead of have_neon(cpuflags).
> >
> >OK, last try then :)
> >
> > #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
> >
> > init_aarch64(..)
> > {
> >      if (have_armv8())
> >          function.pointer = ff_vp9_copy32_aarch64;
> >
> >      if (have_neon())
> >          [neon macro trickery
> >          function.pointer = ff_vp9_copy32_neon;
> >           neon macro trickery]
> 
> Sure, something like that would work. Except if we go this way, I wouldn't
> assign it at all within the neon section, since it already is hooked up
> within have_armv8.

No, the basic idea was to assign the same functions twice: once under if 
(have_armv8()) with their real name, and once under if (have_neon()) with 
the current macros. The missing _neon functions are provided by '#define 
ff_vp9_copy32_neon ff_vp9_copy32_aarch64'. That's not too messy and the 
functions have the correct debug symbols.

I can live with the current misleading function names but this solution 
is not too messy.
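
I.e. roughly something like this (illustrative sketch; the exact
VP9DSPContext fields and assignment macros in the patch may differ):

  #define ff_vp9_copy32_neon ff_vp9_copy32_aarch64
  #define ff_vp9_copy64_neon ff_vp9_copy64_aarch64

  av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp)
  {
      int cpu_flags = av_get_cpu_flags();

      if (have_armv8(cpu_flags)) {
          /* real symbol names; still selectable via AV_CPU_FLAG_ARMV8 in checkasm */
          dsp->mc[0][FILTER_8TAP_SMOOTH][0][0][0] = ff_vp9_copy64_aarch64;
          dsp->mc[1][FILTER_8TAP_SMOOTH][0][0][0] = ff_vp9_copy32_aarch64;
          /* (same entries for the other filter types) */
      }

      if (have_neon(cpu_flags)) {
          /* existing assignment macros, unchanged; the copy32/copy64 slots they
           * set resolve to the _aarch64 functions through the defines above */
      }
  }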

Janne
Martin Storsjö Nov. 3, 2016, 10:52 p.m. | #15
On Thu, 3 Nov 2016, Janne Grunau wrote:

> On 2016-11-02 14:58:41 +0200, Martin Storsjö wrote:
>> ---
>>  libavcodec/aarch64/Makefile              |   2 +
>>  libavcodec/aarch64/vp9dsp_init_aarch64.c | 139 ++++++
>>  libavcodec/aarch64/vp9mc_neon.S          | 733 +++++++++++++++++++++++++++++++
>>  libavcodec/vp9.h                         |   1 +
>>  libavcodec/vp9dsp.c                      |   2 +
>>  5 files changed, 877 insertions(+)
>>  create mode 100644 libavcodec/aarch64/vp9dsp_init_aarch64.c
>>  create mode 100644 libavcodec/aarch64/vp9mc_neon.S
>>
>> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
>> index 764bedc..6f1227a 100644
>> --- a/libavcodec/aarch64/Makefile
>> +++ b/libavcodec/aarch64/Makefile
>> @@ -17,6 +17,7 @@ OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
>>  OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
>>  OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
>>  OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
>> +OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_aarch64.o
>>
>>  # ARMv8 optimizations
>>
>> @@ -43,3 +44,4 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
>>  NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
>>                                             aarch64/synth_filter_neon.o
>>  NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
>> +NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9mc_neon.o
>> diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
>> new file mode 100644
>> index 0000000..3d414af
>> --- /dev/null
>> +++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
>
>> +    LOCAL_ALIGNED_16(uint8_t, temp, [((sz < 64 ? 2 * sz : 64) + 8) * sz]);        \
>
> ((1 + (sz < 64)) * sz + 8) * sz
>
> nit, looks nicer imo. no need to change it
>
>> diff --git a/libavcodec/aarch64/vp9mc_neon.S
>> b/libavcodec/aarch64/vp9mc_neon.S
>> new file mode 100644
>> index 0000000..9ca2a32
>> --- /dev/null
>> +++ b/libavcodec/aarch64/vp9mc_neon.S
>> @@ -0,0 +1,733 @@
>> +/*
>> + * Copyright (c) 2016 Google Inc.
>> + *
>> + * This file is part of Libav.
>> + *
>> + * Libav is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * Libav is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with Libav; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +const regular_filter, align=4
>
> can't we use the constants from C? the single sign extend will be lost 
> in the noise. same applies to the arm version

Yes, I guess we could. They are currently a static array in vp9dsp.c 
though, so they'd need to be made available from there.
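
Something like this, sketched only (the actual table name, element type and
dimensions in vp9dsp.c would need checking):

    /* vp9dsp.c: drop "static" so the table gets an external ff_ symbol */
    const int16_t ff_vp9_subpel_filters[3][15][8] = {
        /* existing regular/sharp/smooth coefficients */
    };

The asm would then movrel to that symbol instead of carrying its own const
blocks.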

>> +// All public functions in this file have the following signature:
>> +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
>> +//                            const uint8_t *ref, ptrdiff_t ref_stride,
>> +//                            int h, int mx, int my);
>> +
>> +function ff_vp9_copy64_neon, export=1
>> +1:
>> +        ldp             x5,  x6,  [x2]
>> +        stp             x5,  x6,  [x0]
>> +        ldp             x5,  x6,  [x2, #16]
>
> use more registers, we have enough; probably not noticeable

Ok, will try

>> +function ff_vp9_copy4_neon, export=1
>> +1:
>> +        ld1             {v0.s}[0], [x2], x3
>> +        ld1             {v1.s}[0], [x2], x3
>> +        st1             {v0.s}[0], [x0], x1
>> +        ld1             {v2.s}[0], [x2], x3
>> +        st1             {v1.s}[0], [x0], x1
>> +        ld1             {v3.s}[0], [x2], x3
>> +        subs            w4,  w4,  #4
>> +        st1             {v2.s}[0], [x0], x1
>> +        st1             {v3.s}[0], [x0], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_avg4_neon, export=1
>> +1:
>> +        ld1r            {v4.2s}, [x2], x3
>
> is there a difference between ld1r and ld1 {}[0]? either way we should
> just use one variant for loading 4 bytes unless we specifically need
> one of them. Iirc the cortex-a8 is the only core where a load to all
> lanes is faster than a load to a single lane.

Not that I know of here; I just brought the pattern over from arm, and it 
doesn't cause any slowdown. But I can switch it to ld1 {}[0] since that's 
more readable/understandable, if there's no known gain from it on aarch64.

>> +        ld1r            {v1.2s}, [x0], x1
>> +        ld1r            {v6.2s}, [x2], x3
>> +        ld1r            {v2.2s}, [x0], x1
>> +        ld1r            {v7.2s}, [x2], x3
>> +        ld1r            {v3.2s}, [x0], x1
>> +        sub             x0,  x0,  x1, lsl #2
>> +        subs            w4,  w4,  #4
>> +        urhadd          v0.8b,  v0.8b,  v4.8b
>> +        urhadd          v1.8b,  v1.8b,  v5.8b
>> +        urhadd          v2.8b,  v2.8b,  v6.8b
>> +        urhadd          v3.8b,  v3.8b,  v7.8b
>
> in theory a single urhadd instruction is enough, but I doubt it's faster;
> doing 2 with 8 values each might be, though.

Will test

>> +        st1             {v0.s}[0], [x0], x1
>> +        st1             {v1.s}[0], [x0], x1
>> +        st1             {v2.s}[0], [x0], x1
>> +        st1             {v3.s}[0], [x0], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
>> +// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
>> +// dst1-dst2 and dst3-dst4 for size >= 16)
>> +.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
>> +        ext             v20.16b, \src1, \src2, #(2*\offset)
>> +        ext             v22.16b, \src4, \src5, #(2*\offset)
>> +.if \size >= 16
>> +        mla             \dst1, v20.8h, v0.h[\offset]
>> +        ext             v21.16b, \src2, \src3, #(2*\offset)
>> +        mla             \dst3, v22.8h, v0.h[\offset]
>> +        ext             v23.16b, \src5, \src6, #(2*\offset)
>> +        mla             \dst2, v21.8h, v0.h[\offset]
>> +        mla             \dst4, v23.8h, v0.h[\offset]
>> +.else
>> +        mla             \dst1, v20.8h, v0.h[\offset]
>> +        mla             \dst3, v22.8h, v0.h[\offset]
>> +.endif
>> +.endm
>> +// The same as above, but instead of accumulating straight into the
>> +// destination, use a temp register and accumulate with saturation.
>> +.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
>> +        ext             v20.16b, \src1, \src2, #(2*\offset)
>> +        ext             v22.16b, \src4, \src5, #(2*\offset)
>> +.if \size >= 16
>> +        mul             v20.8h, v20.8h, v0.h[\offset]
>> +        ext             v21.16b, \src2, \src3, #(2*\offset)
>> +        mul             v22.8h, v22.8h, v0.h[\offset]
>> +        ext             v23.16b, \src5, \src6, #(2*\offset)
>> +        mul             v21.8h, v21.8h, v0.h[\offset]
>> +        mul             v23.8h, v23.8h, v0.h[\offset]
>> +.else
>> +        mul             v20.8h, v20.8h, v0.h[\offset]
>> +        mul             v22.8h, v22.8h, v0.h[\offset]
>> +.endif
>> +        sqadd           \dst1, \dst1, v20.8h
>> +        sqadd           \dst3, \dst3, v22.8h
>> +.if \size >= 16
>> +        sqadd           \dst2, \dst2, v21.8h
>> +        sqadd           \dst4, \dst4, v23.8h
>> +.endif
>> +.endm
>> +
>> +
>> +// Instantiate a horizontal filter function for the given size.
>> +// This can work on 4, 8 or 16 pixels in parallel; for larger
>> +// widths it will do 16 pixels at a time and loop horizontally.
>> +// The actual width is passed in x5, the height in w4 and the
>> +// filter coefficients in x9. idx2 is the index of the largest
>> +// filter coefficient (3 or 4) and idx1 is the other one of them.
>> +.macro do_8tap_h type, size, idx1, idx2
>> +function \type\()_8tap_\size\()h_\idx1\idx2
>> +        sub             x2,  x2,  #3
>> +        add             x6,  x0,  x1
>> +        add             x7,  x2,  x3
>> +        add             x1,  x1,  x1
>> +        add             x3,  x3,  x3
>> +        // Only size >= 16 loops horizontally and needs
>> +        // reduced dst stride
>> +.if \size >= 16
>> +        sub             x1,  x1,  x5
>> +.endif
>> +        // size >= 16 loads two qwords and increments x2,
>> +        // for size 4/8 one qword without any
>> +        // postincrement is enough
>> +.if \size >= 16
>> +        sub             x3,  x3,  x5
>> +        sub             x3,  x3,  #8
>> +.endif
>> +        // Load the filter vector
>> +        ld1             {v0.8h},  [x9]
>> +1:
>> +.if \size >= 16
>> +        mov             x9,  x5
>> +.endif
>> +        // Load src
>> +.if \size >= 16
>> +        ld1             {v4.16b},  [x2], #16
>> +        ld1             {v16.16b}, [x7], #16
>> +        ld1             {v6.8b},   [x2], #8
>> +        ld1             {v18.8b},  [x7], #8
>
> ld1 {v4,  v5,  v6}, ...
> ld1 {v16, v17, v18}, ...
>
> and adapt the rest

Already done locally, in response to the arm review

>> +.else
>> +        ld1             {v4.16b},  [x2]
>> +        ld1             {v16.16b}, [x7]
>> +.endif
>> +        uxtl2           v5.8h,  v4.16b
>> +        uxtl            v4.8h,  v4.8b
>> +        uxtl2           v17.8h, v16.16b
>> +        uxtl            v16.8h, v16.8b
>> +.if \size >= 16
>> +        uxtl            v6.8h,  v6.8b
>> +        uxtl            v18.8h, v18.8b
>> +.endif
>> +2:
>> +
>> +        // Accumulate, adding idx2 last with a separate
>> +        // saturating add. The positive filter coefficients
>> +        // for all indices except idx2 must add up to less
>> +        // than 127 for this not to overflow.
>> +        mul             v1.8h,  v4.8h,  v0.h[0]
>> +        mul             v24.8h, v16.8h, v0.h[0]
>> +.if \size >= 16
>> +        mul             v2.8h,  v5.8h,  v0.h[0]
>> +        mul             v25.8h, v17.8h, v0.h[0]
>> +.endif
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
>> +        extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
>> +
>> +        // Round, shift and saturate
>> +        sqrshrun        v1.8b,   v1.8h,  #7
>> +        sqrshrun        v24.8b,  v24.8h, #7
>> +.if \size >= 16
>> +        sqrshrun2       v1.16b,  v2.8h,  #7
>> +        sqrshrun2       v24.16b, v25.8h, #7
>> +.endif
>> +        // Average
>> +.ifc \type,avg
>> +.if \size >= 16
>> +        ld1             {v2.16b}, [x0]
>> +        ld1             {v3.16b}, [x6]
>> +        urhadd          v1.16b,  v1.16b,  v2.16b
>> +        urhadd          v24.16b, v24.16b, v3.16b
>> +.elseif \size == 8
>> +        ld1             {v2.8b},  [x0]
>> +        ld1             {v3.8b},  [x6]
>> +        urhadd          v1.8b,  v1.8b,  v2.8b
>> +        urhadd          v24.8b, v24.8b, v3.8b
>> +.else
>> +        ld1r            {v2.2s},  [x0]
>> +        ld1r            {v3.2s},  [x6]
>> +        urhadd          v1.8b,  v1.8b,  v2.8b
>> +        urhadd          v24.8b, v24.8b, v3.8b
>> +.endif
>> +.endif
>> +        // Store and loop horizontally (for size >= 16)
>> +.if \size >= 16
>> +        st1             {v1.16b},  [x0], #16
>> +        st1             {v24.16b}, [x6], #16
>> +        mov             v4.16b,  v6.16b
>> +        mov             v16.16b, v18.16b
>> +        subs            x9,  x9,  #16
>> +        beq             3f
>
> you can branch before the 2 movs

Oh, indeed. Will change (and will check if I can do the same change for 
arm as well)

>> +        ld1             {v6.16b},  [x2], #16
>> +        ld1             {v18.16b}, [x7], #16
>> +        uxtl            v5.8h,  v6.8b
>> +        uxtl2           v6.8h,  v6.16b
>> +        uxtl            v17.8h, v18.8b
>> +        uxtl2           v18.8h, v18.16b
>> +        b               2b
>> +.elseif \size == 8
>> +        st1             {v1.8b},    [x0]
>> +        st1             {v24.8b},   [x6]
>> +.else // \size == 4
>> +        st1             {v1.s}[0],  [x0]
>> +        st1             {v24.s}[0], [x6]
>> +.endif
>> +3:
>> +        // Loop vertically
>> +        add             x0,  x0,  x1
>> +        add             x6,  x6,  x1
>> +        add             x2,  x2,  x3
>> +        add             x7,  x7,  x3
>> +        subs            w4,  w4,  #2
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +.endm
>> +
>> +.macro do_8tap_h_size size
>> +do_8tap_h put, \size, 3, 4
>> +do_8tap_h avg, \size, 3, 4
>> +do_8tap_h put, \size, 4, 3
>> +do_8tap_h avg, \size, 4, 3
>> +.endm
>> +
>> +do_8tap_h_size 4
>> +do_8tap_h_size 8
>> +do_8tap_h_size 16
>> +
>> +.macro do_8tap_h_func type, filter, size
>> +function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
>> +        movrel          x6,  \filter\()_filter-16
>> +        cmp             w5,  #8
>> +        add             x9,  x6,  w5, uxtw #4
>> +        mov             x5,  #\size
>> +.if \size >= 16
>> +        bge             \type\()_8tap_16h_34
>> +        b               \type\()_8tap_16h_43
>> +.else
>> +        bge             \type\()_8tap_\size\()h_34
>> +        b               \type\()_8tap_\size\()h_43
>> +.endif
>> +endfunc
>> +.endm
>> +
>> +.macro do_8tap_h_filters size
>> +do_8tap_h_func put, regular, \size
>> +do_8tap_h_func avg, regular, \size
>> +do_8tap_h_func put, sharp,   \size
>> +do_8tap_h_func avg, sharp,   \size
>> +do_8tap_h_func put, smooth,  \size
>> +do_8tap_h_func avg, smooth,  \size
>> +.endm
>> +
>> +do_8tap_h_filters 64
>> +do_8tap_h_filters 32
>> +do_8tap_h_filters 16
>> +do_8tap_h_filters 8
>> +do_8tap_h_filters 4
>> +
>> +
>> +// Vertical filters
>> +
>> +// Round, shift and saturate and store reg1-reg2 over 4 lines
>> +.macro do_store4 reg1, reg2, tmp1, tmp2, type
>> +        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
>> +        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
>> +.ifc \type,avg
>> +        ld1r            {\tmp1\().2s},     [x0], x1
>> +        ld1r            {\tmp2\().2s},     [x0], x1
>> +        ld1             {\tmp1\().s}[1],  [x0], x1
>> +        ld1             {\tmp2\().s}[1],  [x0], x1
>
> use a copy of x0 for loading in avg
>
>> +        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
>> +        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
>> +        sub             x0,  x0,  x1, lsl #2
>> +.endif
>> +        st1             {\reg1\().s}[0],  [x0], x1
>> +        st1             {\reg2\().s}[0],  [x0], x1
>> +        st1             {\reg1\().s}[1],  [x0], x1
>> +        st1             {\reg2\().s}[1],  [x0], x1
>> +.endm
>> +
>> +// Round, shift and saturate and store reg1-4
>> +.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
>> +        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
>> +        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
>> +        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
>> +        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
>> +.ifc \type,avg
>> +        ld1             {\tmp1\().8b},  [x0], x1
>> +        ld1             {\tmp2\().8b},  [x0], x1
>> +        ld1             {\tmp3\().8b},  [x0], x1
>> +        ld1             {\tmp4\().8b},  [x0], x1
>
> use a copy of x0
>
>> +        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
>> +        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
>> +        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
>> +        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
>> +        sub             x0,  x0,  x1, lsl #2
>> +.endif
>> +        st1             {\reg1\().8b},  [x0], x1
>> +        st1             {\reg2\().8b},  [x0], x1
>> +        st1             {\reg3\().8b},  [x0], x1
>> +        st1             {\reg4\().8b},  [x0], x1
>> +.endm
>
> ...
>
>> +// Instantiate a vertical filter function for filtering a 4 pixels wide
>> +// slice. The first half of the registers contain one row, while the second
>> +// half of a register contains the second-next row (also stored in the first
>> +// half of the register two steps ahead). The convolution does two outputs
>> +// at a time; the output of v17-v24 into one, and v18-v25 into another one.
>> +// The first half of first output is the first output row, the first half
>> +// of the other output is the second output row. The second halves of the
>> +// registers are rows 3 and 4.
>> +// This is only designed to work for 4 or 8 output lines.
>> +.macro do_8tap_4v type, idx1, idx2
>> +function \type\()_8tap_4v_\idx1\idx2
>> +        sub             x2,  x2,  x3, lsl #1
>> +        sub             x2,  x2,  x3
>> +        ld1             {v0.8h},  [x6]
>> +
>> +        ld1r            {v1.2s},    [x2], x3
>> +        ld1r            {v2.2s},    [x2], x3
>> +        ld1r            {v3.2s},    [x2], x3
>> +        ld1r            {v4.2s},    [x2], x3
>> +        ld1r            {v5.2s},    [x2], x3
>> +        ld1r            {v6.2s},    [x2], x3
>> +        ext             v1.8b,  v1.8b,  v3.8b,  #4
>
> I think '{zip,trn}1 v1.2s, v1.2s, v3.2s' is the clearer pattern on
> 64-bit. it works with ld1 {v0.s}[0] in case there is no difference
> between single-lane ld1 and ld1r

Ok, I'll try that

>> +        ld1r            {v7.2s},    [x2], x3
>> +        ext             v2.8b,  v2.8b,  v4.8b,  #4
>> +        ld1r            {v30.2s},   [x2], x3
>> +        uxtl            v17.8h, v1.8b
>> +        ext             v3.8b,  v3.8b,  v5.8b,  #4
>> +        ld1r            {v31.2s},   [x2], x3
>> +        uxtl            v18.8h, v2.8b
>> +        ext             v4.8b,  v4.8b,  v6.8b,  #4
>> +        uxtl            v19.8h, v3.8b
>> +        ext             v5.8b,  v5.8b,  v7.8b,  #4
>> +        ld1             {v30.s}[1], [x2], x3
>
> same applies as for the 32-bit version. we should do the load and
> combine pattern for all but the last two loads. small cost for height ==
> 4, larger gain for height == 8

Yep, already done that way locally in response to the second arm review.

> I can give numbers for cortex-a57 and probably kryo. I don't think I can
> enable the cycle counter on my cortex-a72 device.

Ok, thanks, that'd be nice. Since those have out-of-order execution, not 
being able to fine-tune the instruction scheduling by benchmarking 
each individual test shouldn't matter too much, compared to the 
A53 where it matters a lot.

// Martin
Martin Storsjö Nov. 5, 2016, 12:14 p.m. | #16
On Thu, 3 Nov 2016, Janne Grunau wrote:

> On 2016-11-02 14:58:41 +0200, Martin Storsjö wrote:
>> ---
>>  libavcodec/aarch64/Makefile              |   2 +
>>  libavcodec/aarch64/vp9dsp_init_aarch64.c | 139 ++++++
>>  libavcodec/aarch64/vp9mc_neon.S          | 733 +++++++++++++++++++++++++++++++
>>  libavcodec/vp9.h                         |   1 +
>>  libavcodec/vp9dsp.c                      |   2 +
>>  5 files changed, 877 insertions(+)
>>  create mode 100644 libavcodec/aarch64/vp9dsp_init_aarch64.c
>>  create mode 100644 libavcodec/aarch64/vp9mc_neon.S
>>
>> diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
>> index 764bedc..6f1227a 100644
>> --- a/libavcodec/aarch64/Makefile
>> +++ b/libavcodec/aarch64/Makefile
>> @@ -17,6 +17,7 @@ OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
>>  OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
>>  OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
>>  OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
>> +OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_aarch64.o
>>
>>  # ARMv8 optimizations
>>
>> @@ -43,3 +44,4 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
>>  NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
>>                                             aarch64/synth_filter_neon.o
>>  NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
>> +NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9mc_neon.o
>> diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
>> new file mode 100644
>> index 0000000..3d414af
>> --- /dev/null
>> +++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
>
>> +    LOCAL_ALIGNED_16(uint8_t, temp, [((sz < 64 ? 2 * sz : 64) + 8) * sz]);        \
>
> ((1 + (sz < 64)) * sz + 8) * sz
>
> nit, looks nicer imo. no need to change it

Yeah, that looks nicer - changed.
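
(Both forms allocate the same amount; a quick check, not part of the patch:)

    /* sz <  64:  ((sz < 64 ? 2*sz : 64) + 8) * sz  == (2*sz + 8) * sz
     *            ((1 + (sz < 64)) * sz + 8) * sz   == (2*sz + 8) * sz
     * sz == 64:  both reduce to (64 + 8) * 64      == 72 * 64 bytes
     */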

>> diff --git a/libavcodec/aarch64/vp9mc_neon.S
>> b/libavcodec/aarch64/vp9mc_neon.S
>> new file mode 100644
>> index 0000000..9ca2a32
>> --- /dev/null
>> +++ b/libavcodec/aarch64/vp9mc_neon.S
>> @@ -0,0 +1,733 @@
>> +/*
>> + * Copyright (c) 2016 Google Inc.
>> + *
>> + * This file is part of Libav.
>> + *
>> + * Libav is free software; you can redistribute it and/or
>> + * modify it under the terms of the GNU Lesser General Public
>> + * License as published by the Free Software Foundation; either
>> + * version 2.1 of the License, or (at your option) any later version.
>> + *
>> + * Libav is distributed in the hope that it will be useful,
>> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
>> + * Lesser General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU Lesser General Public
>> + * License along with Libav; if not, write to the Free Software
>> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
>> + */
>> +
>> +#include "libavutil/aarch64/asm.S"
>> +
>> +const regular_filter, align=4
>
> can't we use the constants from C? the single sign extend will be lost
> in the noise. same applies to the arm version
>
>> +// All public functions in this file have the following signature:
>> +// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
>> +//                            const uint8_t *ref, ptrdiff_t ref_stride,
>> +//                            int h, int mx, int my);
>> +
>> +function ff_vp9_copy64_neon, export=1
>> +1:
>> +        ldp             x5,  x6,  [x2]
>> +        stp             x5,  x6,  [x0]
>> +        ldp             x5,  x6,  [x2, #16]
>
> use more registers, we have enough; probably not noticeable

No measurable change on the A53, but changed to use more registers, and 
reordered them to allow for a bit more latency in the loads, in case it 
would help somewhere.
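
Roughly along these lines (a sketch of the idea, not necessarily the exact
updated loop; x5-x12 should all be free to clobber here):

1:
        ldp             x5,  x6,  [x2]
        ldp             x7,  x8,  [x2, #16]
        subs            w4,  w4,  #1
        stp             x5,  x6,  [x0]
        ldp             x9,  x10, [x2, #32]
        stp             x7,  x8,  [x0, #16]
        ldp             x11, x12, [x2, #48]
        stp             x9,  x10, [x0, #32]
        stp             x11, x12, [x0, #48]
        add             x2,  x2,  x3
        add             x0,  x0,  x1
        b.ne            1b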

>> +        stp             x5,  x6,  [x0, #16]
>> +        subs            w4,  w4,  #1
>> +        ldp             x5,  x6,  [x2, #32]
>> +        stp             x5,  x6,  [x0, #32]
>> +        ldp             x5,  x6,  [x2, #48]
>> +        stp             x5,  x6,  [x0, #48]
>> +        add             x2,  x2,  x3
>> +        add             x0,  x0,  x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_avg64_neon, export=1
>> +        mov             x5,  x0
>> +1:
>> +        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
>> +        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
>> +        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
>> +        urhadd          v0.16b,  v0.16b,  v4.16b
>> +        urhadd          v1.16b,  v1.16b,  v5.16b
>> +        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
>> +        urhadd          v2.16b,  v2.16b,  v6.16b
>> +        urhadd          v3.16b,  v3.16b,  v7.16b
>> +        subs            w4,  w4,  #2
>> +        urhadd          v16.16b, v16.16b, v20.16b
>> +        urhadd          v17.16b, v17.16b, v21.16b
>> +        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
>> +        urhadd          v18.16b, v18.16b, v22.16b
>> +        urhadd          v19.16b, v19.16b, v23.16b
>> +        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_copy32_neon, export=1
>> +1:
>> +        ldp             x5,  x6,  [x2]
>> +        stp             x5,  x6,  [x0]
>> +        subs            w4,  w4,  #1
>> +        ldp             x5,  x6,  [x2, #16]
>
> see 64

Updated similarly to the 64 version

>> +        stp             x5,  x6,  [x0, #16]
>> +        add             x2,  x2,  x3
>> +        add             x0,  x0,  x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_avg32_neon, export=1
>> +1:
>> +        ld1             {v2.16b, v3.16b},  [x2], x3
>> +        ld1             {v0.16b, v1.16b},  [x0]
>> +        urhadd          v0.16b,  v0.16b,  v2.16b
>> +        urhadd          v1.16b,  v1.16b,  v3.16b
>> +        subs            w4,  w4,  #1
>> +        st1             {v0.16b, v1.16b},  [x0], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_copy16_neon, export=1
>> +        add             x5,  x0,  x1
>> +        add             x1,  x1,  x1
>
> I think lsl 'x1,  x1,  #1' would be a little clearer

Changed

>> +        add             x6,  x2,  x3
>> +        add             x3,  x3,  x3
>> +1:
>> +        ld1             {v0.16b},  [x2], x3
>> +        ld1             {v1.16b},  [x6], x3
>> +        ld1             {v2.16b},  [x2], x3
>> +        ld1             {v3.16b},  [x6], x3
>> +        subs            w4,  w4,  #4
>> +        st1             {v0.16b},  [x0], x1
>> +        st1             {v1.16b},  [x5], x1
>> +        st1             {v2.16b},  [x0], x1
>> +        st1             {v3.16b},  [x5], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_avg16_neon, export=1
>> +1:
>> +        ld1             {v2.16b},  [x2], x3
>> +        ld1             {v0.16b},  [x0], x1
>> +        ld1             {v3.16b},  [x2], x3
>> +        urhadd          v0.16b,  v0.16b,  v2.16b
>> +        ld1             {v1.16b},  [x0]
>> +        sub             x0,  x0,  x1
>
> use another register for st1

Done, although it didn't actually improve anything in my measurements
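
I.e. something along these lines (a sketch; x5/mx is unused and free):

        mov             x5,  x0
1:
        ld1             {v2.16b},  [x2], x3
        ld1             {v0.16b},  [x0], x1
        ld1             {v3.16b},  [x2], x3
        urhadd          v0.16b,  v0.16b,  v2.16b
        ld1             {v1.16b},  [x0], x1
        urhadd          v1.16b,  v1.16b,  v3.16b
        subs            w4,  w4,  #2
        st1             {v0.16b},  [x5], x1
        st1             {v1.16b},  [x5], x1
        b.ne            1b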

>> +        urhadd          v1.16b,  v1.16b,  v3.16b
>> +        subs            w4,  w4,  #2
>> +        st1             {v0.16b},  [x0], x1
>> +        st1             {v1.16b},  [x0], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>
> ...
>
>> +function ff_vp9_copy4_neon, export=1
>> +1:
>> +        ld1             {v0.s}[0], [x2], x3
>> +        ld1             {v1.s}[0], [x2], x3
>> +        st1             {v0.s}[0], [x0], x1
>> +        ld1             {v2.s}[0], [x2], x3
>> +        st1             {v1.s}[0], [x0], x1
>> +        ld1             {v3.s}[0], [x2], x3
>> +        subs            w4,  w4,  #4
>> +        st1             {v2.s}[0], [x0], x1
>> +        st1             {v3.s}[0], [x0], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +function ff_vp9_avg4_neon, export=1
>> +1:
>> +        ld1r            {v4.2s}, [x2], x3
>
> is there a difference between ld1r and ld1 {}[0]? either way we should
> just use one variant for loading 4 bytes unless we specifically need
> one of them. Iirc the cortex-a8 is the only core where a load to all
> lanes is faster than a load to a single lane.

Changed to ld1 {}[0]
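
I.e.:

        // before:  ld1r  {v4.2s},   [x2], x3    (4 bytes splatted to both lanes)
        // after:   ld1   {v4.s}[0], [x2], x3    (4 bytes into lane 0 only)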

>> +        ld1r            {v1.2s}, [x0], x1
>> +        ld1r            {v6.2s}, [x2], x3
>> +        ld1r            {v2.2s}, [x0], x1
>> +        ld1r            {v7.2s}, [x2], x3
>> +        ld1r            {v3.2s}, [x0], x1
>> +        sub             x0,  x0,  x1, lsl #2
>> +        subs            w4,  w4,  #4
>> +        urhadd          v0.8b,  v0.8b,  v4.8b
>> +        urhadd          v1.8b,  v1.8b,  v5.8b
>> +        urhadd          v2.8b,  v2.8b,  v6.8b
>> +        urhadd          v3.8b,  v3.8b,  v7.8b
>
> in theory a single urhadd instruction is enough, but I doubt it's faster;
> doing 2 with 8 values each might be, though.

Changed to 2 urhadd with 8 values each, and using a different register for 
output; 1 cycle faster
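
I.e. roughly this shape (a sketch, not the actual updated function; two
source rows get packed per register so each urhadd does 8 useful bytes):

1:
        ld1             {v0.s}[0], [x2], x3
        ld1             {v1.s}[0], [x0], x1
        ld1             {v0.s}[1], [x2], x3
        ld1             {v1.s}[1], [x0], x1
        ld1             {v2.s}[0], [x2], x3
        ld1             {v3.s}[0], [x0], x1
        ld1             {v2.s}[1], [x2], x3
        ld1             {v3.s}[1], [x0], x1
        sub             x0,  x0,  x1, lsl #2
        subs            w4,  w4,  #4
        urhadd          v4.8b,  v1.8b,  v0.8b      // rows 0-1 in one urhadd
        urhadd          v5.8b,  v3.8b,  v2.8b      // rows 2-3
        st1             {v4.s}[0], [x0], x1
        st1             {v4.s}[1], [x0], x1
        st1             {v5.s}[0], [x0], x1
        st1             {v5.s}[1], [x0], x1
        b.ne            1b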

>> +        st1             {v0.s}[0], [x0], x1
>> +        st1             {v1.s}[0], [x0], x1
>> +        st1             {v2.s}[0], [x0], x1
>> +        st1             {v3.s}[0], [x0], x1
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +
>> +
>> +// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
>> +// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
>> +// dst1-dst2 and dst3-dst4 for size >= 16)
>> +.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
>> +        ext             v20.16b, \src1, \src2, #(2*\offset)
>> +        ext             v22.16b, \src4, \src5, #(2*\offset)
>> +.if \size >= 16
>> +        mla             \dst1, v20.8h, v0.h[\offset]
>> +        ext             v21.16b, \src2, \src3, #(2*\offset)
>> +        mla             \dst3, v22.8h, v0.h[\offset]
>> +        ext             v23.16b, \src5, \src6, #(2*\offset)
>> +        mla             \dst2, v21.8h, v0.h[\offset]
>> +        mla             \dst4, v23.8h, v0.h[\offset]
>> +.else
>> +        mla             \dst1, v20.8h, v0.h[\offset]
>> +        mla             \dst3, v22.8h, v0.h[\offset]
>> +.endif
>> +.endm
>> +// The same as above, but instead of accumulating straight into the
>> +// destination, use a temp register and accumulate with saturation.
>> +.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
>> +        ext             v20.16b, \src1, \src2, #(2*\offset)
>> +        ext             v22.16b, \src4, \src5, #(2*\offset)
>> +.if \size >= 16
>> +        mul             v20.8h, v20.8h, v0.h[\offset]
>> +        ext             v21.16b, \src2, \src3, #(2*\offset)
>> +        mul             v22.8h, v22.8h, v0.h[\offset]
>> +        ext             v23.16b, \src5, \src6, #(2*\offset)
>> +        mul             v21.8h, v21.8h, v0.h[\offset]
>> +        mul             v23.8h, v23.8h, v0.h[\offset]
>> +.else
>> +        mul             v20.8h, v20.8h, v0.h[\offset]
>> +        mul             v22.8h, v22.8h, v0.h[\offset]
>> +.endif
>> +        sqadd           \dst1, \dst1, v20.8h
>> +        sqadd           \dst3, \dst3, v22.8h
>> +.if \size >= 16
>> +        sqadd           \dst2, \dst2, v21.8h
>> +        sqadd           \dst4, \dst4, v23.8h
>> +.endif
>> +.endm
>> +
>> +
>> +// Instantiate a horizontal filter function for the given size.
>> +// This can work on 4, 8 or 16 pixels in parallel; for larger
>> +// widths it will do 16 pixels at a time and loop horizontally.
>> +// The actual width is passed in x5, the height in w4 and the
>> +// filter coefficients in x9. idx2 is the index of the largest
>> +// filter coefficient (3 or 4) and idx1 is the other one of them.
>> +.macro do_8tap_h type, size, idx1, idx2
>> +function \type\()_8tap_\size\()h_\idx1\idx2
>> +        sub             x2,  x2,  #3
>> +        add             x6,  x0,  x1
>> +        add             x7,  x2,  x3
>> +        add             x1,  x1,  x1
>> +        add             x3,  x3,  x3
>> +        // Only size >= 16 loops horizontally and needs
>> +        // reduced dst stride
>> +.if \size >= 16
>> +        sub             x1,  x1,  x5
>> +.endif
>> +        // size >= 16 loads two qwords and increments x2,
>> +        // for size 4/8 one qword without any
>> +        // postincrement is enough
>> +.if \size >= 16
>> +        sub             x3,  x3,  x5
>> +        sub             x3,  x3,  #8
>> +.endif
>> +        // Load the filter vector
>> +        ld1             {v0.8h},  [x9]
>> +1:
>> +.if \size >= 16
>> +        mov             x9,  x5
>> +.endif
>> +        // Load src
>> +.if \size >= 16
>> +        ld1             {v4.16b},  [x2], #16
>> +        ld1             {v16.16b}, [x7], #16
>> +        ld1             {v6.8b},   [x2], #8
>> +        ld1             {v18.8b},  [x7], #8
>
> ld1 {v4,  v5,  v6}, ...
> ld1 {v16, v17, v18}, ...
>
> and adapt the rest

Done
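
I.e. loading the 24 source bytes per row in one go and widening each third
separately, roughly (sketch):

        ld1             {v4.8b,  v5.8b,  v6.8b},  [x2], #24
        ld1             {v16.8b, v17.8b, v18.8b}, [x7], #24
        uxtl            v4.8h,  v4.8b
        uxtl            v5.8h,  v5.8b
        uxtl            v6.8h,  v6.8b
        uxtl            v16.8h, v16.8b
        uxtl            v17.8h, v17.8b
        uxtl            v18.8h, v18.8b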

>> +.else
>> +        ld1             {v4.16b},  [x2]
>> +        ld1             {v16.16b}, [x7]
>> +.endif
>> +        uxtl2           v5.8h,  v4.16b
>> +        uxtl            v4.8h,  v4.8b
>> +        uxtl2           v17.8h, v16.16b
>> +        uxtl            v16.8h, v16.8b
>> +.if \size >= 16
>> +        uxtl            v6.8h,  v6.8b
>> +        uxtl            v18.8h, v18.8b
>> +.endif
>> +2:
>> +
>> +        // Accumulate, adding idx2 last with a separate
>> +        // saturating add. The positive filter coefficients
>> +        // for all indices except idx2 must add up to less
>> +        // than 127 for this not to overflow.
>> +        mul             v1.8h,  v4.8h,  v0.h[0]
>> +        mul             v24.8h, v16.8h, v0.h[0]
>> +.if \size >= 16
>> +        mul             v2.8h,  v5.8h,  v0.h[0]
>> +        mul             v25.8h, v17.8h, v0.h[0]
>> +.endif
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
>> +        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
>> +        extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
>> +
>> +        // Round, shift and saturate
>> +        sqrshrun        v1.8b,   v1.8h,  #7
>> +        sqrshrun        v24.8b,  v24.8h, #7
>> +.if \size >= 16
>> +        sqrshrun2       v1.16b,  v2.8h,  #7
>> +        sqrshrun2       v24.16b, v25.8h, #7
>> +.endif
>> +        // Average
>> +.ifc \type,avg
>> +.if \size >= 16
>> +        ld1             {v2.16b}, [x0]
>> +        ld1             {v3.16b}, [x6]
>> +        urhadd          v1.16b,  v1.16b,  v2.16b
>> +        urhadd          v24.16b, v24.16b, v3.16b
>> +.elseif \size == 8
>> +        ld1             {v2.8b},  [x0]
>> +        ld1             {v3.8b},  [x6]
>> +        urhadd          v1.8b,  v1.8b,  v2.8b
>> +        urhadd          v24.8b, v24.8b, v3.8b
>> +.else
>> +        ld1r            {v2.2s},  [x0]
>> +        ld1r            {v3.2s},  [x6]
>> +        urhadd          v1.8b,  v1.8b,  v2.8b
>> +        urhadd          v24.8b, v24.8b, v3.8b
>> +.endif
>> +.endif
>> +        // Store and loop horizontally (for size >= 16)
>> +.if \size >= 16
>> +        st1             {v1.16b},  [x0], #16
>> +        st1             {v24.16b}, [x6], #16
>> +        mov             v4.16b,  v6.16b
>> +        mov             v16.16b, v18.16b
>> +        subs            x9,  x9,  #16
>> +        beq             3f
>
> you can branch before the 2 movs

Indeed. And moving the subs to before the st1s also seems to help a little

>> +        ld1             {v6.16b},  [x2], #16
>> +        ld1             {v18.16b}, [x7], #16
>> +        uxtl            v5.8h,  v6.8b
>> +        uxtl2           v6.8h,  v6.16b
>> +        uxtl            v17.8h, v18.8b
>> +        uxtl2           v18.8h, v18.16b
>> +        b               2b
>> +.elseif \size == 8
>> +        st1             {v1.8b},    [x0]
>> +        st1             {v24.8b},   [x6]
>> +.else // \size == 4
>> +        st1             {v1.s}[0],  [x0]
>> +        st1             {v24.s}[0], [x6]
>> +.endif
>> +3:
>> +        // Loop vertically
>> +        add             x0,  x0,  x1
>> +        add             x6,  x6,  x1
>> +        add             x2,  x2,  x3
>> +        add             x7,  x7,  x3
>> +        subs            w4,  w4,  #2
>> +        b.ne            1b
>> +        ret
>> +endfunc
>> +.endm
>> +
>> +.macro do_8tap_h_size size
>> +do_8tap_h put, \size, 3, 4
>> +do_8tap_h avg, \size, 3, 4
>> +do_8tap_h put, \size, 4, 3
>> +do_8tap_h avg, \size, 4, 3
>> +.endm
>> +
>> +do_8tap_h_size 4
>> +do_8tap_h_size 8
>> +do_8tap_h_size 16
>> +
>> +.macro do_8tap_h_func type, filter, size
>> +function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
>> +        movrel          x6,  \filter\()_filter-16
>> +        cmp             w5,  #8
>> +        add             x9,  x6,  w5, uxtw #4
>> +        mov             x5,  #\size
>> +.if \size >= 16
>> +        bge             \type\()_8tap_16h_34
>> +        b               \type\()_8tap_16h_43
>> +.else
>> +        bge             \type\()_8tap_\size\()h_34
>> +        b               \type\()_8tap_\size\()h_43
>> +.endif
>> +endfunc
>> +.endm
>> +
>> +.macro do_8tap_h_filters size
>> +do_8tap_h_func put, regular, \size
>> +do_8tap_h_func avg, regular, \size
>> +do_8tap_h_func put, sharp,   \size
>> +do_8tap_h_func avg, sharp,   \size
>> +do_8tap_h_func put, smooth,  \size
>> +do_8tap_h_func avg, smooth,  \size
>> +.endm
>> +
>> +do_8tap_h_filters 64
>> +do_8tap_h_filters 32
>> +do_8tap_h_filters 16
>> +do_8tap_h_filters 8
>> +do_8tap_h_filters 4
>> +
>> +
>> +// Vertical filters
>> +
>> +// Round, shift and saturate and store reg1-reg2 over 4 lines
>> +.macro do_store4 reg1, reg2, tmp1, tmp2, type
>> +        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
>> +        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
>> +.ifc \type,avg
>> +        ld1r            {\tmp1\().2s},     [x0], x1
>> +        ld1r            {\tmp2\().2s},     [x0], x1
>> +        ld1             {\tmp1\().s}[1],  [x0], x1
>> +        ld1             {\tmp2\().s}[1],  [x0], x1
>
> use a copy of x0 for loading in avg

This helps surprisingly much, around 4 cycles.

On arm, it is a net gain for the larger filters, but a loss for the size 4 
case (since we need to push another register), so I'll skip doing it 
there.
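
For the do_store4 avg path that ends up roughly like this (a sketch; using
x7 as the scratch pointer here, assuming it is free at that point, which
would need checking):

.ifc \type,avg
        mov             x7,  x0
        ld1r            {\tmp1\().2s},    [x7], x1
        ld1r            {\tmp2\().2s},    [x7], x1
        ld1             {\tmp1\().s}[1],  [x7], x1
        ld1             {\tmp2\().s}[1],  [x7], x1
        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
.endif
        st1             {\reg1\().s}[0],  [x0], x1
        ...

so the trailing sub and the serialized dependency on x0 go away.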

>> +        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
>> +        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
>> +        sub             x0,  x0,  x1, lsl #2
>> +.endif
>> +        st1             {\reg1\().s}[0],  [x0], x1
>> +        st1             {\reg2\().s}[0],  [x0], x1
>> +        st1             {\reg1\().s}[1],  [x0], x1
>> +        st1             {\reg2\().s}[1],  [x0], x1
>> +.endm
>> +
>> +// Round, shift and saturate and store reg1-4
>> +.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
>> +        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
>> +        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
>> +        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
>> +        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
>> +.ifc \type,avg
>> +        ld1             {\tmp1\().8b},  [x0], x1
>> +        ld1             {\tmp2\().8b},  [x0], x1
>> +        ld1             {\tmp3\().8b},  [x0], x1
>> +        ld1             {\tmp4\().8b},  [x0], x1
>
> use a copy of x0

Ditto

>> +        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
>> +        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
>> +        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
>> +        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
>> +        sub             x0,  x0,  x1, lsl #2
>> +.endif
>> +        st1             {\reg1\().8b},  [x0], x1
>> +        st1             {\reg2\().8b},  [x0], x1
>> +        st1             {\reg3\().8b},  [x0], x1
>> +        st1             {\reg4\().8b},  [x0], x1
>> +.endm
>
> ...
>
>> +// Instantiate a vertical filter function for filtering a 4 pixels wide
>> +// slice. The first half of the registers contain one row, while the second
>> +// half of a register contains the second-next row (also stored in the first
>> +// half of the register two steps ahead). The convolution does two outputs
>> +// at a time; the output of v17-v24 into one, and v18-v25 into another one.
>> +// The first half of first output is the first output row, the first half
>> +// of the other output is the second output row. The second halves of the
>> +// registers are rows 3 and 4.
>> +// This is only designed to work for 4 or 8 output lines.
>> +.macro do_8tap_4v type, idx1, idx2
>> +function \type\()_8tap_4v_\idx1\idx2
>> +        sub             x2,  x2,  x3, lsl #1
>> +        sub             x2,  x2,  x3
>> +        ld1             {v0.8h},  [x6]
>> +
>> +        ld1r            {v1.2s},    [x2], x3
>> +        ld1r            {v2.2s},    [x2], x3
>> +        ld1r            {v3.2s},    [x2], x3
>> +        ld1r            {v4.2s},    [x2], x3
>> +        ld1r            {v5.2s},    [x2], x3
>> +        ld1r            {v6.2s},    [x2], x3
>> +        ext             v1.8b,  v1.8b,  v3.8b,  #4
>
> I think '{zip,trn}1 v1.2s, v1.2s, v3.2s' is the clearer pattern on
> 64-bit. it works with ld1 {v0.s}[0] in case there is no difference
> between single-lane ld1 and ld1r

Done
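
I.e. loading each row into lane 0 and pairing rows with trn1 instead of the
ld1r + ext dance, roughly (sketch):

        ld1             {v1.s}[0],  [x2], x3
        ld1             {v2.s}[0],  [x2], x3
        ld1             {v3.s}[0],  [x2], x3
        ld1             {v4.s}[0],  [x2], x3
        trn1            v1.2s,  v1.2s,  v3.2s      // rows 0 and 2 side by side
        trn1            v2.2s,  v2.2s,  v4.2s      // rows 1 and 3
        uxtl            v17.8h, v1.8b
        uxtl            v18.8h, v2.8b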

>> +        ld1r            {v7.2s},    [x2], x3
>> +        ext             v2.8b,  v2.8b,  v4.8b,  #4
>> +        ld1r            {v30.2s},   [x2], x3
>> +        uxtl            v17.8h, v1.8b
>> +        ext             v3.8b,  v3.8b,  v5.8b,  #4
>> +        ld1r            {v31.2s},   [x2], x3
>> +        uxtl            v18.8h, v2.8b
>> +        ext             v4.8b,  v4.8b,  v6.8b,  #4
>> +        uxtl            v19.8h, v3.8b
>> +        ext             v5.8b,  v5.8b,  v7.8b,  #4
>> +        ld1             {v30.s}[1], [x2], x3
>
> same applies as for the 32-bit version. we should do the load and
> combine pattern for all but the last two loads. small cost for height ==
> 4, larger gain for height == 8

Done

// Martin

Patch

diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile
index 764bedc..6f1227a 100644
--- a/libavcodec/aarch64/Makefile
+++ b/libavcodec/aarch64/Makefile
@@ -17,6 +17,7 @@  OBJS-$(CONFIG_DCA_DECODER)              += aarch64/dcadsp_init.o
 OBJS-$(CONFIG_RV40_DECODER)             += aarch64/rv40dsp_init_aarch64.o
 OBJS-$(CONFIG_VC1_DECODER)              += aarch64/vc1dsp_init_aarch64.o
 OBJS-$(CONFIG_VORBIS_DECODER)           += aarch64/vorbisdsp_init.o
+OBJS-$(CONFIG_VP9_DECODER)              += aarch64/vp9dsp_init_aarch64.o
 
 # ARMv8 optimizations
 
@@ -43,3 +44,4 @@  NEON-OBJS-$(CONFIG_MPEGAUDIODSP)        += aarch64/mpegaudiodsp_neon.o
 NEON-OBJS-$(CONFIG_DCA_DECODER)         += aarch64/dcadsp_neon.o               \
                                            aarch64/synth_filter_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)      += aarch64/vorbisdsp_neon.o
+NEON-OBJS-$(CONFIG_VP9_DECODER)         += aarch64/vp9mc_neon.o
diff --git a/libavcodec/aarch64/vp9dsp_init_aarch64.c b/libavcodec/aarch64/vp9dsp_init_aarch64.c
new file mode 100644
index 0000000..3d414af
--- /dev/null
+++ b/libavcodec/aarch64/vp9dsp_init_aarch64.c
@@ -0,0 +1,139 @@ 
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include <stdint.h>
+
+#include "libavutil/attributes.h"
+#include "libavutil/aarch64/cpu.h"
+#include "libavcodec/vp9.h"
+
+#define declare_fpel(type, sz)                                          \
+void ff_vp9_##type##sz##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                              const uint8_t *src, ptrdiff_t src_stride, \
+                              int h, int mx, int my)
+
+#define declare_copy_avg(sz) \
+    declare_fpel(copy, sz);  \
+    declare_fpel(avg , sz)
+
+#define decl_mc_func(op, filter, dir, sz)                                                \
+void ff_vp9_##op##_##filter##sz##_##dir##_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                               const uint8_t *src, ptrdiff_t src_stride, \
+                                               int h, int mx, int my)
+
+#define define_8tap_2d_fn(op, filter, sz)                                         \
+static void op##_##filter##sz##_hv_neon(uint8_t *dst, ptrdiff_t dst_stride,       \
+                                        const uint8_t *src, ptrdiff_t src_stride, \
+                                        int h, int mx, int my)                    \
+{                                                                                 \
+    LOCAL_ALIGNED_16(uint8_t, temp, [((sz < 64 ? 2 * sz : 64) + 8) * sz]);        \
+    /* We only need h + 7 lines, but the horizontal filter assumes an             \
+     * even number of rows, so filter h + 8 lines here. */                        \
+    ff_vp9_put_##filter##sz##_h_neon(temp, sz,                                    \
+                                     src - 3 * src_stride, src_stride,            \
+                                     h + 8, mx, 0);                               \
+    ff_vp9_##op##_##filter##sz##_v_neon(dst, dst_stride,                          \
+                                        temp + 3 * sz, sz,                        \
+                                        h, 0, my);                                \
+}
+
+#define decl_filter_funcs(op, dir, sz)  \
+    decl_mc_func(op, regular, dir, sz); \
+    decl_mc_func(op, sharp,   dir, sz); \
+    decl_mc_func(op, smooth,  dir, sz)
+
+#define decl_mc_funcs(sz)           \
+    decl_filter_funcs(put, h,  sz); \
+    decl_filter_funcs(avg, h,  sz); \
+    decl_filter_funcs(put, v,  sz); \
+    decl_filter_funcs(avg, v,  sz); \
+    decl_filter_funcs(put, hv, sz); \
+    decl_filter_funcs(avg, hv, sz)
+
+declare_copy_avg(64);
+declare_copy_avg(32);
+declare_copy_avg(16);
+declare_copy_avg(8);
+declare_copy_avg(4);
+
+decl_mc_funcs(64);
+decl_mc_funcs(32);
+decl_mc_funcs(16);
+decl_mc_funcs(8);
+decl_mc_funcs(4);
+
+#define define_8tap_2d_funcs(sz)        \
+    define_8tap_2d_fn(put, regular, sz) \
+    define_8tap_2d_fn(put, sharp,   sz) \
+    define_8tap_2d_fn(put, smooth,  sz) \
+    define_8tap_2d_fn(avg, regular, sz) \
+    define_8tap_2d_fn(avg, sharp,   sz) \
+    define_8tap_2d_fn(avg, smooth,  sz)
+
+define_8tap_2d_funcs(64)
+define_8tap_2d_funcs(32)
+define_8tap_2d_funcs(16)
+define_8tap_2d_funcs(8)
+define_8tap_2d_funcs(4)
+
+av_cold void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_fpel(idx1, idx2, sz, type) \
+    dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_8TAP_SHARP  ][idx2][0][0] = \
+    dsp->mc[idx1][FILTER_BILINEAR    ][idx2][0][0] = ff_vp9_##type##sz##_neon
+
+#define init_copy_avg(idx, sz) \
+    init_fpel(idx, 0, sz, copy); \
+    init_fpel(idx, 1, sz, avg)
+
+#define init_mc_func(idx1, idx2, op, filter, fname, dir, mx, my, sz, pfx) \
+    dsp->mc[idx1][filter][idx2][mx][my] = pfx##op##_##fname##sz##_##dir##_neon
+
+#define init_mc_funcs(idx, dir, mx, my, sz, pfx) \
+    init_mc_func(idx, 0, put, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 0, put, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_REGULAR, regular, dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SHARP,   sharp,   dir, mx, my, sz, pfx); \
+    init_mc_func(idx, 1, avg, FILTER_8TAP_SMOOTH,  smooth,  dir, mx, my, sz, pfx)
+
+#define init_mc_funcs_dirs(idx, sz) \
+    init_mc_funcs(idx, h,  1, 0, sz, ff_vp9_); \
+    init_mc_funcs(idx, v,  0, 1, sz, ff_vp9_); \
+    init_mc_funcs(idx, hv, 1, 1, sz,)
+
+        init_copy_avg(0, 64);
+        init_copy_avg(1, 32);
+        init_copy_avg(2, 16);
+        init_copy_avg(3, 8);
+        init_copy_avg(4, 4);
+
+        init_mc_funcs_dirs(0, 64);
+        init_mc_funcs_dirs(1, 32);
+        init_mc_funcs_dirs(2, 16);
+        init_mc_funcs_dirs(3, 8);
+        init_mc_funcs_dirs(4, 4);
+    }
+}
diff --git a/libavcodec/aarch64/vp9mc_neon.S b/libavcodec/aarch64/vp9mc_neon.S
new file mode 100644
index 0000000..9ca2a32
--- /dev/null
+++ b/libavcodec/aarch64/vp9mc_neon.S
@@ -0,0 +1,733 @@ 
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/aarch64/asm.S"
+
+const regular_filter, align=4
+        .short  0,  1,  -5, 126,   8,  -3,  1,  0
+        .short -1,  3, -10, 122,  18,  -6,  2,  0
+        .short -1,  4, -13, 118,  27,  -9,  3, -1
+        .short -1,  4, -16, 112,  37, -11,  4, -1
+        .short -1,  5, -18, 105,  48, -14,  4, -1
+        .short -1,  5, -19,  97,  58, -16,  5, -1
+        .short -1,  6, -19,  88,  68, -18,  5, -1
+        .short -1,  6, -19,  78,  78, -19,  6, -1
+        .short -1,  5, -18,  68,  88, -19,  6, -1
+        .short -1,  5, -16,  58,  97, -19,  5, -1
+        .short -1,  4, -14,  48, 105, -18,  5, -1
+        .short -1,  4, -11,  37, 112, -16,  4, -1
+        .short -1,  3,  -9,  27, 118, -13,  4, -1
+        .short  0,  2,  -6,  18, 122, -10,  3, -1
+        .short  0,  1,  -3,   8, 126,  -5,  1,  0
+endconst
+
+const sharp_filter, align=4
+        .short -1,  3,  -7, 127,   8,  -3,  1,  0
+        .short -2,  5, -13, 125,  17,  -6,  3, -1
+        .short -3,  7, -17, 121,  27, -10,  5, -2
+        .short -4,  9, -20, 115,  37, -13,  6, -2
+        .short -4, 10, -23, 108,  48, -16,  8, -3
+        .short -4, 10, -24, 100,  59, -19,  9, -3
+        .short -4, 11, -24,  90,  70, -21, 10, -4
+        .short -4, 11, -23,  80,  80, -23, 11, -4
+        .short -4, 10, -21,  70,  90, -24, 11, -4
+        .short -3,  9, -19,  59, 100, -24, 10, -4
+        .short -3,  8, -16,  48, 108, -23, 10, -4
+        .short -2,  6, -13,  37, 115, -20,  9, -4
+        .short -2,  5, -10,  27, 121, -17,  7, -3
+        .short -1,  3,  -6,  17, 125, -13,  5, -2
+        .short  0,  1,  -3,   8, 127,  -7,  3, -1
+endconst
+
+const smooth_filter, align=4
+        .short -3, -1,  32,  64,  38,   1, -3,  0
+        .short -2, -2,  29,  63,  41,   2, -3,  0
+        .short -2, -2,  26,  63,  43,   4, -4,  0
+        .short -2, -3,  24,  62,  46,   5, -4,  0
+        .short -2, -3,  21,  60,  49,   7, -4,  0
+        .short -1, -4,  18,  59,  51,   9, -4,  0
+        .short -1, -4,  16,  57,  53,  12, -4, -1
+        .short -1, -4,  14,  55,  55,  14, -4, -1
+        .short -1, -4,  12,  53,  57,  16, -4, -1
+        .short  0, -4,   9,  51,  59,  18, -4, -1
+        .short  0, -4,   7,  49,  60,  21, -3, -2
+        .short  0, -4,   5,  46,  62,  24, -3, -2
+        .short  0, -4,   4,  43,  63,  26, -2, -2
+        .short  0, -3,   2,  41,  63,  29, -2, -2
+        .short  0, -3,   1,  38,  64,  32, -1, -3
+endconst
+
+// All public functions in this file have the following signature:
+// typedef void (*vp9_mc_func)(uint8_t *dst, ptrdiff_t dst_stride,
+//                            const uint8_t *ref, ptrdiff_t ref_stride,
+//                            int h, int mx, int my);
+
+function ff_vp9_copy64_neon, export=1
+1:
+        ldp             x5,  x6,  [x2]
+        stp             x5,  x6,  [x0]
+        ldp             x5,  x6,  [x2, #16]
+        stp             x5,  x6,  [x0, #16]
+        subs            w4,  w4,  #1
+        ldp             x5,  x6,  [x2, #32]
+        stp             x5,  x6,  [x0, #32]
+        ldp             x5,  x6,  [x2, #48]
+        stp             x5,  x6,  [x0, #48]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg64_neon, export=1
+        mov             x5,  x0
+1:
+        ld1             {v4.16b,  v5.16b,  v6.16b,  v7.16b},  [x2], x3
+        ld1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x0], x1
+        ld1             {v20.16b, v21.16b, v22.16b, v23.16b}, [x2], x3
+        urhadd          v0.16b,  v0.16b,  v4.16b
+        urhadd          v1.16b,  v1.16b,  v5.16b
+        ld1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x0], x1
+        urhadd          v2.16b,  v2.16b,  v6.16b
+        urhadd          v3.16b,  v3.16b,  v7.16b
+        subs            w4,  w4,  #2
+        urhadd          v16.16b, v16.16b, v20.16b
+        urhadd          v17.16b, v17.16b, v21.16b
+        st1             {v0.16b,  v1.16b,  v2.16b,  v3.16b},  [x5], x1
+        urhadd          v18.16b, v18.16b, v22.16b
+        urhadd          v19.16b, v19.16b, v23.16b
+        st1             {v16.16b, v17.16b, v18.16b, v19.16b}, [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_copy32_neon, export=1
+1:
+        ldp             x5,  x6,  [x2]
+        stp             x5,  x6,  [x0]
+        subs            w4,  w4,  #1
+        ldp             x5,  x6,  [x2, #16]
+        stp             x5,  x6,  [x0, #16]
+        add             x2,  x2,  x3
+        add             x0,  x0,  x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg32_neon, export=1
+1:
+        ld1             {v2.16b, v3.16b},  [x2], x3
+        ld1             {v0.16b, v1.16b},  [x0]
+        urhadd          v0.16b,  v0.16b,  v2.16b
+        urhadd          v1.16b,  v1.16b,  v3.16b
+        subs            w4,  w4,  #1
+        st1             {v0.16b, v1.16b},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_copy16_neon, export=1
+        add             x5,  x0,  x1
+        add             x1,  x1,  x1
+        add             x6,  x2,  x3
+        add             x3,  x3,  x3
+1:
+        ld1             {v0.16b},  [x2], x3
+        ld1             {v1.16b},  [x6], x3
+        ld1             {v2.16b},  [x2], x3
+        ld1             {v3.16b},  [x6], x3
+        subs            w4,  w4,  #4
+        st1             {v0.16b},  [x0], x1
+        st1             {v1.16b},  [x5], x1
+        st1             {v2.16b},  [x0], x1
+        st1             {v3.16b},  [x5], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg16_neon, export=1
+1:
+        ld1             {v2.16b},  [x2], x3
+        ld1             {v0.16b},  [x0], x1
+        ld1             {v3.16b},  [x2], x3
+        urhadd          v0.16b,  v0.16b,  v2.16b
+        ld1             {v1.16b},  [x0]
+        sub             x0,  x0,  x1
+        urhadd          v1.16b,  v1.16b,  v3.16b
+        subs            w4,  w4,  #2
+        st1             {v0.16b},  [x0], x1
+        st1             {v1.16b},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_copy8_neon, export=1
+1:
+        ld1             {v0.8b},  [x2], x3
+        ld1             {v1.8b},  [x2], x3
+        subs            w4,  w4,  #2
+        st1             {v0.8b},  [x0], x1
+        st1             {v1.8b},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg8_neon, export=1
+1:
+        ld1             {v2.8b},  [x2], x3
+        ld1             {v0.8b},  [x0], x1
+        ld1             {v3.8b},  [x2], x3
+        urhadd          v0.8b,  v0.8b,  v2.8b
+        ld1             {v1.8b},  [x0]
+        sub             x0,  x0,  x1
+        urhadd          v1.8b,  v1.8b,  v3.8b
+        subs            w4,  w4,  #2
+        st1             {v0.8b},  [x0], x1
+        st1             {v1.8b},  [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_copy4_neon, export=1
+1:
+        ld1             {v0.s}[0], [x2], x3
+        ld1             {v1.s}[0], [x2], x3
+        st1             {v0.s}[0], [x0], x1
+        ld1             {v2.s}[0], [x2], x3
+        st1             {v1.s}[0], [x0], x1
+        ld1             {v3.s}[0], [x2], x3
+        subs            w4,  w4,  #4
+        st1             {v2.s}[0], [x0], x1
+        st1             {v3.s}[0], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+function ff_vp9_avg4_neon, export=1
+1:
+        ld1r            {v4.2s}, [x2], x3
+        ld1r            {v0.2s}, [x0], x1
+        ld1r            {v5.2s}, [x2], x3
+        ld1r            {v1.2s}, [x0], x1
+        ld1r            {v6.2s}, [x2], x3
+        ld1r            {v2.2s}, [x0], x1
+        ld1r            {v7.2s}, [x2], x3
+        ld1r            {v3.2s}, [x0], x1
+        sub             x0,  x0,  x1, lsl #2
+        subs            w4,  w4,  #4
+        urhadd          v0.8b,  v0.8b,  v4.8b
+        urhadd          v1.8b,  v1.8b,  v5.8b
+        urhadd          v2.8b,  v2.8b,  v6.8b
+        urhadd          v3.8b,  v3.8b,  v7.8b
+        st1             {v0.s}[0], [x0], x1
+        st1             {v1.s}[0], [x0], x1
+        st1             {v2.s}[0], [x0], x1
+        st1             {v3.s}[0], [x0], x1
+        b.ne            1b
+        ret
+endfunc
+
+
+// Extract a vector from src1-src2 and src4-src5 (src1-src3 and src4-src6
+// for size >= 16), and multiply-accumulate into dst1 and dst3 (or
+// dst1-dst2 and dst3-dst4 for size >= 16)
+.macro extmla dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
+        ext             v20.16b, \src1, \src2, #(2*\offset)
+        ext             v22.16b, \src4, \src5, #(2*\offset)
+.if \size >= 16
+        mla             \dst1, v20.8h, v0.h[\offset]
+        ext             v21.16b, \src2, \src3, #(2*\offset)
+        mla             \dst3, v22.8h, v0.h[\offset]
+        ext             v23.16b, \src5, \src6, #(2*\offset)
+        mla             \dst2, v21.8h, v0.h[\offset]
+        mla             \dst4, v23.8h, v0.h[\offset]
+.else
+        mla             \dst1, v20.8h, v0.h[\offset]
+        mla             \dst3, v22.8h, v0.h[\offset]
+.endif
+.endm
+// The same as above, but instead of accumulating straight into the
+// destination, use a temp register and accumulate with saturation.
+.macro extmulqadd dst1, dst2, dst3, dst4, src1, src2, src3, src4, src5, src6, offset, size
+        ext             v20.16b, \src1, \src2, #(2*\offset)
+        ext             v22.16b, \src4, \src5, #(2*\offset)
+.if \size >= 16
+        mul             v20.8h, v20.8h, v0.h[\offset]
+        ext             v21.16b, \src2, \src3, #(2*\offset)
+        mul             v22.8h, v22.8h, v0.h[\offset]
+        ext             v23.16b, \src5, \src6, #(2*\offset)
+        mul             v21.8h, v21.8h, v0.h[\offset]
+        mul             v23.8h, v23.8h, v0.h[\offset]
+.else
+        mul             v20.8h, v20.8h, v0.h[\offset]
+        mul             v22.8h, v22.8h, v0.h[\offset]
+.endif
+        sqadd           \dst1, \dst1, v20.8h
+        sqadd           \dst3, \dst3, v22.8h
+.if \size >= 16
+        sqadd           \dst2, \dst2, v21.8h
+        sqadd           \dst4, \dst4, v23.8h
+.endif
+.endm
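+// As an example of what the extractions above do: the sources hold
+// 16 bit pixels, so with offset == 2 and size == 8 the first ext forms
+// roughly [src1[2..7], src2[0..1]] (in 16 bit elements), i.e. the input
+// window shifted down by two pixels, which is then multiplied by
+// coefficient 2 of the filter and accumulated.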
+
+
+// Instantiate a horizontal filter function for the given size.
+// This can work on 4, 8 or 16 pixels in parallel; for larger
+// widths it will do 16 pixels at a time and loop horizontally.
+// The actual width is passed in x5, the height in w4 and the
+// filter coefficients in x9. idx2 is the index of the largest
+// filter coefficient (3 or 4) and idx1 is the other one.
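+// Two rows are processed per pass of the vertical loop: x0/x2 address
+// one row and x6/x7 the row below it, with both strides doubled
+// accordingly.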
+.macro do_8tap_h type, size, idx1, idx2
+function \type\()_8tap_\size\()h_\idx1\idx2
+        sub             x2,  x2,  #3
+        add             x6,  x0,  x1
+        add             x7,  x2,  x3
+        add             x1,  x1,  x1
+        add             x3,  x3,  x3
+        // Only size >= 16 loops horizontally and needs
+        // a reduced dst stride
+.if \size >= 16
+        sub             x1,  x1,  x5
+.endif
+        // size >= 16 loads two qwords and increments x2,
+        // for size 4/8 one qword is enough and no
+        // postincrement is needed
+.if \size >= 16
+        sub             x3,  x3,  x5
+        sub             x3,  x3,  #8
+.endif
+        // Load the filter vector
+        ld1             {v0.8h},  [x9]
+1:
+.if \size >= 16
+        mov             x9,  x5
+.endif
+        // Load src
+.if \size >= 16
+        ld1             {v4.16b},  [x2], #16
+        ld1             {v16.16b}, [x7], #16
+        ld1             {v6.8b},   [x2], #8
+        ld1             {v18.8b},  [x7], #8
+.else
+        ld1             {v4.16b},  [x2]
+        ld1             {v16.16b}, [x7]
+.endif
+        uxtl2           v5.8h,  v4.16b
+        uxtl            v4.8h,  v4.8b
+        uxtl2           v17.8h, v16.16b
+        uxtl            v16.8h, v16.8b
+.if \size >= 16
+        uxtl            v6.8h,  v6.8b
+        uxtl            v18.8h, v18.8b
+.endif
+2:
+
+        // Accumulate, adding idx2 last with a separate
+        // saturating add. The positive filter coefficients
+        // for all indices except idx2 must add up to less
+        // than 127 for this not to overflow.
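+        // (With 8 bit input pixels this keeps the mla sums within
+        // roughly 255 * 127 = 32385 < 32767, i.e. within signed 16 bit;
+        // the remaining large coefficient is then added via sqadd in
+        // extmulqadd, so any overflow there saturates instead of
+        // wrapping.)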
+        mul             v1.8h,  v4.8h,  v0.h[0]
+        mul             v24.8h, v16.8h, v0.h[0]
+.if \size >= 16
+        mul             v2.8h,  v5.8h,  v0.h[0]
+        mul             v25.8h, v17.8h, v0.h[0]
+.endif
+        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 1,     \size
+        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 2,     \size
+        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx1, \size
+        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 5,     \size
+        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 6,     \size
+        extmla          v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, 7,     \size
+        extmulqadd      v1.8h,  v2.8h,  v24.8h, v25.8h, v4.16b,  v5.16b,  v6.16b,  v16.16b, v17.16b, v18.16b, \idx2, \size
+
+        // Round, shift and saturate
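+        // (the filter taps sum to 128, i.e. 7 bit fixed point, hence the
+        // rounding right shift by 7 plus unsigned saturation back down
+        // to 8 bit pixels)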
+        sqrshrun        v1.8b,   v1.8h,  #7
+        sqrshrun        v24.8b,  v24.8h, #7
+.if \size >= 16
+        sqrshrun2       v1.16b,  v2.8h,  #7
+        sqrshrun2       v24.16b, v25.8h, #7
+.endif
+        // Average
+.ifc \type,avg
+.if \size >= 16
+        ld1             {v2.16b}, [x0]
+        ld1             {v3.16b}, [x6]
+        urhadd          v1.16b,  v1.16b,  v2.16b
+        urhadd          v24.16b, v24.16b, v3.16b
+.elseif \size == 8
+        ld1             {v2.8b},  [x0]
+        ld1             {v3.8b},  [x6]
+        urhadd          v1.8b,  v1.8b,  v2.8b
+        urhadd          v24.8b, v24.8b, v3.8b
+.else
+        ld1r            {v2.2s},  [x0]
+        ld1r            {v3.2s},  [x6]
+        urhadd          v1.8b,  v1.8b,  v2.8b
+        urhadd          v24.8b, v24.8b, v3.8b
+.endif
+.endif
+        // Store and loop horizontally (for size >= 16)
+.if \size >= 16
+        st1             {v1.16b},  [x0], #16
+        st1             {v24.16b}, [x6], #16
+        mov             v4.16b,  v6.16b
+        mov             v16.16b, v18.16b
+        subs            x9,  x9,  #16
+        b.eq            3f
+        ld1             {v6.16b},  [x2], #16
+        ld1             {v18.16b}, [x7], #16
+        uxtl            v5.8h,  v6.8b
+        uxtl2           v6.8h,  v6.16b
+        uxtl            v17.8h, v18.8b
+        uxtl2           v18.8h, v18.16b
+        b               2b
+.elseif \size == 8
+        st1             {v1.8b},    [x0]
+        st1             {v24.8b},   [x6]
+.else // \size == 4
+        st1             {v1.s}[0],  [x0]
+        st1             {v24.s}[0], [x6]
+.endif
+3:
+        // Loop vertically
+        add             x0,  x0,  x1
+        add             x6,  x6,  x1
+        add             x2,  x2,  x3
+        add             x7,  x7,  x3
+        subs            w4,  w4,  #2
+        b.ne            1b
+        ret
+endfunc
+.endm
+
+.macro do_8tap_h_size size
+do_8tap_h put, \size, 3, 4
+do_8tap_h avg, \size, 3, 4
+do_8tap_h put, \size, 4, 3
+do_8tap_h avg, \size, 4, 3
+.endm
+
+do_8tap_h_size 4
+do_8tap_h_size 8
+do_8tap_h_size 16
+
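+// The public entry points below follow the vp9 MC calling convention,
+// as suggested by the register usage: x0 = dst, x1 = dst stride,
+// x2 = src, x3 = src stride, w4 = height and w5 = the horizontal subpel
+// position mx. Each row of the coefficient table is 8 x 16 bit =
+// 16 bytes, so "w5, uxtw #4" indexes it directly; the -16 bias
+// presumably accounts for the table starting at mx == 1 (mx == 0 is the
+// plain copy/avg case). For mx >= 8 the _34 variant is taken, treating
+// tap 4 as the largest coefficient, otherwise the _43 variant treats
+// tap 3 as the largest.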
+.macro do_8tap_h_func type, filter, size
+function ff_vp9_\type\()_\filter\()\size\()_h_neon, export=1
+        movrel          x6,  \filter\()_filter-16
+        cmp             w5,  #8
+        add             x9,  x6,  w5, uxtw #4
+        mov             x5,  #\size
+.if \size >= 16
+        b.ge            \type\()_8tap_16h_34
+        b               \type\()_8tap_16h_43
+.else
+        b.ge            \type\()_8tap_\size\()h_34
+        b               \type\()_8tap_\size\()h_43
+.endif
+endfunc
+.endm
+
+.macro do_8tap_h_filters size
+do_8tap_h_func put, regular, \size
+do_8tap_h_func avg, regular, \size
+do_8tap_h_func put, sharp,   \size
+do_8tap_h_func avg, sharp,   \size
+do_8tap_h_func put, smooth,  \size
+do_8tap_h_func avg, smooth,  \size
+.endm
+
+do_8tap_h_filters 64
+do_8tap_h_filters 32
+do_8tap_h_filters 16
+do_8tap_h_filters 8
+do_8tap_h_filters 4
+
+
+// Vertical filters
+
+// Round, shift, saturate and store reg1-reg2 over 4 lines
+.macro do_store4 reg1, reg2, tmp1, tmp2, type
+        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
+        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
+.ifc \type,avg
+        ld1r            {\tmp1\().2s},     [x0], x1
+        ld1r            {\tmp2\().2s},     [x0], x1
+        ld1             {\tmp1\().s}[1],  [x0], x1
+        ld1             {\tmp2\().s}[1],  [x0], x1
+        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
+        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
+        sub             x0,  x0,  x1, lsl #2
+.endif
+        st1             {\reg1\().s}[0],  [x0], x1
+        st1             {\reg2\().s}[0],  [x0], x1
+        st1             {\reg1\().s}[1],  [x0], x1
+        st1             {\reg2\().s}[1],  [x0], x1
+.endm
+
+// Round, shift, saturate and store reg1-reg4
+.macro do_store reg1, reg2, reg3, reg4, tmp1, tmp2, tmp3, tmp4, type
+        sqrshrun        \reg1\().8b,  \reg1\().8h, #7
+        sqrshrun        \reg2\().8b,  \reg2\().8h, #7
+        sqrshrun        \reg3\().8b,  \reg3\().8h, #7
+        sqrshrun        \reg4\().8b,  \reg4\().8h, #7
+.ifc \type,avg
+        ld1             {\tmp1\().8b},  [x0], x1
+        ld1             {\tmp2\().8b},  [x0], x1
+        ld1             {\tmp3\().8b},  [x0], x1
+        ld1             {\tmp4\().8b},  [x0], x1
+        urhadd          \reg1\().8b,  \reg1\().8b,  \tmp1\().8b
+        urhadd          \reg2\().8b,  \reg2\().8b,  \tmp2\().8b
+        urhadd          \reg3\().8b,  \reg3\().8b,  \tmp3\().8b
+        urhadd          \reg4\().8b,  \reg4\().8b,  \tmp4\().8b
+        sub             x0,  x0,  x1, lsl #2
+.endif
+        st1             {\reg1\().8b},  [x0], x1
+        st1             {\reg2\().8b},  [x0], x1
+        st1             {\reg3\().8b},  [x0], x1
+        st1             {\reg4\().8b},  [x0], x1
+.endm
+
+// Evaluate the filter twice in parallel, from the inputs src1-src9 into dst1-dst2
+// (src1-src8 into dst1, src2-src9 into dst2), adding idx2 separately
+// at the end with saturation. Indices 0 and 7 always have negative or zero
+// coefficients, so they can be accumulated into tmp1-tmp2 together with the
+// largest coefficient.
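+// For idx1 == 3/idx2 == 4, for instance, taps 1, 2, 3, 5 and 6 are
+// accumulated into dst1/dst2 with plain mla, while taps 0, 7 and the
+// large tap 4 are gathered in tmp1/tmp2 and folded in with a final
+// sqadd, so only that last addition needs to saturate.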
+.macro convolve dst1, dst2, src1, src2, src3, src4, src5, src6, src7, src8, src9, idx1, idx2, tmp1, tmp2
+        mul             \dst1\().8h, \src2\().8h, v0.h[1]
+        mul             \dst2\().8h, \src3\().8h, v0.h[1]
+        mul             \tmp1\().8h, \src1\().8h, v0.h[0]
+        mul             \tmp2\().8h, \src2\().8h, v0.h[0]
+        mla             \dst1\().8h, \src3\().8h, v0.h[2]
+        mla             \dst2\().8h, \src4\().8h, v0.h[2]
+.if \idx1 == 3
+        mla             \dst1\().8h, \src4\().8h, v0.h[3]
+        mla             \dst2\().8h, \src5\().8h, v0.h[3]
+.else
+        mla             \dst1\().8h, \src5\().8h, v0.h[4]
+        mla             \dst2\().8h, \src6\().8h, v0.h[4]
+.endif
+        mla             \dst1\().8h, \src6\().8h, v0.h[5]
+        mla             \dst2\().8h, \src7\().8h, v0.h[5]
+        mla             \tmp1\().8h, \src8\().8h, v0.h[7]
+        mla             \tmp2\().8h, \src9\().8h, v0.h[7]
+        mla             \dst1\().8h, \src7\().8h, v0.h[6]
+        mla             \dst2\().8h, \src8\().8h, v0.h[6]
+.if \idx2 == 3
+        mla             \tmp1\().8h, \src4\().8h, v0.h[3]
+        mla             \tmp2\().8h, \src5\().8h, v0.h[3]
+.else
+        mla             \tmp1\().8h, \src5\().8h, v0.h[4]
+        mla             \tmp2\().8h, \src6\().8h, v0.h[4]
+.endif
+        sqadd           \dst1\().8h, \dst1\().8h, \tmp1\().8h
+        sqadd           \dst2\().8h, \dst2\().8h, \tmp2\().8h
+.endm
+
+// Load pixels and extend them to 16 bit
+.macro loadl dst1, dst2, dst3, dst4
+        ld1             {v1.8b}, [x2], x3
+        ld1             {v2.8b}, [x2], x3
+        ld1             {v3.8b}, [x2], x3
+.ifnb \dst4
+        ld1             {v4.8b}, [x2], x3
+.endif
+        uxtl            \dst1\().8h, v1.8b
+        uxtl            \dst2\().8h, v2.8b
+        uxtl            \dst3\().8h, v3.8b
+.ifnb \dst4
+        uxtl            \dst4\().8h, v4.8b
+.endif
+.endm
+
+// Instantiate a vertical filter function for filtering 8 pixels at a time.
+// The height is passed in x4, the width in x5 and the filter coefficients
+// in x6. idx2 is the index of the largest filter coefficient (3 or 4)
+// and idx1 is the other one.
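+// The main loop keeps an eleven-row sliding window of widened input
+// rows in v16-v27; each pass loads four new rows with loadl and
+// produces four output rows with two convolve invocations, and the body
+// is unrolled three times so the window rotates through the register
+// names instead of moving data around.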
+.macro do_8tap_8v type, idx1, idx2
+function \type\()_8tap_8v_\idx1\idx2
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        ld1             {v0.8h},  [x6]
+1:
+        mov             x6,  x4
+
+        loadl           v17, v18, v19
+
+        loadl           v20, v21, v22, v23
+2:
+        loadl           v24, v25, v26, v27
+        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v5,  v6
+        convolve        v3,  v4,  v19, v20, v21, v22, v23, v24, v25, v26, v27, \idx1, \idx2, v5,  v6
+        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
+
+        subs            x6,  x6,  #4
+        b.eq            8f
+
+        loadl           v16, v17, v18, v19
+        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v16, v17, \idx1, \idx2, v5,  v6
+        convolve        v3,  v4,  v23, v24, v25, v26, v27, v16, v17, v18, v19, \idx1, \idx2, v5,  v6
+        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
+
+        subs            x6,  x6,  #4
+        b.eq            8f
+
+        loadl           v20, v21, v22, v23
+        convolve        v1,  v2,  v25, v26, v27, v16, v17, v18, v19, v20, v21, \idx1, \idx2, v5,  v6
+        convolve        v3,  v4,  v27, v16, v17, v18, v19, v20, v21, v22, v23, \idx1, \idx2, v5,  v6
+        do_store        v1,  v2,  v3,  v4,  v5,  v6,  v7,  v28, \type
+
+        subs            x6,  x6,  #4
+        b.ne            2b
+
+8:
+        subs            x5,  x5,  #8
+        b.eq            9f
+        // x0 -= h * dst_stride
+        msub            x0,  x1,  x4, x0
+        // x2 -= h * src_stride
+        msub            x2,  x3,  x4, x2
+        // x2 -= 8 * src_stride
+        sub             x2,  x2,  x3, lsl #3
+        // x2 += 1 * src_stride
+        add             x2,  x2,  x3
+        add             x2,  x2,  #8
+        add             x0,  x0,  #8
+        b               1b
+9:
+        ret
+endfunc
+.endm
+
+do_8tap_8v put, 3, 4
+do_8tap_8v put, 4, 3
+do_8tap_8v avg, 3, 4
+do_8tap_8v avg, 4, 3
+
+
+// Instantiate a vertical filter function for filtering a 4 pixel wide
+// slice. The first half of each register contains one row, while the
+// second half contains the second-next row (which is also stored in the
+// first half of the register two steps ahead). The convolution produces
+// two outputs at a time; v17-v24 feed one of them and v18-v25 the other.
+// The first half of the first output is the first output row and the
+// first half of the other output is the second output row; the second
+// halves of the two outputs are output rows 3 and 4.
+// This is only designed to work for 4 or 8 output lines.
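+// Concretely, after the initial loads the packing is roughly
+// v17 = {row 0, row 2}, v18 = {row 1, row 3}, ..., v24 = {row 7, row 9},
+// v25 = {row 8, row 10} (input rows, counting from the topmost one
+// loaded), so the first convolve output carries output rows 0 and 2 in
+// its two halves and the second carries output rows 1 and 3.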
+.macro do_8tap_4v type, idx1, idx2
+function \type\()_8tap_4v_\idx1\idx2
+        sub             x2,  x2,  x3, lsl #1
+        sub             x2,  x2,  x3
+        ld1             {v0.8h},  [x6]
+
+        ld1r            {v1.2s},    [x2], x3
+        ld1r            {v2.2s},    [x2], x3
+        ld1r            {v3.2s},    [x2], x3
+        ld1r            {v4.2s},    [x2], x3
+        ld1r            {v5.2s},    [x2], x3
+        ld1r            {v6.2s},    [x2], x3
+        ext             v1.8b,  v1.8b,  v3.8b,  #4
+        ld1r            {v7.2s},    [x2], x3
+        ext             v2.8b,  v2.8b,  v4.8b,  #4
+        ld1r            {v30.2s},   [x2], x3
+        uxtl            v17.8h, v1.8b
+        ext             v3.8b,  v3.8b,  v5.8b,  #4
+        ld1r            {v31.2s},   [x2], x3
+        uxtl            v18.8h, v2.8b
+        ext             v4.8b,  v4.8b,  v6.8b,  #4
+        uxtl            v19.8h, v3.8b
+        ext             v5.8b,  v5.8b,  v7.8b,  #4
+        ld1             {v30.s}[1], [x2], x3
+        uxtl            v20.8h, v4.8b
+        ext             v6.8b,  v6.8b,  v30.8b, #4
+        uxtl            v21.8h, v5.8b
+        ext             v7.8b,  v7.8b,  v31.8b, #4
+        ld1             {v31.s}[1], [x2]
+        uxtl            v22.8h, v6.8b
+        uxtl            v23.8h, v7.8b
+        sub             x2,  x2,  x3
+        uxtl            v24.8h, v30.8b
+        uxtl            v25.8h, v31.8b
+
+        convolve        v1,  v2,  v17, v18, v19, v20, v21, v22, v23, v24, v25, \idx1, \idx2, v3,  v4
+        do_store4       v1,  v2,  v5,  v6,  \type
+
+        subs            x4,  x4,  #4
+        b.eq            9f
+
+        ld1r            {v1.2s},    [x2], x3
+        ld1r            {v2.2s},    [x2], x3
+        ld1r            {v3.2s},    [x2], x3
+        ld1r            {v4.2s},    [x2], x3
+        sub             x2,  x2,  x3, lsl #1
+        ld1             {v1.s}[1],  [x2], x3
+        ld1             {v2.s}[1],  [x2], x3
+        uxtl            v26.8h, v1.8b
+        ld1             {v3.s}[1],  [x2], x3
+        uxtl            v27.8h, v2.8b
+        ld1             {v4.s}[1],  [x2], x3
+
+        uxtl            v28.8h, v3.8b
+        uxtl            v29.8h, v4.8b
+
+        convolve        v1,  v2,  v21, v22, v23, v24, v25, v26, v27, v28, v29, \idx1, \idx2, v3,  v4
+        do_store4       v1,  v2,  v5,  v6,  \type
+
+9:
+        ret
+endfunc
+.endm
+
+do_8tap_4v put, 3, 4
+do_8tap_4v put, 4, 3
+do_8tap_4v avg, 3, 4
+do_8tap_4v avg, 4, 3
+
+
+.macro do_8tap_v_func type, filter, size
+function ff_vp9_\type\()_\filter\()\size\()_v_neon, export=1
+        uxtw            x4,  w4
+        movrel          x5,  \filter\()_filter-16
+        cmp             w6,  #8
+        add             x6,  x5,  w6, uxtw #4
+        mov             x5,  #\size
+.if \size >= 8
+        b.ge            \type\()_8tap_8v_34
+        b               \type\()_8tap_8v_43
+.else
+        b.ge            \type\()_8tap_4v_34
+        b               \type\()_8tap_4v_43
+.endif
+endfunc
+.endm
+
+.macro do_8tap_v_filters size
+do_8tap_v_func put, regular, \size
+do_8tap_v_func avg, regular, \size
+do_8tap_v_func put, sharp,   \size
+do_8tap_v_func avg, sharp,   \size
+do_8tap_v_func put, smooth,  \size
+do_8tap_v_func avg, smooth,  \size
+.endm
+
+do_8tap_v_filters 64
+do_8tap_v_filters 32
+do_8tap_v_filters 16
+do_8tap_v_filters 8
+do_8tap_v_filters 4
diff --git a/libavcodec/vp9.h b/libavcodec/vp9.h
index e4b9f82..2e03ae8 100644
--- a/libavcodec/vp9.h
+++ b/libavcodec/vp9.h
@@ -433,6 +433,7 @@  typedef struct VP9Context {
 
 void ff_vp9dsp_init(VP9DSPContext *dsp);
 
+void ff_vp9dsp_init_aarch64(VP9DSPContext *dsp);
 void ff_vp9dsp_init_arm(VP9DSPContext *dsp);
 void ff_vp9dsp_init_x86(VP9DSPContext *dsp);
 
diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index 0eca389..c198a47 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -2164,6 +2164,8 @@  av_cold void ff_vp9dsp_init(VP9DSPContext *dsp)
     vp9dsp_loopfilter_init(dsp);
     vp9dsp_mc_init(dsp);
 
+    if (ARCH_AARCH64)
+        ff_vp9dsp_init_aarch64(dsp);
     if (ARCH_ARM)
         ff_vp9dsp_init_arm(dsp);
     if (ARCH_X86)