[PATCHv2] arm: vp9: Add NEON itxfm routines

Message ID 1476814050-10850-1-git-send-email-martin@martin.st
State Committed
Headers show

Commit Message

Martin Storsjö Oct. 18, 2016, 6:07 p.m.
This work is sponsored by, and copyright, Google.

For the transforms up to 8x8, we can fit all the data (including
temporaries) in registers and just do a straightforward transform
of all the data. For 16x16, we do a transform of 4x16 pixels in
4 slices, using a temporary buffer. For 32x32, we transform 4x32
pixels at a time, in two steps of 4x16 pixels each.

Examples of relative speedup compared to the C version, from checkasm:
                         Cortex       A7     A8     A9    A53
vp9_inv_adst_adst_4x4_add_neon:     3.39   5.80   4.18   3.92
vp9_inv_adst_adst_8x8_add_neon:     3.94   4.82   4.25   3.89
vp9_inv_adst_adst_16x16_add_neon:   3.33   4.27   4.08   4.05
vp9_inv_dct_dct_4x4_add_neon:       3.73   5.06   4.26   4.28
vp9_inv_dct_dct_8x8_add_neon:       4.59   5.81   5.03   4.73
vp9_inv_dct_dct_16x16_add_neon:     3.40   3.39   3.33   3.68
vp9_inv_dct_dct_32x32_add_neon:     4.00   3.51   3.80   4.40
vp9_inv_wht_wht_4x4_add_neon:       3.24   5.16   3.52   3.67

Thus, the speedup vs C code is around 3-5x.

This is marginally faster than the corresponding routines in libvpx
on most cores, tested with their 32x32 idct (compared to
vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's
favour since their version doesn't clear the input buffer like ours
does (although the effect of that on the total runtime probably is
negligible).

                           Cortex       A7       A8       A9      A53
vp9_inv_dct_dct_32x32_add_neon:    18852.0  16831.6  14217.4  11988.6
libvpx vpx_idct32x32_1024_add_neon 20789.0  13344.3  15049.9  13030.5

Only on the Cortex A8, the libvpx function is faster. On the other cores,
ours is slightly faster even though ours has got source block clearing
integrated.
---
v2: Updated some broken macro comments, optimized the transposes by
using the q registers for part of transposes.

Suggestions very much welcome on names for the macros - no idea if
the current ones make sense or what one commonly would call these
combinations.

I'm a bit reluctant to expand the macros (to be able to schedule
instructions better), in order to keep things readable. (Although,
I guess this is kinda write-only code, which nobody ever touches
afterwards.)
---
 libavcodec/arm/Makefile          |    3 +-
 libavcodec/arm/vp9dsp_init_arm.c |   51 +-
 libavcodec/arm/vp9itxfm_neon.S   | 1166 ++++++++++++++++++++++++++++++++++++++
 3 files changed, 1218 insertions(+), 2 deletions(-)
 create mode 100644 libavcodec/arm/vp9itxfm_neon.S

Comments

Janne Grunau Nov. 11, 2016, 1:16 a.m. | #1
On 2016-10-18 21:07:30 +0300, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> For the transforms up to 8x8, we can fit all the data (including
> temporaries) in registers and just do a straightforward transform
> of all the data. For 16x16, we do a transform of 4x16 pixels in
> 4 slices, using a temporary buffer. For 32x32, we transform 4x32
> pixels at a time, in two steps of 4x16 pixels each.
> 
> Examples of relative speedup compared to the C version, from checkasm:
>                          Cortex       A7     A8     A9    A53
> vp9_inv_adst_adst_4x4_add_neon:     3.39   5.80   4.18   3.92
> vp9_inv_adst_adst_8x8_add_neon:     3.94   4.82   4.25   3.89
> vp9_inv_adst_adst_16x16_add_neon:   3.33   4.27   4.08   4.05
> vp9_inv_dct_dct_4x4_add_neon:       3.73   5.06   4.26   4.28
> vp9_inv_dct_dct_8x8_add_neon:       4.59   5.81   5.03   4.73
> vp9_inv_dct_dct_16x16_add_neon:     3.40   3.39   3.33   3.68
> vp9_inv_dct_dct_32x32_add_neon:     4.00   3.51   3.80   4.40
> vp9_inv_wht_wht_4x4_add_neon:       3.24   5.16   3.52   3.67
> 
> Thus, the speedup vs C code is around 3-5x.
> 
> This is mostly marginally faster than the corresponding routines
> in libvpx on most cores, tested with their 32x32 idct (compared to
> vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's
> favour since their version doesn't clear the input buffer like ours
> do (although the effect of that on the total runtime probably is
> negligible.)
> 
>                            Cortex       A7       A8       A9      A53
> vp9_inv_dct_dct_32x32_add_neon:    18852.0  16831.6  14217.4  11988.6
> libvpx vpx_idct32x32_1024_add_neon 20789.0  13344.3  15049.9  13030.5
> 
> Only on the Cortex A8, the libvpx function is faster. On the other cores,
> ours is slightly faster even though ours has got source block clearing
> integrated.
> ---
> v2: Updated some broken macro comments, optimized the transposes by
> using the q registers for part of transposes.
> 
> Suggestions very much welcome on names for the macros - no idea if
> the current ones make sense or what one commonly would call these
> combinations.
> 
> I'm a bit reluctant to expanding the macros (to be able to schedule
> instructions better), in order to keep things readable. (Although,
> I guess this is kinda write-only code, which nobody ever touches
> afterwards).
> ---
>  libavcodec/arm/Makefile          |    3 +-
>  libavcodec/arm/vp9dsp_init_arm.c |   51 +-
>  libavcodec/arm/vp9itxfm_neon.S   | 1166 ++++++++++++++++++++++++++++++++++++++
>  3 files changed, 1218 insertions(+), 2 deletions(-)
>  create mode 100644 libavcodec/arm/vp9itxfm_neon.S
> 
> diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
> index 2638230..01630ac 100644
> --- a/libavcodec/arm/Makefile
> +++ b/libavcodec/arm/Makefile
> @@ -139,4 +139,5 @@ NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
>                                            arm/rv40dsp_neon.o
>  NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
>  NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
> -NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9mc_neon.o
> +NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9itxfm_neon.o           \
> +                                          arm/vp9mc_neon.o
> diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
> index db8c683..2ba2644 100644
> --- a/libavcodec/arm/vp9dsp_init_arm.c
> +++ b/libavcodec/arm/vp9dsp_init_arm.c
> @@ -94,7 +94,7 @@ define_8tap_2d_funcs(8)
>  define_8tap_2d_funcs(4)
>  
>  
> -av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
> +static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
>  {
>      int cpu_flags = av_get_cpu_flags();
>  
> @@ -138,3 +138,52 @@ av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
>          init_mc_funcs_dirs(4, 4);
>      }
>  }
> +
> +#define define_itxfm(type_a, type_b, sz)                                   \
> +void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst,    \
> +                                                         ptrdiff_t stride, \
> +                                                         int16_t *_block, int eob)
> +
> +#define define_itxfm_funcs(sz)      \
> +    define_itxfm(idct,  idct,  sz); \
> +    define_itxfm(iadst, idct,  sz); \
> +    define_itxfm(idct,  iadst, sz); \
> +    define_itxfm(iadst, iadst, sz)
> +
> +define_itxfm_funcs(4);
> +define_itxfm_funcs(8);
> +define_itxfm_funcs(16);
> +define_itxfm(idct, idct, 32);
> +define_itxfm(iwht, iwht, 4);
> +
> +
> +static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
> +{
> +    int cpu_flags = av_get_cpu_flags();
> +
> +    if (have_neon(cpu_flags)) {
> +#define init_itxfm(tx, sz)                                             \
> +    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
> +    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
> +    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
> +    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
> +
> +#define init_idct(tx, nm)           \
> +    dsp->itxfm_add[tx][DCT_DCT]   = \
> +    dsp->itxfm_add[tx][ADST_DCT]  = \
> +    dsp->itxfm_add[tx][DCT_ADST]  = \
> +    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
> +
> +        init_itxfm(TX_4X4, 4x4);
> +        init_itxfm(TX_8X8, 8x8);
> +        init_itxfm(TX_16X16, 16x16);
> +        init_idct(TX_32X32, idct_idct_32x32);
> +        init_idct(4, iwht_iwht_4x4);
> +    }
> +}
> +
> +av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
> +{
> +    vp9dsp_mc_init_arm(dsp);
> +    vp9dsp_itxfm_init_arm(dsp);
> +}
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> new file mode 100644
> index 0000000..96dc3a9
> --- /dev/null
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -0,0 +1,1166 @@
> +/*
> + * Copyright (c) 2016 Google Inc.
> + *
> + * This file is part of Libav.
> + *
> + * Libav is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU Lesser General Public
> + * License as published by the Free Software Foundation; either
> + * version 2.1 of the License, or (at your option) any later version.
> + *
> + * Libav is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> + * Lesser General Public License for more details.
> + *
> + * You should have received a copy of the GNU Lesser General Public
> + * License along with Libav; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
> + */
> +
> +#include "libavutil/arm/asm.S"
> +#include "neon.S"
> +
> +const itxfm4_coeffs, align=4
> +        .short  11585, 6270, 15137, 0
> +iadst4_coeffs:
> +        .short  5283, 15212, 9929, 13377
> +endconst
> +
> +const iadst8_coeffs, align=4
> +        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
> +endconst
> +
> +const idct_coeffs, align=4
> +        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
> +        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
> +        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
> +        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
> +endconst
> +
> +const iadst16_coeffs, align=4
> +        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
> +        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
> +endconst
> +
> +@ Do two 4x4 transposes, using q registers for the subtransposes that don't

it's four 4x4 transposes

> +@ need to address the individual d registers.
> +@ r0,r1 == rq1, r2,r3 == rq1, etc
> +.macro transpose16_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
> +        vtrn.32          \rq0, \rq1
> +        vtrn.32          \rq2, \rq3
> +        vtrn.32          \rq4, \rq5
> +        vtrn.32          \rq6, \rq7
> +        vtrn.16          \r0,  \r1
> +        vtrn.16          \r2,  \r3
> +        vtrn.16          \r4,  \r5
> +        vtrn.16          \r6,  \r7
> +        vtrn.16          \r8,  \r9
> +        vtrn.16          \r10, \r11
> +        vtrn.16          \r12, \r13
> +        vtrn.16          \r14, \r15
> +.endm
> +
> +@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
> +@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
> +@ in/out are d registers
> +.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
> +        vadd.s16        \tmpd1, \in1,  \in2
> +        vsub.s16        \tmpd2, \in1,  \in2
> +        vmull.s16       \tmpq3, \tmpd1, d0[0]
> +        vmull.s16       \tmpq4, \tmpd2, d0[0]
> +.if \neg > 0
> +        vneg.s32        \tmpq3, \tmpq3
> +.endif
> +        vrshrn.s32      \out1, \tmpq3, #14
> +        vrshrn.s32      \out2, \tmpq4, #14
> +.endm

an empty line after .endm improves the readability

> +@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
> +@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
> +@ Same as mbutterfly0, but with input being 2 q registers, output
> +@ being 4 d registers.
> +@ This can do with either 4 or 6 temporary q registers.
> +.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
> +        vadd.s16        \tmpq1, \in1,  \in2
> +        vsub.s16        \tmpq2, \in1,  \in2
> +        vmull.s16       \tmpq3, \tmpd11, d0[0]
> +        vmull.s16       \tmpq4, \tmpd12, d0[0]
> +.ifb \tmpq5
> +        vrshrn.s32      \out1, \tmpq3, #14
> +        vrshrn.s32      \out2, \tmpq4, #14
> +        vmull.s16       \tmpq3, \tmpd21, d0[0]
> +        vmull.s16       \tmpq4, \tmpd22, d0[0]
> +        vrshrn.s32      \out3, \tmpq3, #14
> +        vrshrn.s32      \out4, \tmpq4, #14
> +.else
> +        vmull.s16       \tmpq5, \tmpd21, d0[0]
> +        vmull.s16       \tmpq6, \tmpd22, d0[0]
> +        vrshrn.s32      \out1, \tmpq3, #14
> +        vrshrn.s32      \out2, \tmpq4, #14
> +        vrshrn.s32      \out3, \tmpq5, #14
> +        vrshrn.s32      \out4, \tmpq6, #14
> +.endif
> +.endm
> +@ out1 = in1 * coef1 - in2 * coef2
> +@ out2 = in1 * coef2 + in2 * coef1
> +@ out are 2 q registers, in are 2 d registers
> +.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
> +        vmull.s16       \out1, \in1, \coef1
> +        vmlsl.s16       \out1, \in2, \coef2
> +        vmull.s16       \out2, \in1, \coef2

doing the second vmull before the vmlsl is preferable on in-order units

> +        vmlal.s16       \out2, \in2, \coef1
> +.endm
> +@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
> +@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
> +@ out are 4 q registers, in are 4 d registers
> +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
> +        vmull.s16       \out1, \in1, \coef1
> +        vmull.s16       \out2, \in2, \coef1
> +        vmull.s16       \out3, \in1, \coef2
> +        vmull.s16       \out4, \in2, \coef2
> +        vmlsl.s16       \out1, \in3, \coef2
> +        vmlsl.s16       \out2, \in4, \coef2
> +        vmlal.s16       \out3, \in3, \coef1
> +        vmlal.s16       \out4, \in4, \coef1
> +.endm
> +@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14
> +@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14
> +@ in are 2 d registers, tmp are 2 q registers
> +.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0
> +        mbutterfly_l    \tmp1, \tmp2, \in1, \in2, \coef1, \coef2
> +.if \neg > 0
> +        vneg.s32        \tmp2, \tmp2
> +.endif
> +        vrshrn.s32      \in1, \tmp1,  #14
> +        vrshrn.s32      \in2, \tmp2,  #14
> +.endm
> +@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
> +@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
> +@ inout are 4 d registers, tmp are 4 q registers
> +.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
> +        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
> +        vrshrn.s32      \inout1, \tmp1,  #14
> +        vrshrn.s32      \inout2, \tmp2,  #14
> +        vrshrn.s32      \inout3, \tmp3,  #14
> +        vrshrn.s32      \inout4, \tmp4,  #14
> +.endm
> +.macro mbutterfly_neg in1, in2, coef1, coef2, tmp1, tmp2
> +        mbutterfly      \in1, \in2, \coef1, \coef2, \tmp1, \tmp2, 1
> +.endm

this macro is a little pointless, readability is not really worse for

mbutterfly ..., neg=1 vs mbutterfly_neg ...

> +@ out1 = in1 + in2
> +@ out2 = in1 - in2
> +.macro butterfly out1, out2, in1, in2
> +        vadd.s16        \out1, \in1, \in2
> +        vsub.s16        \out2, \in1, \in2
> +.endm
> +@ out1 = in1 - in2
> +@ out2 = in1 + in2
> +.macro butterfly_r out1, out2, in1, in2
> +        vsub.s16        \out1, \in1, \in2
> +        vadd.s16        \out2, \in1, \in2
> +.endm
> +@ out1 = (in1 + in2 + (1 << 13)) >> 14
> +@ out2 = (in1 - in2 + (1 << 13)) >> 14
> +@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
> +.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
> +        vadd.s32        \tmp1, \in1, \in2
> +        vsub.s32        \tmp2, \in1, \in2
> +        vrshrn.s32      \out1, \tmp1,  #14
> +        vrshrn.s32      \out2, \tmp2,  #14
> +.endm
> +@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
> +@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
> +@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
> +.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
> +        vadd.s32        \tmp1, \in1, \in3
> +        vadd.s32        \tmp2, \in2, \in4
> +        vsub.s32        \tmp3, \in1, \in3
> +        vsub.s32        \tmp4, \in2, \in4
> +        vrshrn.s32      \out1, \tmp1,  #14
> +        vrshrn.s32      \out2, \tmp2,  #14
> +        vrshrn.s32      \out3, \tmp3,  #14
> +        vrshrn.s32      \out4, \tmp4,  #14
> +.endm
> +
> +
> +.macro iwht4 c0, c1, c2, c3
> +        vadd.i16        \c0,  \c0,  \c1
> +        vsub.i16        d17,  \c2,  \c3
> +        vsub.i16        d16,  \c0,  d17
> +        vshr.s16        d16,  d16,  #1
> +        vsub.i16        \c2,  d16,  \c1
> +        vsub.i16        \c1,  d16,  \c3
> +        vadd.i16        \c3,  d17,  \c2
> +        vsub.i16        \c0,  \c0,  \c1
> +.endm
> +
> +.macro idct4 c0, c1, c2, c3
> +        vadd.i16        d16,  \c0,  \c2
> +        vsub.i16        d17,  \c0,  \c2
> +        vmull.s16       q11,  \c1,  d0[1]
> +        vmull.s16       q12,  \c3,  d0[2]

use vmlsl.s16 q11, \c3,  d0[2] here instead of the separate vmull + vsub.i32, and reorder for in-order cores

> +        vmull.s16       q13,  \c1,  d0[2]
> +        vmull.s16       q14,  \c3,  d0[1]

use vmlal.s16 q13, \c3,  d0[1] here instead of the separate vmull + vadd.i32

> +        vmull.s16       q9,   d16,  d0[0]
> +        vmull.s16       q10,  d17,  d0[0]
> +        vadd.i32        q13,  q13,  q14
> +        vsub.i32        q11,  q11,  q12
> +        vrshrn.s32      d16,  q9,   #14
> +        vrshrn.s32      d19,  q13,  #14
> +        vrshrn.s32      d17,  q10,  #14
> +        vrshrn.s32      d18,  q11,  #14
> +        vadd.i16        \c0,  d16,  d19
> +        vadd.i16        \c1,  d17,  d18
> +        vsub.i16        \c2,  d17,  d18
> +        vsub.i16        \c3,  d16,  d19
> +.endm
> +
> +.macro iadst4 c0, c1, c2, c3
> +        vmull.s16       q10,  \c0,  d1[0]
> +        vmlal.s16       q10,  \c2,  d1[1]
> +        vmlal.s16       q10,  \c3,  d1[2]
> +        vmull.s16       q11,  \c0,  d1[2]
> +        vmlsl.s16       q11,  \c2,  d1[0]
> +        vsub.s16        \c0,  \c0,  \c2
> +        vmlsl.s16       q11,  \c3,  d1[1]
> +        vadd.s16        \c0,  \c0,  \c3
> +        vmull.s16       q13,  \c1,  d1[3]
> +        vmull.s16       q12,  \c0,  d1[3]
> +        vadd.s32        q14,  q10,  q13
> +        vadd.s32        q1,   q11,  q13
> +        vrshrn.s32      \c0,  q14,  #14
> +        vadd.s32        q10,  q10,  q11
> +        vrshrn.s32      \c1,  q1,   #14
> +        vsub.s32        q10,  q10,  q13
> +        vrshrn.s32      \c2,  q12,  #14
> +        vrshrn.s32      \c3,  q10,  #14

instruction scheduling can be optimized for this one too

> +.endm
> +
> +@ The public functions in this file have got the following signature:
> +@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
> +
> +.macro itxfm_func4x4 txfm1, txfm2
> +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
> +.ifc \txfm1,\txfm2
> +.ifc \txfm1,idct
> +        movrel          r12, itxfm4_coeffs
> +        vld1.16         {d0}, [r12,:64]
> +.endif
> +.ifc \txfm1,iadst
> +        movrel          r12, iadst4_coeffs
> +        vld1.16         {d1}, [r12,:64]
> +.endif
> +.else
> +        movrel          r12, itxfm4_coeffs
> +        vld1.16         {q0}, [r12,:128]
> +.endif

aligned 8 byte and 16 byte loads are equally fast, so this just adds
complexity without any gain

> +
> +        vmov.i16        q15, #0
> +.ifc \txfm1,idct
> +.ifc \txfm2,idct
> +        cmp             r3,  #1
> +        bne             1f
> +        @ DC-only for idct/idct
> +        vld1.16         {d4[]},   [r2]

alignment: the address is missing an alignment specifier (e.g. [r2,:16])

> +        vmull.s16       q2,  d4,  d0[0]
> +        vrshrn.s32      d4,  q2,  #14
> +        vmull.s16       q2,  d4,  d0[0]
> +        vrshrn.s32      d4,  q2,  #14
> +        vst1.16         {d30[0]}, [r2]

same

> +        vdup.16         q2,  d4[0]
> +        vmov            q3,  q2

vdup first to q3 to avoid data dependency

> +        b               2f
> +.endif
> +.endif
> +
> +1:
> +        vld1.16         {d4-d7},  [r2,:128]
> +        vst1.16         {q15}, [r2,:128]!
> +
> +.ifc \txfm1,iwht
> +        vshr.s16        q2,  q2,  #2
> +        vshr.s16        q3,  q3,  #2
> +.endif
> +
> +        \txfm1\()4      d4,  d5,  d6,  d7
> +
> +        vst1.16         {q15}, [r2,:128]!
> +        @ Transpose 4x4 with 16 bit elements
> +        vtrn.16         d4,  d5
> +        vtrn.16         d6,  d7
> +        vtrn.32         d4,  d6
> +        vtrn.32         d5,  d7

vtrn.32 q2, q3

> +
> +        \txfm2\()4      d4,  d5,  d6,  d7
> +2:
> +        vld1.32         {d0[]},   [r0,:32], r1
> +        vld1.32         {d0[1]},  [r0,:32], r1
> +.ifnc \txfm1,iwht
> +        vrshr.s16       q2,  q2,  #4
> +        vrshr.s16       q3,  q3,  #4
> +.endif
> +        vaddw.u8        q2,  q2,  d0
> +        vld1.32         {d1[]},   [r0,:32], r1
> +        vld1.32         {d1[1]},  [r0,:32], r1
> +        vqmovun.s16     d0,  q2
> +        sub             r0,  r0,  r1, lsl #2

since we have free gp registers I'd use different registers for loads and
stores. Probably not faster though

> +
> +        vaddw.u8        q3,  q3,  d1
> +        vst1.32         {d0[0]},  [r0,:32], r1
> +        vqmovun.s16     d1,  q3
> +
> +        vst1.32         {d0[1]},  [r0,:32], r1
> +        vst1.32         {d1[0]},  [r0,:32], r1
> +        vst1.32         {d1[1]},  [r0,:32], r1
> +
> +        bx              lr
> +endfunc
> +.endm
> +
> +itxfm_func4x4 idct,  idct
> +itxfm_func4x4 iadst, idct
> +itxfm_func4x4 idct,  iadst
> +itxfm_func4x4 iadst, iadst
> +itxfm_func4x4 iwht,  iwht
> +
> +
> +.macro idct8
> +        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
> +        dmbutterfly     d20, d21, d28, d29, d0[1], d0[2], q2,  q3,  q4,  q5 @ q10 = t2a, q14 = t3a
> +        dmbutterfly     d18, d19, d30, d31, d0[3], d1[0], q2,  q3,  q4,  q5 @ q9  = t4a, q15 = t7a
> +        dmbutterfly     d26, d27, d22, d23, d1[1], d1[2], q2,  q3,  q4,  q5 @ q13 = t5a, q11 = t6a
> +
> +        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
> +        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
> +        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
> +        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
> +
> +        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
> +
> +        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
> +
> +        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
> +        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
> +        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
> +.endm
> +
> +.macro iadst8
> +        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d2[1], d2[0] @ q4,q5  = t1a, q2,q3 = t0a
> +        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
> +
> +        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
> +
> +        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
> +
> +        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
> +        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
> +
> +        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5 @ q9 = t2, q4 = t6
> +        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7 @ q8 = t3, q6 = t7
> +
> +        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
> +        vneg.s16        q15, q15          @ q15 = out[7]
> +        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
> +
> +        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d0[1], d0[2] @ q10,q11 = t5a, q5,q7 = t4a
> +        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d0[2], d0[1] @ q2,q3 = t6a, q13,q14 = t7a
> +
> +        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
> +
> +        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
> +        vneg.s16        q11, q11      @ q11 = out[3]
> +
> +        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3 @ q9 = -out[1], q2 = t6
> +        vneg.s16        q9,  q9       @ q9 = out[1]
> +
> +        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
> +        vneg.s16        q13, q13      @ q13 = out[5]
> +.endm
> +
> +
> +.macro itxfm_func8x8 txfm1, txfm2
> +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
> +        @ Push q4-q7 if iadst is used, idct requires
> +        @ a few scratch registers less, so only push q4-q5
> +        @ if only idct is involved.
> +        @ The iadst also uses a few coefficients from
> +        @ idct, so those always need to be loaded.
> +        movrel          r12, idct_coeffs

move this into the last else below

> +        vld1.16         {q0}, [r12,:128]

this can follow after this block if the iadst8_coeffs load uses post 
increment

> +.ifc \txfm1,iadst
> +        movrel          r12, iadst8_coeffs
> +        vld1.16         {q1}, [r12,:128]
> +        vpush           {q4-q7}
> +.else
> +.ifc \txfm2,iadst

does .elseifc work?

> +        movrel          r12, iadst8_coeffs
> +        vld1.16         {q1}, [r12,:128]
> +        vpush           {q4-q7}
> +.else
> +        vpush           {q4-q5}
> +.endif
> +.endif
> +
> +        vmov.i16        q2, #0
> +        vmov.i16        q3, #0
> +
> +.ifc \txfm1,idct
> +.ifc \txfm2,idct
> +        cmp             r3,  #1
> +        bne             1f
> +        @ DC-only for idct/idct
> +        vld1.16         {d16[]},   [r2]

alignment

> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vdup.16         q8,  d16[0]
> +        vmov            q9,  q8
> +        vmov            q10, q8
> +        vmov            q11, q8
> +        vmov            q12, q8
> +        vmov            q13, q8
> +        vmov            q14, q8
> +        vmov            q15, q8

these could all be vdup'ed directly from d16[0] instead of copied from q8, to avoid the data dependency

> +        vst1.16         {d4[0]}, [r2]

alignment

> +        b               2f
> +.endif
> +.endif
> +1:
> +        vld1.16         {q8-q9},    [r2,:128]!
> +        vld1.16         {q10-q11},  [r2,:128]!
> +        vld1.16         {q12-q13},  [r2,:128]!
> +        vld1.16         {q14-q15},  [r2,:128]!
> +        sub             r2,  r2,  #128
> +        vst1.16         {q2-q3}, [r2,:128]!
> +        vst1.16         {q2-q3}, [r2,:128]!
> +        vst1.16         {q2-q3}, [r2,:128]!
> +        vst1.16         {q2-q3}, [r2,:128]!
> +
> +        \txfm1\()8
> +
> +        @ Transpose 8x8 with 16 bit elements
> +        vswp            d17, d24
> +        vswp            d19, d26
> +        vswp            d21, d28
> +        vswp            d23, d30
> +        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
> +
> +        \txfm2\()8
> +2:
> +        @ Add into the destination
> +        vld1.8          {d4},  [r0,:64], r1
> +        vrshr.s16       q8,  q8,  #5
> +        vld1.8          {d5},  [r0,:64], r1
> +        vrshr.s16       q9,  q9,  #5
> +        vld1.8          {d6},  [r0,:64], r1
> +        vrshr.s16       q10, q10, #5
> +        vaddw.u8        q8,  q8,  d4
> +        vld1.8          {d7},  [r0,:64], r1
> +        vrshr.s16       q11, q11, #5
> +        vaddw.u8        q9,  q9,  d5
> +        vld1.8          {d8},  [r0,:64], r1
> +        vrshr.s16       q12, q12, #5
> +        vaddw.u8        q10, q10, d6
> +        vqmovun.s16     d4,  q8
> +        vld1.8          {d9},  [r0,:64], r1
> +        vrshr.s16       q13, q13, #5
> +        vaddw.u8        q11, q11, d7
> +        vqmovun.s16     d5,  q9
> +        vld1.8          {d10}, [r0,:64], r1
> +        vrshr.s16       q14, q14, #5
> +        vaddw.u8        q12, q12, d8
> +        vqmovun.s16     d6,  q10
> +        vld1.8          {d11}, [r0,:64], r1
> +        vrshr.s16       q15, q15, #5
> +        vaddw.u8        q13, q13, d9
> +        vqmovun.s16     d7,  q11
> +        sub             r0,  r0,  r1, lsl #3

could use different registers for loads and stores

> +
> +        vst1.8          {d4},  [r0,:64], r1
> +        vaddw.u8        q14, q14, d10
> +        vst1.8          {d5},  [r0,:64], r1
> +        vqmovun.s16     d8,  q12
> +        vst1.8          {d6},  [r0,:64], r1
> +        vaddw.u8        q15, q15, d11
> +        vst1.8          {d7},  [r0,:64], r1
> +        vqmovun.s16     d9,  q13
> +        vst1.8          {d8},  [r0,:64], r1
> +        vqmovun.s16     d10, q14
> +        vst1.8          {d9},  [r0,:64], r1
> +        vqmovun.s16     d11, q15
> +
> +        vst1.8          {d10}, [r0,:64], r1
> +        vst1.8          {d11}, [r0,:64], r1
> +
> +.ifc \txfm1,iadst
> +        vpop            {q4-q7}
> +.else
> +.ifc \txfm2,iadst
> +        vpop            {q4-q7}
> +.else
> +        vpop            {q4-q5}
> +.endif
> +.endif
> +        bx              lr
> +endfunc
> +.endm
> +
> +itxfm_func8x8 idct,  idct
> +itxfm_func8x8 iadst, idct
> +itxfm_func8x8 idct,  iadst
> +itxfm_func8x8 iadst, iadst
> +
> +
> +function idct16x16_dc_add_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {d0}, [r12,:64]
> +
> +        vmov.i16        q2, #0
> +
> +        vld1.16         {d16[]},   [r2]

alignment

> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vdup.16         q8,  d16[0]
> +        vst1.16         {d4[0]}, [r2]

alignment

> +
> +        vrshr.s16       q8,  q8,  #6
> +
> +        mov             r12, #16
> +1:
> +        @ Loop to add the constant from q8 into all 16x16 outputs
> +        vld1.8          {q3},  [r0,:128]
> +        vaddw.u8        q10, q8,  d6
> +        vaddw.u8        q11, q8,  d7
> +        vqmovun.s16     d6,  q10
> +        vqmovun.s16     d7,  q11
> +        vst1.8          {q3},  [r0,:128], r1
> +        subs            r12, r12, #1
> +        bne             1b
> +
> +        bx              lr
> +endfunc
> +
> +.macro idct16
> +        mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
> +        mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
> +        mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
> +        mbutterfly      d26, d22, d1[1], d1[2], q2,  q3  @ d26 = t5a,  d22 = t6a
> +        mbutterfly      d17, d31, d1[3], d2[0], q2,  q3  @ d17 = t8a,  d31 = t15a
> +        mbutterfly      d25, d23, d2[1], d2[2], q2,  q3  @ d25 = t9a,  d23 = t14a
> +        mbutterfly      d21, d27, d2[3], d3[0], q2,  q3  @ d21 = t10a, d27 = t13a
> +        mbutterfly      d29, d19, d3[1], d3[2], q2,  q3  @ d29 = t11a, d19 = t12a
> +
> +        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
> +        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
> +        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
> +        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
> +        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
> +        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
> +        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
> +        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
> +
> +        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15 @ d22 = t6a, d26 = t5a
> +        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15 @ d23 = t9a, d25 = t14a
> +        mbutterfly_neg  d27, d21, d0[1], d0[2], q9,  q15 @ d27 = t13a, d21 = t10a
> +
> +        butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = t7a
> +        butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = t6
> +        butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = t5
> +        butterfly       d5,  d6,  d28, d6                @ d5  = t3a,  d6  = t4
> +        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
> +        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
> +        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
> +        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
> +
> +        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
> +        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
> +
> +        vswp            d27, d29                         @ d27 = t12, d29 = t13a
> +        vswp            d28, d27                         @ d28 = t12, d27 = t11
> +        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
> +        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
> +        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
> +        butterfly       d23, d24, d7,  d20               @ d23 = out[7], d24 = out[8]
> +        butterfly       d18, d29, d4,  d29               @ d18 = out[2], d29 = out[13]
> +        butterfly       d19, d28, d5,  d28               @ d19 = out[3], d28 = out[12]
> +        vmov            d4,  d21                         @ d4  = t10a
> +        butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = out[11]
> +        butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = out[10]
> +.endm
> +
> +.macro iadst16
> +        movrel          r12, iadst16_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
> +        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = t8
> +        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
> +        mbutterfly_l    q7,  q6,  d29, d18, d0[3], d0[2] @ q7  = t3,   q6  = t2
> +        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
> +
> +        mbutterfly_l    q3,  q2,  d21, d26, d2[3], d2[2] @ q3  = t11,  q2  = t10
> +        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
> +        mbutterfly_l    q5,  q4,  d27, d20, d1[1], d1[0] @ q5  = t5,   q4  = t4
> +        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
> +
> +        mbutterfly_l    q7,  q6,  d19, d28, d3[1], d3[0] @ q7  = t13,  q6  = t12
> +        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
> +        mbutterfly_l    q3,  q2,  d25, d22, d1[3], d1[2] @ q3  = t7,   q2  = t6
> +        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
> +
> +        mbutterfly_l    q5,  q4,  d17, d30, d3[3], d3[2] @ q5  = t15,  q4  = t14
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0}, [r12,:128]
> +        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
> +        mbutterfly_l    q7,  q6,  d23, d24, d0[3], d1[0] @ q7  = t9,   q6  = t8
> +        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
> +
> +        mbutterfly_l    q2,  q3,  d28, d19, d1[0], d0[3] @ q2  = t12,  q3  = t13
> +        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
> +        mbutterfly_l    q5,  q4,  d21, d26, d1[1], d1[2] @ q5  = t11,  q4  = t10
> +        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
> +        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
> +
> +        mbutterfly_l    q6,  q7,  d30, d17, d1[2], d1[1] @ q6  = t14,  q7  = t15
> +        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
> +        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
> +        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
> +
> +        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
> +        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
> +
> +        mbutterfly_l    q5,  q4,  d19, d28, d0[1], d0[2] @ q5  = t13,  q4  = t12
> +        mbutterfly_l    q6,  q7,  d30, d17, d0[2], d0[1] @ q6  = t14,  q7  = t15
> +
> +        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
> +        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
> +        vneg.s16        d29, d29                         @ d29 = out[13]
> +
> +        mbutterfly_l    q5,  q4,  d4,  d5,  d0[1], d0[2] @ q5  = t5a,  q4  = t4a
> +        mbutterfly_l    q6,  q7,  d7,  d6,  d0[2], d0[1] @ q6  = t6a,  q7  = t7a
> +
> +        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
> +        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
> +
> +        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
> +        vneg.s16        d19, d19                         @ d19 = out[3]
> +        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
> +
> +        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
> +        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
> +
> +        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
> +        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
> +        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
> +        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
> +
> +        vneg.s16        d31, d5                          @ d31 = out[15]
> +        vneg.s16        d17, d3                          @ d17 = out[1]
> +
> +        vmov            d16, d2
> +        vmov            d30, d4
> +.endm
> +
> +.macro itxfm16_1d_funcs txfm
> +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> +@ transpose into a horizontal 16x4 slice and store.
> +@ r0 = dst (temp buffer)
> +@ r1 = unused
> +@ r2 = src
> +@ r3 = slice offset
> +function \txfm\()16_1d_4x16_pass1_neon
> +        mov             r12, #32
> +        vmov.s16        q2, #0
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +
> +        \txfm\()16
> +
> +        @ Do four 4x4 transposes. Originally, d16-d31 contain the
> +        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> +        @ contain the transposed 4x4 blocks.
> +        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
> +
> +        @ Store the transposed 4x4 blocks horizontally.
> +        cmp             r3,  #12
> +        beq             1f
> +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        bx              lr
> +1:
> +        @ Special case: For the last input column (r3 == 12),
> +        @ which would be stored as the last row in the temp buffer,
> +        @ don't store the first 4x4 block, but keep it in registers
> +        @ for the first slice of the second pass (where it is the
> +        @ last 4x4 block).
> +        add             r0,  r0,  #8
> +.irp i, 20, 24, 28
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        add             r0,  r0,  #8
> +.irp i, 21, 25, 29
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        add             r0,  r0,  #8
> +.irp i, 22, 26, 30
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        add             r0,  r0,  #8
> +.irp i, 23, 27, 31
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +        vmov            d28, d16
> +        vmov            d29, d17
> +        vmov            d30, d18
> +        vmov            d31, d19
> +        bx              lr
> +endfunc
> +
> +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> +@ load the destination pixels (from a similar 4x16 slice), add and store back.
> +@ r0 = dst
> +@ r1 = dst stride
> +@ r2 = src (temp buffer)
> +@ r3 = slice offset
> +function \txfm\()16_1d_4x16_pass2_neon
> +        mov             r12, #32
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        cmp             r3,  #0
> +        beq             1f
> +.irp i, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +1:
> +
> +        \txfm\()16
> +
> +.macro load_add_store coef0, coef1, coef2, coef3
> +        vrshr.s16       \coef0, \coef0, #6
> +        vrshr.s16       \coef1, \coef1, #6
> +
> +        vld1.32         {d4[]},   [r0,:32], r1
> +        vld1.32         {d4[1]},  [r0,:32], r1
> +        vrshr.s16       \coef2, \coef2, #6
> +        vrshr.s16       \coef3, \coef3, #6
> +        vld1.32         {d5[]},   [r0,:32], r1
> +        vld1.32         {d5[1]},  [r0,:32], r1
> +        vaddw.u8        \coef0, \coef0, d4
> +        vld1.32         {d6[]},   [r0,:32], r1
> +        vld1.32         {d6[1]},  [r0,:32], r1
> +        vaddw.u8        \coef1, \coef1, d5
> +        vld1.32         {d7[]},   [r0,:32], r1
> +        vld1.32         {d7[1]},  [r0,:32], r1
> +
> +        vqmovun.s16     d4,  \coef0
> +        vqmovun.s16     d5,  \coef1
> +        sub             r0,  r0,  r1, lsl #3

could use an additional register

> +        vaddw.u8        \coef2, \coef2, d6
> +        vaddw.u8        \coef3, \coef3, d7
> +        vst1.32         {d4[0]},  [r0,:32], r1
> +        vst1.32         {d4[1]},  [r0,:32], r1
> +        vqmovun.s16     d6,  \coef2
> +        vst1.32         {d5[0]},  [r0,:32], r1
> +        vst1.32         {d5[1]},  [r0,:32], r1
> +        vqmovun.s16     d7,  \coef3
> +
> +        vst1.32         {d6[0]},  [r0,:32], r1
> +        vst1.32         {d6[1]},  [r0,:32], r1
> +        vst1.32         {d7[0]},  [r0,:32], r1
> +        vst1.32         {d7[1]},  [r0,:32], r1
> +.endm
> +        load_add_store  q8,  q9,  q10, q11
> +        load_add_store  q12, q13, q14, q15
> +.purgem load_add_store
> +
> +        bx              lr
> +endfunc
> +.endm
> +
> +itxfm16_1d_funcs idct
> +itxfm16_1d_funcs iadst
> +
> +.macro itxfm_func16x16 txfm1, txfm2
> +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
> +.ifc \txfm1,idct
> +.ifc \txfm2,idct
> +        cmp             r3,  #1
> +        beq             idct16x16_dc_add_neon
> +.endif
> +.endif
> +1:

unused label

> +        push            {r4-r7,lr}
> +.ifc \txfm1,iadst
> +        vpush           {q4-q7}
> +.else
> +.ifc \txfm2,iadst
> +        vpush           {q4-q7}
> +.endif
> +.endif
> +        mov             r7,  sp
> +
> +        @ Align the stack, allocate a temp buffer
> +T       mov             r12, sp
> +T       bic             r12, r12, #15
> +T       sub             r12, r12, #512
> +T       mov             sp,  r12
> +A       bic             sp,  sp,  #15
> +A       sub             sp,  sp,  #512
> +
> +        mov             r4,  r0
> +        mov             r5,  r1
> +        mov             r6,  r2
> +
> +.ifc \txfm1,idct
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +.endif
> +
> +.irp i, 0, 4, 8, 12
> +        add             r0,  sp,  #(\i*32)
> +        add             r2,  r6,  #(\i*2)
> +        mov             r3,  #\i
> +        bl              \txfm1\()16_1d_4x16_pass1_neon
> +.endr
> +.ifc \txfm2,idct
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +.endif
> +.irp i, 0, 4, 8, 12
> +        add             r0,  r4,  #(\i)
> +        mov             r1,  r5
> +        add             r2,  sp,  #(\i*2)
> +        mov             r3,  #\i
> +        bl              \txfm2\()16_1d_4x16_pass2_neon
> +.endr
> +
> +        mov             sp,  r7
> +.ifc \txfm1,iadst
> +        vpop            {q4-q7}
> +.else
> +.ifc \txfm2,iadst
> +        vpop            {q4-q7}
> +.endif
> +.endif
> +        pop             {r4-r7,pc}
> +endfunc
> +.endm
> +
> +itxfm_func16x16 idct,  idct
> +itxfm_func16x16 iadst, idct
> +itxfm_func16x16 idct,  iadst
> +itxfm_func16x16 iadst, iadst
> +
> +
> +function idct32x32_dc_add_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {d0}, [r12,:64]
> +
> +        vmov.i16        q2, #0
> +
> +        vld1.16         {d16[]},   [r2]

alignment

> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vmull.s16       q8,  d16, d0[0]
> +        vrshrn.s32      d16, q8,  #14
> +        vdup.16         q8,  d16[0]
> +        vst1.16         {d4[0]}, [r2]

ditto

> +
> +        vrshr.s16       q8,  q8,  #6
> +
> +        mov             r12, #32
> +1:
> +        @ Loop to add the constant from q8 into all 32x32 outputs
> +        vld1.8          {q2-q3},  [r0,:128]
> +        vaddw.u8        q10, q8,  d4
> +        vaddw.u8        q11, q8,  d5
> +        vaddw.u8        q12, q8,  d6
> +        vaddw.u8        q13, q8,  d7
> +        vqmovun.s16     d4,  q10
> +        vqmovun.s16     d5,  q11
> +        vqmovun.s16     d6,  q12
> +        vqmovun.s16     d7,  q13
> +        vst1.8          {q2-q3},  [r0,:128], r1
> +        subs            r12, r12, #1
> +        bne             1b
> +
> +        bx              lr
> +endfunc
> +
> +.macro idct32_odd
> +        movrel          r12, idct_coeffs
> +        add             r12, r12, #32
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mbutterfly      d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a
> +        mbutterfly      d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a
> +        mbutterfly      d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a
> +        mbutterfly      d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a
> +        mbutterfly      d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a
> +        mbutterfly      d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a
> +        mbutterfly      d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a
> +        mbutterfly      d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a
> +
> +        sub             r12, r12, #32
> +        vld1.16         {q0}, [r12,:128]
> +
> +        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
> +        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
> +        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
> +        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
> +        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
> +        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
> +        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
> +        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
> +
> +        mbutterfly      d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a
> +        mbutterfly_neg  d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = t18a
> +        mbutterfly      d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a
> +        mbutterfly_neg  d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = t22a
> +
> +        butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
> +        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
> +        butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
> +        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
> +        butterfly       d4,  d28, d28, d30 @ d4  = t24a, d28 = t27a
> +        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
> +        butterfly       d7,  d29, d29, d31 @ d7  = t31a, d29 = t28a
> +        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
> +
> +        mbutterfly      d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a
> +        mbutterfly      d29, d5,  d0[1], d0[2], q12, q15 @ d29 = t19,  d5  = t28
> +        mbutterfly_neg  d28, d6,  d0[1], d0[2], q12, q15 @ d28 = t27,  d6  = t20
> +        mbutterfly_neg  d26, d21, d0[1], d0[2], q12, q15 @ d26 = t26a, d21 = t21a
> +
> +        butterfly       d31, d24, d7,  d4  @ d31 = t31,  d24 = t24
> +        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
> +        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
> +        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
> +        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
> +        butterfly_r     d27, d28, d5,  d28 @ d27 = t27a, d28 = t28a
> +        butterfly       d4,  d26, d20, d26 @ d4  = t29,  d26 = t26
> +        butterfly       d19, d20, d29, d6  @ d19 = t19a, d20 = t20
> +        vmov            d29, d4            @ d29 = t29
> +
> +        mbutterfly0     d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27,  d20 = t20
> +        mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a
> +        mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
> +        mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
> +.endm
> +
> +@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
> +@ We don't have register space to do a single pass IDCT of 4x32 though,
> +@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
> +@ a normal IDCT16 with every other input component (the even ones, with
> +@ each output written twice), followed by a separate 16-point IDCT
> +@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
> +@ r0 = dst (temp buffer)
> +@ r1 = unused
> +@ r2 = src
> +function idct32_1d_4x32_pass1_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        @ Double stride of the input, since we only read every other line
> +        mov             r12, #128
> +        vmov.s16        d4, #0
> +
> +        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +
> +        idct16
> +
> +        @ Do four 4x4 transposes. Originally, d16-d31 contain the
> +        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
> +        @ contain the transposed 4x4 blocks.
> +        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
> +        @ Store the registers a, b, c, d horizontally, followed
> +        @ by the same registers d, c, b, a mirrored.
> +.macro store_rev a, b, c, d
> +.irp i, \a, \b, \c, \d
> +        vst1.16         {d\i}, [r0,:64]!
> +        vrev64.16       d\i, d\i
> +.endr
> +.irp i, \d, \c, \b, \a
> +        vst1.16         {d\i}, [r0,:64]!
> +.endr
> +.endm
> +        store_rev       16, 20, 24, 28
> +        store_rev       17, 21, 25, 29
> +        store_rev       18, 22, 26, 30
> +        store_rev       19, 23, 27, 31
> +        sub             r0,  r0,  #256
> +.purgem store_rev
> +
> +        @ Move r2 back to the start of the input, and move
> +        @ to the first odd row
> +        sub             r2,  r2,  r12, lsl #4
> +        add             r2,  r2,  #64
> +
> +        vmov.s16        d4, #0
> +        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +
> +        idct32_odd
> +
> +        transpose16_q_2x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
> +
> +        @ Store the registers a, b, c, d horizontally,
> +        @ adding into the output first, and then mirrored, subtracted
> +        @ from the output.
> +.macro store_rev a, b, c, d
> +.irp i, \a, \b, \c, \d
> +        vld1.16         {d4},  [r0,:64]
> +        vadd.s16        d4, d4, d\i
> +        vst1.16         {d4},  [r0,:64]!
> +        vrev64.16       d\i, d\i
> +.endr
> +.irp i, \d, \c, \b, \a
> +        vld1.16         {d4},  [r0,:64]
> +        vsub.s16        d4, d4, d\i
> +        vst1.16         {d4},  [r0,:64]!
> +.endr
> +.endm
> +
> +        store_rev 31, 27, 23, 19
> +        store_rev 30, 26, 22, 18
> +        store_rev 29, 25, 21, 17
> +        store_rev 28, 24, 20, 16
> +.purgem store_rev
> +        bx              lr
> +endfunc
> +
> +@ This is mostly the same as 4x32_pass1, but without the transpose;
> +@ it uses the source as a temp buffer between the two idct passes, and
> +@ adds into the destination.
> +@ r0 = dst
> +@ r1 = dst stride
> +@ r2 = src (temp buffer)
> +function idct32_1d_4x32_pass2_neon
> +        movrel          r12, idct_coeffs
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mov             r12, #128
> +        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #4
> +
> +        idct16
> +
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vst1.16         {d\i}, [r2,:64], r12
> +.endr
> +
> +        sub             r2,  r2,  r12, lsl #4
> +        add             r2,  r2,  #64
> +
> +        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #4
> +        sub             r2,  r2,  #64
> +
> +        idct32_odd
> +
> +        mov             r12,  #128
> +.macro load_acc_store a, b, c, d, neg=0
> +        vld1.16         {d4},  [r2,:64], r12
> +        vld1.16         {d5},  [r2,:64], r12
> +.if \neg == 0
> +        vadd.s16        d4, d4, d\a
> +        vld1.16         {d6},  [r2,:64], r12
> +        vadd.s16        d5, d5, d\b
> +        vld1.16         {d7},  [r2,:64], r12
> +        vadd.s16        d6, d6, d\c
> +        vadd.s16        d7, d7, d\d
> +.else
> +        vsub.s16        d4, d4, d\a
> +        vld1.16         {d6},  [r2,:64], r12
> +        vsub.s16        d5, d5, d\b
> +        vld1.16         {d7},  [r2,:64], r12
> +        vsub.s16        d6, d6, d\c
> +        vsub.s16        d7, d7, d\d
> +.endif
> +        vld1.32         {d2[]},   [r0,:32], r1
> +        vld1.32         {d2[1]},  [r0,:32], r1
> +        vrshr.s16       q2, q2, #6
> +        vld1.32         {d3[]},   [r0,:32], r1
> +        vrshr.s16       q3, q3, #6
> +        vld1.32         {d3[1]},  [r0,:32], r1
> +        sub             r0,  r0,  r1, lsl #2
> +        vaddw.u8        q2,  q2,  d2
> +        vaddw.u8        q3,  q3,  d3
> +        vqmovun.s16     d4,  q2
> +        vqmovun.s16     d5,  q3
> +        vst1.32         {d4[0]},  [r0,:32], r1
> +        vst1.32         {d4[1]},  [r0,:32], r1
> +        vst1.32         {d5[0]},  [r0,:32], r1
> +        vst1.32         {d5[1]},  [r0,:32], r1
> +.endm
> +        load_acc_store  31, 30, 29, 28
> +        load_acc_store  27, 26, 25, 24
> +        load_acc_store  23, 22, 21, 20
> +        load_acc_store  19, 18, 17, 16
> +        sub             r2,  r2,  r12
> +        neg             r12, r12
> +        load_acc_store  16, 17, 18, 19, 1
> +        load_acc_store  20, 21, 22, 23, 1
> +        load_acc_store  24, 25, 26, 27, 1
> +        load_acc_store  28, 29, 30, 31, 1
> +.purgem load_acc_store
> +        bx              lr
> +endfunc
> +
> +function ff_vp9_idct_idct_32x32_add_neon, export=1
> +        cmp             r3,  #1
> +        beq             idct32x32_dc_add_neon
> +1:

unused label

> +        push            {r4-r7,lr}
> +        vpush           {q4-q7}
> +        mov             r7,  sp
> +
> +        @ Align the stack, allocate a temp buffer
> +T       mov             r12, sp
> +T       bic             r12, r12, #15
> +T       sub             r12, r12, #2048
> +T       mov             sp,  r12
> +A       bic             sp,  sp,  #15
> +A       sub             sp,  sp,  #2048
> +
> +        mov             r4,  r0
> +        mov             r5,  r1
> +        mov             r6,  r2
> +
> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
> +        add             r0,  sp,  #(\i*64)
> +        add             r2,  r6,  #(\i*2)
> +        bl              idct32_1d_4x32_pass1_neon
> +.endr
> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
> +        add             r0,  r4,  #(\i)
> +        mov             r1,  r5
> +        add             r2,  sp,  #(\i*2)
> +        bl              idct32_1d_4x32_pass2_neon
> +.endr
> +
> +        mov             sp,  r7
> +        vpop            {q4-q7}
> +        pop             {r4-r7,pc}
> +endfunc

Patch OK with nits fixed. Instruction rescheduling can be done in a
separate commit.

Janne
Martin Storsjö Nov. 11, 2016, 8:53 a.m. | #2
On Fri, 11 Nov 2016, Janne Grunau wrote:

> On 2016-10-18 21:07:30 +0300, Martin Storsjö wrote:
>> This work is sponsored by, and copyright, Google.
>>
>> For the transforms up to 8x8, we can fit all the data (including
>> temporaries) in registers and just do a straightforward transform
>> of all the data. For 16x16, we do a transform of 4x16 pixels in
>> 4 slices, using a temporary buffer. For 32x32, we transform 4x32
>> pixels at a time, in two steps of 4x16 pixels each.
>>
>> Examples of relative speedup compared to the C version, from checkasm:
>>                          Cortex       A7     A8     A9    A53
>> vp9_inv_adst_adst_4x4_add_neon:     3.39   5.80   4.18   3.92
>> vp9_inv_adst_adst_8x8_add_neon:     3.94   4.82   4.25   3.89
>> vp9_inv_adst_adst_16x16_add_neon:   3.33   4.27   4.08   4.05
>> vp9_inv_dct_dct_4x4_add_neon:       3.73   5.06   4.26   4.28
>> vp9_inv_dct_dct_8x8_add_neon:       4.59   5.81   5.03   4.73
>> vp9_inv_dct_dct_16x16_add_neon:     3.40   3.39   3.33   3.68
>> vp9_inv_dct_dct_32x32_add_neon:     4.00   3.51   3.80   4.40
>> vp9_inv_wht_wht_4x4_add_neon:       3.24   5.16   3.52   3.67
>>
>> Thus, the speedup vs C code is around 3-5x.
>>
>> This is mostly marginally faster than the corresponding routines
>> in libvpx on most cores, tested with their 32x32 idct (compared to
>> vpx_idct32x32_1024_add_neon). These numbers are slightly in libvpx's
>> favour since their version doesn't clear the input buffer like ours
>> does (although the effect of that on the total runtime probably is
>> negligible.)
>>
>>                            Cortex       A7       A8       A9      A53
>> vp9_inv_dct_dct_32x32_add_neon:    18852.0  16831.6  14217.4  11988.6
>> libvpx vpx_idct32x32_1024_add_neon 20789.0  13344.3  15049.9  13030.5
>>
>> Only on the Cortex A8, the libvpx function is faster. On the other cores,
>> ours is slightly faster even though ours has got source block clearing
>> integrated.
>> ---
>> v2: Updated some broken macro comments, optimized the transposes by
>> using the q registers for part of transposes.
>>
>> Suggestions very much welcome on names for the macros - no idea if
>> the current ones make sense or what one commonly would call these
>> combinations.
>>
>> I'm a bit reluctant to expand the macros (to be able to schedule
>> instructions better), in order to keep things readable. (Although,
>> I guess this is kinda write-only code, which nobody ever touches
>> afterwards).
>> ---
>>  libavcodec/arm/Makefile          |    3 +-
>>  libavcodec/arm/vp9dsp_init_arm.c |   51 +-
>>  libavcodec/arm/vp9itxfm_neon.S   | 1166 ++++++++++++++++++++++++++++++++++++++
>>  3 files changed, 1218 insertions(+), 2 deletions(-)
>>  create mode 100644 libavcodec/arm/vp9itxfm_neon.S
>>

>> +
>> +const itxfm4_coeffs, align=4
>> +        .short  11585, 6270, 15137, 0
>> +iadst4_coeffs:
>> +        .short  5283, 15212, 9929, 13377
>> +endconst
>> +
>> +const iadst8_coeffs, align=4
>> +        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
>> +endconst
>> +
>> +const idct_coeffs, align=4
>> +        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
>> +        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
>> +        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
>> +        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
>> +endconst
>> +
>> +const iadst16_coeffs, align=4
>> +        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
>> +        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
>> +endconst
>> +
>> +@ Do two 4x4 transposes, using q registers for the subtransposes that don't
>
> it's four 4x4 transposes

Indeed; fixed the comment and the macro name

>> +@ need to address the individual d registers.
>> +@ r0,r1 == rq0, r2,r3 == rq1, etc
>> +.macro transpose16_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
>> +        vtrn.32          \rq0, \rq1
>> +        vtrn.32          \rq2, \rq3
>> +        vtrn.32          \rq4, \rq5
>> +        vtrn.32          \rq6, \rq7
>> +        vtrn.16          \r0,  \r1
>> +        vtrn.16          \r2,  \r3
>> +        vtrn.16          \r4,  \r5
>> +        vtrn.16          \r6,  \r7
>> +        vtrn.16          \r8,  \r9
>> +        vtrn.16          \r10, \r11
>> +        vtrn.16          \r12, \r13
>> +        vtrn.16          \r14, \r15
>> +.endm
>> +
>> +@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
>> +@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
>> +@ in/out are d registers
>> +.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
>> +        vadd.s16        \tmpd1, \in1,  \in2
>> +        vsub.s16        \tmpd2, \in1,  \in2
>> +        vmull.s16       \tmpq3, \tmpd1, d0[0]
>> +        vmull.s16       \tmpq4, \tmpd2, d0[0]
>> +.if \neg > 0
>> +        vneg.s32        \tmpq3, \tmpq3
>> +.endif
>> +        vrshrn.s32      \out1, \tmpq3, #14
>> +        vrshrn.s32      \out2, \tmpq4, #14
>> +.endm
>
> an empty line after .endm improves the readability

Done (likewise for the aarch64 version)

>> +@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
>> +@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
>> +@ Same as mbutterfly0, but with input being 2 q registers, output
>> +@ being 4 d registers.
>> +@ This can do with either 4 or 6 temporary q registers.
>> +.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
>> +        vadd.s16        \tmpq1, \in1,  \in2
>> +        vsub.s16        \tmpq2, \in1,  \in2
>> +        vmull.s16       \tmpq3, \tmpd11, d0[0]
>> +        vmull.s16       \tmpq4, \tmpd12, d0[0]
>> +.ifb \tmpq5
>> +        vrshrn.s32      \out1, \tmpq3, #14
>> +        vrshrn.s32      \out2, \tmpq4, #14
>> +        vmull.s16       \tmpq3, \tmpd21, d0[0]
>> +        vmull.s16       \tmpq4, \tmpd22, d0[0]
>> +        vrshrn.s32      \out3, \tmpq3, #14
>> +        vrshrn.s32      \out4, \tmpq4, #14
>> +.else
>> +        vmull.s16       \tmpq5, \tmpd21, d0[0]
>> +        vmull.s16       \tmpq6, \tmpd22, d0[0]
>> +        vrshrn.s32      \out1, \tmpq3, #14
>> +        vrshrn.s32      \out2, \tmpq4, #14
>> +        vrshrn.s32      \out3, \tmpq5, #14
>> +        vrshrn.s32      \out4, \tmpq6, #14
>> +.endif
>> +.endm
>> +@ out1 = in1 * coef1 - in2 * coef2
>> +@ out2 = in1 * coef2 + in2 * coef1
>> +@ out are 2 q registers, in are 2 d registers
>> +.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
>> +        vmull.s16       \out1, \in1, \coef1
>> +        vmlsl.s16       \out1, \in2, \coef2
>> +        vmull.s16       \out2, \in1, \coef2
>
> doing the second vmull before the vmlsl is preferable on in-order units

That's what I would have expected as well, but it seems to have a negative 
effect on A8 and A53 (and A9!); only A7 seems to gain from it.

Current version:
vp9_inv_adst_adst_16x16_add_neon:   4622.3   2989.9   2901.8   2609.6
With the vmlsl/vmull swapped:
vp9_inv_adst_adst_16x16_add_neon:   4119.0   3242.2   3204.3   2907.5

Thus keeping it in the current form

>> +        vmlal.s16       \out2, \in2, \coef1
>> +.endm
>> +@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
>> +@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
>> +@ out are 4 q registers, in are 4 d registers
>> +.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
>> +        vmull.s16       \out1, \in1, \coef1
>> +        vmull.s16       \out2, \in2, \coef1
>> +        vmull.s16       \out3, \in1, \coef2
>> +        vmull.s16       \out4, \in2, \coef2
>> +        vmlsl.s16       \out1, \in3, \coef2
>> +        vmlsl.s16       \out2, \in4, \coef2
>> +        vmlal.s16       \out3, \in3, \coef1
>> +        vmlal.s16       \out4, \in4, \coef1
>> +.endm
>> +@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14
>> +@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14
>> +@ in are 2 d registers, tmp are 2 q registers
>> +.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0
>> +        mbutterfly_l    \tmp1, \tmp2, \in1, \in2, \coef1, \coef2
>> +.if \neg > 0
>> +        vneg.s32        \tmp2, \tmp2
>> +.endif
>> +        vrshrn.s32      \in1, \tmp1,  #14
>> +        vrshrn.s32      \in2, \tmp2,  #14
>> +.endm
>> +@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
>> +@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
>> +@ inout are 4 d registers, tmp are 4 q registers
>> +.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
>> +        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
>> +        vrshrn.s32      \inout1, \tmp1,  #14
>> +        vrshrn.s32      \inout2, \tmp2,  #14
>> +        vrshrn.s32      \inout3, \tmp3,  #14
>> +        vrshrn.s32      \inout4, \tmp4,  #14
>> +.endm
>> +.macro mbutterfly_neg in1, in2, coef1, coef2, tmp1, tmp2
>> +        mbutterfly      \in1, \in2, \coef1, \coef2, \tmp1, \tmp2, 1
>> +.endm
>
> this macro is a little pointless, readability is not really worse for
>
> mbutterfly ..., neg=1 vs mbutterfly_neg ...

Yeah; the only advantage is that it fits the "neg" part in a place where 
we already would have had whitespace:

         mbutterfly      d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a
         mbutterfly_neg  d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = t18a
         mbutterfly      d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a
         mbutterfly_neg  d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = t22a

vs

         mbutterfly      d23, d24, d0[3], d1[0], q8, q9        @ d23 = t17a, d24 = t30a
         mbutterfly      d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
         mbutterfly      d21, d26, d1[1], d1[2], q8, q9        @ d21 = t21a, d26 = t26a
         mbutterfly      d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a

Now looking at it afterwards, it doesn't look all that bad though and the 
lines are already very long, so I'll change it and reduce the macro soup 
by getting rid of this unnecessary one.



>> +@ out1 = in1 + in2
>> +@ out2 = in1 - in2
>> +.macro butterfly out1, out2, in1, in2
>> +        vadd.s16        \out1, \in1, \in2
>> +        vsub.s16        \out2, \in1, \in2
>> +.endm
>> +@ out1 = in1 - in2
>> +@ out2 = in1 + in2
>> +.macro butterfly_r out1, out2, in1, in2
>> +        vsub.s16        \out1, \in1, \in2
>> +        vadd.s16        \out2, \in1, \in2
>> +.endm
>> +@ out1 = (in1 + in2 + (1 << 13)) >> 14
>> +@ out2 = (in1 - in2 + (1 << 13)) >> 14
>> +@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
>> +.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
>> +        vadd.s32        \tmp1, \in1, \in2
>> +        vsub.s32        \tmp2, \in1, \in2
>> +        vrshrn.s32      \out1, \tmp1,  #14
>> +        vrshrn.s32      \out2, \tmp2,  #14
>> +.endm
>> +@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
>> +@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
>> +@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
>> +.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
>> +        vadd.s32        \tmp1, \in1, \in3
>> +        vadd.s32        \tmp2, \in2, \in4
>> +        vsub.s32        \tmp3, \in1, \in3
>> +        vsub.s32        \tmp4, \in2, \in4
>> +        vrshrn.s32      \out1, \tmp1,  #14
>> +        vrshrn.s32      \out2, \tmp2,  #14
>> +        vrshrn.s32      \out3, \tmp3,  #14
>> +        vrshrn.s32      \out4, \tmp4,  #14
>> +.endm
>> +
>> +
>> +.macro iwht4 c0, c1, c2, c3
>> +        vadd.i16        \c0,  \c0,  \c1
>> +        vsub.i16        d17,  \c2,  \c3
>> +        vsub.i16        d16,  \c0,  d17
>> +        vshr.s16        d16,  d16,  #1
>> +        vsub.i16        \c2,  d16,  \c1
>> +        vsub.i16        \c1,  d16,  \c3
>> +        vadd.i16        \c3,  d17,  \c2
>> +        vsub.i16        \c0,  \c0,  \c1
>> +.endm
>> +
>> +.macro idct4 c0, c1, c2, c3
>> +        vadd.i16        d16,  \c0,  \c2
>> +        vsub.i16        d17,  \c0,  \c2
>> +        vmull.s16       q11,  \c1,  d0[1]
>> +        vmull.s16       q12,  \c3,  d0[2]
>
> vmlsl.s16 q11, \c3,  d0[2] and reorder for in-order
>
>> +        vmull.s16       q13,  \c1,  d0[2]
>> +        vmull.s16       q14,  \c3,  d0[1]
>
> vmlal.s16 q13, \c3,  d0[1]

Done

Before:
vp9_inv_dct_dct_4x4_add_neon:  117.0   69.0   85.0   79.0

After:
vp9_inv_dct_dct_4x4_add_neon:  108.7   65.0   79.0   78.0


>> +        vmull.s16       q9,   d16,  d0[0]
>> +        vmull.s16       q10,  d17,  d0[0]
>> +        vadd.i32        q13,  q13,  q14
>> +        vsub.i32        q11,  q11,  q12
>> +        vrshrn.s32      d16,  q9,   #14
>> +        vrshrn.s32      d19,  q13,  #14
>> +        vrshrn.s32      d17,  q10,  #14
>> +        vrshrn.s32      d18,  q11,  #14
>> +        vadd.i16        \c0,  d16,  d19
>> +        vadd.i16        \c1,  d17,  d18
>> +        vsub.i16        \c2,  d17,  d18
>> +        vsub.i16        \c3,  d16,  d19
>> +.endm
>> +
>> +.macro iadst4 c0, c1, c2, c3
>> +        vmull.s16       q10,  \c0,  d1[0]
>> +        vmlal.s16       q10,  \c2,  d1[1]
>> +        vmlal.s16       q10,  \c3,  d1[2]
>> +        vmull.s16       q11,  \c0,  d1[2]
>> +        vmlsl.s16       q11,  \c2,  d1[0]
>> +        vsub.s16        \c0,  \c0,  \c2
>> +        vmlsl.s16       q11,  \c3,  d1[1]
>> +        vadd.s16        \c0,  \c0,  \c3
>> +        vmull.s16       q13,  \c1,  d1[3]
>> +        vmull.s16       q12,  \c0,  d1[3]
>> +        vadd.s32        q14,  q10,  q13
>> +        vadd.s32        q1,   q11,  q13
>> +        vrshrn.s32      \c0,  q14,  #14
>> +        vadd.s32        q10,  q10,  q11
>> +        vrshrn.s32      \c1,  q1,   #14
>> +        vsub.s32        q10,  q10,  q13
>> +        vrshrn.s32      \c2,  q12,  #14
>> +        vrshrn.s32      \c3,  q10,  #14
>
> instruction scheduling can be optimized for this one too

I've done a bit of testing with different ordering on this one already, 
and while I can improve quite a bit on the A7, the A8 and A53 immediately 
get slower at the same time, I haven't found any better compromise yet.

>> +.endm
>> +
>> +@ The public functions in this file have got the following signature:
>> +@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
>> +
>> +.macro itxfm_func4x4 txfm1, txfm2
>> +function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
>> +.ifc \txfm1,\txfm2
>> +.ifc \txfm1,idct
>> +        movrel          r12, itxfm4_coeffs
>> +        vld1.16         {d0}, [r12,:64]
>> +.endif
>> +.ifc \txfm1,iadst
>> +        movrel          r12, iadst4_coeffs
>> +        vld1.16         {d1}, [r12,:64]
>> +.endif
>> +.else
>> +        movrel          r12, itxfm4_coeffs
>> +        vld1.16         {q0}, [r12,:128]
>> +.endif
>
> aligned 8 byte and 16 byte loads are equally fast so this adds just
> complexity without gain

checkasm --bench disagrees; A7 and A53 get around one cycle slower on 
dct_dct_4x4 and adst_adst_4x4, if I just load the full q0 from 
itxfm4_coeffs.

Leaving this as is.

>> +
>> +        vmov.i16        q15, #0
>> +.ifc \txfm1,idct
>> +.ifc \txfm2,idct
>> +        cmp             r3,  #1
>> +        bne             1f
>> +        @ DC-only for idct/idct
>> +        vld1.16         {d4[]},   [r2]
>
> alignment

Added :16 alignment here

>> +        vmull.s16       q2,  d4,  d0[0]
>> +        vrshrn.s32      d4,  q2,  #14
>> +        vmull.s16       q2,  d4,  d0[0]
>> +        vrshrn.s32      d4,  q2,  #14
>> +        vst1.16         {d30[0]}, [r2]
>
> same

Done

>> +        vdup.16         q2,  d4[0]
>> +        vmov            q3,  q2
>
> vdup first to q3 to avoid data dependency

Do you mean vdup.16 q2, d4[0], vdup.16 q3, d4[0]? (I don't see the point 
here in duping into q3 and doing vmov q2, q3, if that is what you meant.) 
That turns out to be 1 cycle slower on A53 and A8, 1.5 cycles slower on 
A9.

>> +        b               2f
>> +.endif
>> +.endif
>> +
>> +1:
>> +        vld1.16         {d4-d7},  [r2,:128]
>> +        vst1.16         {q15}, [r2,:128]!
>> +
>> +.ifc \txfm1,iwht
>> +        vshr.s16        q2,  q2,  #2
>> +        vshr.s16        q3,  q3,  #2
>> +.endif
>> +
>> +        \txfm1\()4      d4,  d5,  d6,  d7
>> +
>> +        vst1.16         {q15}, [r2,:128]!
>> +        @ Transpose 4x4 with 16 bit elements
>> +        vtrn.16         d4,  d5
>> +        vtrn.16         d6,  d7
>> +        vtrn.32         d4,  d6
>> +        vtrn.32         d5,  d7
>
> vtrn.32 q2, q3

Done

>> +
>> +        \txfm2\()4      d4,  d5,  d6,  d7
>> +2:
>> +        vld1.32         {d0[]},   [r0,:32], r1
>> +        vld1.32         {d0[1]},  [r0,:32], r1
>> +.ifnc \txfm1,iwht
>> +        vrshr.s16       q2,  q2,  #4
>> +        vrshr.s16       q3,  q3,  #4
>> +.endif
>> +        vaddw.u8        q2,  q2,  d0
>> +        vld1.32         {d1[]},   [r0,:32], r1
>> +        vld1.32         {d1[1]},  [r0,:32], r1
>> +        vqmovun.s16     d0,  q2
>> +        sub             r0,  r0,  r1, lsl #2
>
> since we have free gp registers I'd use a different register for load and
> store. It's probably not faster though

Around one cycle slower on A7 and A53, so skipped. I also tried using 
separate registers for storing odd/even lines, with double stride, but 
that was a pretty large loss on A8, so skipped that for now as well.

>> +
>> +        vaddw.u8        q3,  q3,  d1
>> +        vst1.32         {d0[0]},  [r0,:32], r1
>> +        vqmovun.s16     d1,  q3
>> +
>> +        vst1.32         {d0[1]},  [r0,:32], r1
>> +        vst1.32         {d1[0]},  [r0,:32], r1
>> +        vst1.32         {d1[1]},  [r0,:32], r1
>> +
>> +        bx              lr
>> +endfunc
>> +.endm
>> +
>> +itxfm_func4x4 idct,  idct
>> +itxfm_func4x4 iadst, idct
>> +itxfm_func4x4 idct,  iadst
>> +itxfm_func4x4 iadst, iadst
>> +itxfm_func4x4 iwht,  iwht
>> +
>> +
>> +.macro idct8
>> +        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
>> +        dmbutterfly     d20, d21, d28, d29, d0[1], d0[2], q2,  q3,  q4,  q5 @ q10 = t2a, q14 = t3a
>> +        dmbutterfly     d18, d19, d30, d31, d0[3], d1[0], q2,  q3,  q4,  q5 @ q9  = t4a, q15 = t7a
>> +        dmbutterfly     d26, d27, d22, d23, d1[1], d1[2], q2,  q3,  q4,  q5 @ q13 = t5a, q11 = t6a
>> +
>> +        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
>> +        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
>> +        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
>> +        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
>> +
>> +        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
>> +
>> +        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
>> +
>> +        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
>> +        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
>> +        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
>> +.endm
>> +
>> +.macro iadst8
>> +        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d2[1], d2[0] @ q4,q5  = t1a, q2,q3 = t0a
>> +        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
>> +
>> +        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
>> +
>> +        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
>> +
>> +        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
>> +        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
>> +
>> +        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5 @ q9 = t2, q4 = t6
>> +        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7 @ q8 = t3, q6 = t7
>> +
>> +        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
>> +        vneg.s16        q15, q15          @ q15 = out[7]
>> +        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
>> +
>> +        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d0[1], d0[2] @ q10,q11 = t5a, q5,q7 = t4a
>> +        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d0[2], d0[1] @ q2,q3 = t6a, q13,q14 = t7a
>> +
>> +        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
>> +
>> +        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
>> +        vneg.s16        q11, q11      @ q11 = out[3]
>> +
>> +        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3 @ q9 = -out[1], q2 = t6
>> +        vneg.s16        q9,  q9       @ q9 = out[1]
>> +
>> +        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
>> +        vneg.s16        q13, q13      @ q13 = out[5]
>> +.endm
>> +
>> +
>> +.macro itxfm_func8x8 txfm1, txfm2
>> +function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
>> +        @ Push q4-q7 if iadst is used, idct requires
>> +        @ a few scratch registers less, so only push q4-q5
>> +        @ if only idct is involved.
>> +        @ The iadst also uses a few coefficients from
>> +        @ idct, so those always need to be loaded.
>> +        movrel          r12, idct_coeffs
>
> move this into the last else below
>
>> +        vld1.16         {q0}, [r12,:128]
>
> this can follow after this block if the iadst8_coeffs load uses post
> increment

Done

>> +.ifc \txfm1,iadst
>> +        movrel          r12, iadst8_coeffs
>> +        vld1.16         {q1}, [r12,:128]
>> +        vpush           {q4-q7}
>> +.else
>> +.ifc \txfm2,iadst
>
> does .elseifc work?

No, unfortunately it doesn't, so afaik there's no way to do .elseif with 
string comparisons

>> +        movrel          r12, iadst8_coeffs
>> +        vld1.16         {q1}, [r12,:128]
>> +        vpush           {q4-q7}
>> +.else
>> +        vpush           {q4-q5}
>> +.endif
>> +.endif
>> +
>> +        vmov.i16        q2, #0
>> +        vmov.i16        q3, #0
>> +
>> +.ifc \txfm1,idct
>> +.ifc \txfm2,idct
>> +        cmp             r3,  #1
>> +        bne             1f
>> +        @ DC-only for idct/idct
>> +        vld1.16         {d16[]},   [r2]
>
> alignment

Done

>> +        vmull.s16       q8,  d16, d0[0]
>> +        vrshrn.s32      d16, q8,  #14
>> +        vmull.s16       q8,  d16, d0[0]
>> +        vrshrn.s32      d16, q8,  #14
>> +        vdup.16         q8,  d16[0]
>> +        vmov            q9,  q8
>> +        vmov            q10, q8
>> +        vmov            q11, q8
>> +        vmov            q12, q8
>> +        vmov            q13, q8
>> +        vmov            q14, q8
>> +        vmov            q15, q8
>
> all duped from d16[0]

That ends up consistently around 1 cycle slower on most cores, so not done

>> +        vst1.16         {d4[0]}, [r2]
>
> alignment

Done

>> +        b               2f
>> +.endif
>> +.endif
>> +1:
>> +        vld1.16         {q8-q9},    [r2,:128]!
>> +        vld1.16         {q10-q11},  [r2,:128]!
>> +        vld1.16         {q12-q13},  [r2,:128]!
>> +        vld1.16         {q14-q15},  [r2,:128]!
>> +        sub             r2,  r2,  #128
>> +        vst1.16         {q2-q3}, [r2,:128]!
>> +        vst1.16         {q2-q3}, [r2,:128]!
>> +        vst1.16         {q2-q3}, [r2,:128]!
>> +        vst1.16         {q2-q3}, [r2,:128]!
>> +
>> +        \txfm1\()8
>> +
>> +        @ Transpose 8x8 with 16 bit elements
>> +        vswp            d17, d24
>> +        vswp            d19, d26
>> +        vswp            d21, d28
>> +        vswp            d23, d30
>> +        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
>> +
>> +        \txfm2\()8
>> +2:
>> +        @ Add into the destination
>> +        vld1.8          {d4},  [r0,:64], r1
>> +        vrshr.s16       q8,  q8,  #5
>> +        vld1.8          {d5},  [r0,:64], r1
>> +        vrshr.s16       q9,  q9,  #5
>> +        vld1.8          {d6},  [r0,:64], r1
>> +        vrshr.s16       q10, q10, #5
>> +        vaddw.u8        q8,  q8,  d4
>> +        vld1.8          {d7},  [r0,:64], r1
>> +        vrshr.s16       q11, q11, #5
>> +        vaddw.u8        q9,  q9,  d5
>> +        vld1.8          {d8},  [r0,:64], r1
>> +        vrshr.s16       q12, q12, #5
>> +        vaddw.u8        q10, q10, d6
>> +        vqmovun.s16     d4,  q8
>> +        vld1.8          {d9},  [r0,:64], r1
>> +        vrshr.s16       q13, q13, #5
>> +        vaddw.u8        q11, q11, d7
>> +        vqmovun.s16     d5,  q9
>> +        vld1.8          {d10}, [r0,:64], r1
>> +        vrshr.s16       q14, q14, #5
>> +        vaddw.u8        q12, q12, d8
>> +        vqmovun.s16     d6,  q10
>> +        vld1.8          {d11}, [r0,:64], r1
>> +        vrshr.s16       q15, q15, #5
>> +        vaddw.u8        q13, q13, d9
>> +        vqmovun.s16     d7,  q11
>> +        sub             r0,  r0,  r1, lsl #3
>
> could use a different register for loads and stores

Seems to help a little here, thus done

>> +
>> +        vst1.8          {d4},  [r0,:64], r1
>> +        vaddw.u8        q14, q14, d10
>> +        vst1.8          {d5},  [r0,:64], r1
>> +        vqmovun.s16     d8,  q12
>> +        vst1.8          {d6},  [r0,:64], r1
>> +        vaddw.u8        q15, q15, d11
>> +        vst1.8          {d7},  [r0,:64], r1
>> +        vqmovun.s16     d9,  q13
>> +        vst1.8          {d8},  [r0,:64], r1
>> +        vqmovun.s16     d10, q14
>> +        vst1.8          {d9},  [r0,:64], r1
>> +        vqmovun.s16     d11, q15
>> +
>> +        vst1.8          {d10}, [r0,:64], r1
>> +        vst1.8          {d11}, [r0,:64], r1
>> +
>> +.ifc \txfm1,iadst
>> +        vpop            {q4-q7}
>> +.else
>> +.ifc \txfm2,iadst
>> +        vpop            {q4-q7}
>> +.else
>> +        vpop            {q4-q5}
>> +.endif
>> +.endif
>> +        bx              lr
>> +endfunc
>> +.endm
>> +
>> +itxfm_func8x8 idct,  idct
>> +itxfm_func8x8 iadst, idct
>> +itxfm_func8x8 idct,  iadst
>> +itxfm_func8x8 iadst, iadst
>> +
>> +
>> +function idct16x16_dc_add_neon
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {d0}, [r12,:64]
>> +
>> +        vmov.i16        q2, #0
>> +
>> +        vld1.16         {d16[]},   [r2]
>
> alignment

Done

>> +        vmull.s16       q8,  d16, d0[0]
>> +        vrshrn.s32      d16, q8,  #14
>> +        vmull.s16       q8,  d16, d0[0]
>> +        vrshrn.s32      d16, q8,  #14
>> +        vdup.16         q8,  d16[0]
>> +        vst1.16         {d4[0]}, [r2]
>
> alignment

Done

>> +
>> +        vrshr.s16       q8,  q8,  #6
>> +
>> +        mov             r12, #16
>> +1:
>> +        @ Loop to add the constant from q8 into all 16x16 outputs
>> +        vld1.8          {q3},  [r0,:128]
>> +        vaddw.u8        q10, q8,  d6
>> +        vaddw.u8        q11, q8,  d7
>> +        vqmovun.s16     d6,  q10
>> +        vqmovun.s16     d7,  q11
>> +        vst1.8          {q3},  [r0,:128], r1
>> +        subs            r12, r12, #1
>> +        bne             1b
>> +
>> +        bx              lr
>> +endfunc
>> +
>> +.macro idct16
>> +        mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
>> +        mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
>> +        mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
>> +        mbutterfly      d26, d22, d1[1], d1[2], q2,  q3  @ d26 = t5a,  d22 = t6a
>> +        mbutterfly      d17, d31, d1[3], d2[0], q2,  q3  @ d17 = t8a,  d31 = t15a
>> +        mbutterfly      d25, d23, d2[1], d2[2], q2,  q3  @ d25 = t9a,  d23 = t14a
>> +        mbutterfly      d21, d27, d2[3], d3[0], q2,  q3  @ d21 = t10a, d27 = t13a
>> +        mbutterfly      d29, d19, d3[1], d3[2], q2,  q3  @ d29 = t11a, d19 = t12a
>> +
>> +        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
>> +        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
>> +        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
>> +        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
>> +        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
>> +        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
>> +        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
>> +        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
>> +
>> +        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15 @ d22 = t6a, d26 = t5a
>> +        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15 @ d23 = t9a, d25 = t14a
>> +        mbutterfly_neg  d27, d21, d0[1], d0[2], q9,  q15 @ d27 = t13a, d21 = t10a
>> +
>> +        butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = t7a
>> +        butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = t6
>> +        butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = t5
>> +        butterfly       d5,  d6,  d28, d6                @ d5  = t3a,  d6  = t4
>> +        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
>> +        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
>> +        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
>> +        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
>> +
>> +        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
>> +        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
>> +
>> +        vswp            d27, d29                         @ d27 = t12, d29 = t13a
>> +        vswp            d28, d27                         @ d28 = t12, d27 = t11
>> +        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
>> +        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
>> +        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
>> +        butterfly       d23, d24, d7,  d20               @ d23 = out[7], d24 = out[8]
>> +        butterfly       d18, d29, d4,  d29               @ d18 = out[2], d29 = out[13]
>> +        butterfly       d19, d28, d5,  d28               @ d19 = out[3], d28 = out[12]
>> +        vmov            d4,  d21                         @ d4  = t10a
>> +        butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = out[11]
>> +        butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = out[10]
>> +.endm
>> +
>> +.macro iadst16
>> +        movrel          r12, iadst16_coeffs
>> +        vld1.16         {q0-q1}, [r12,:128]
>> +
>> +        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
>> +        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = t8
>> +        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
>> +        mbutterfly_l    q7,  q6,  d29, d18, d0[3], d0[2] @ q7  = t3,   q6  = t2
>> +        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
>> +
>> +        mbutterfly_l    q3,  q2,  d21, d26, d2[3], d2[2] @ q3  = t11,  q2  = t10
>> +        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
>> +        mbutterfly_l    q5,  q4,  d27, d20, d1[1], d1[0] @ q5  = t5,   q4  = t4
>> +        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
>> +
>> +        mbutterfly_l    q7,  q6,  d19, d28, d3[1], d3[0] @ q7  = t13,  q6  = t12
>> +        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
>> +        mbutterfly_l    q3,  q2,  d25, d22, d1[3], d1[2] @ q3  = t7,   q2  = t6
>> +        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
>> +
>> +        mbutterfly_l    q5,  q4,  d17, d30, d3[3], d3[2] @ q5  = t15,  q4  = t14
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {q0}, [r12,:128]
>> +        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
>> +        mbutterfly_l    q7,  q6,  d23, d24, d0[3], d1[0] @ q7  = t9,   q6  = t8
>> +        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
>> +
>> +        mbutterfly_l    q2,  q3,  d28, d19, d1[0], d0[3] @ q2  = t12,  q3  = t13
>> +        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
>> +        mbutterfly_l    q5,  q4,  d21, d26, d1[1], d1[2] @ q5  = t11,  q4  = t10
>> +        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
>> +        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
>> +
>> +        mbutterfly_l    q6,  q7,  d30, d17, d1[2], d1[1] @ q6  = t14,  q7  = t15
>> +        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
>> +        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
>> +        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
>> +
>> +        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
>> +        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
>> +
>> +        mbutterfly_l    q5,  q4,  d19, d28, d0[1], d0[2] @ q5  = t13,  q4  = t12
>> +        mbutterfly_l    q6,  q7,  d30, d17, d0[2], d0[1] @ q6  = t14,  q7  = t15
>> +
>> +        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
>> +        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
>> +        vneg.s16        d29, d29                         @ d29 = out[13]
>> +
>> +        mbutterfly_l    q5,  q4,  d4,  d5,  d0[1], d0[2] @ q5  = t5a,  q4  = t4a
>> +        mbutterfly_l    q6,  q7,  d7,  d6,  d0[2], d0[1] @ q6  = t6a,  q7  = t7a
>> +
>> +        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
>> +        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
>> +
>> +        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
>> +        vneg.s16        d19, d19                         @ d19 = out[3]
>> +        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
>> +
>> +        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
>> +        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
>> +
>> +        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
>> +        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
>> +        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
>> +        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
>> +
>> +        vneg.s16        d31, d5                          @ d31 = out[15]
>> +        vneg.s16        d17, d3                          @ d17 = out[1]
>> +
>> +        vmov            d16, d2
>> +        vmov            d30, d4
>> +.endm
>> +
>> +.macro itxfm16_1d_funcs txfm
>> +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
>> +@ transpose into a horizontal 16x4 slice and store.
>> +@ r0 = dst (temp buffer)
>> +@ r1 = unused
>> +@ r2 = src
>> +@ r3 = slice offset
>> +function \txfm\()16_1d_4x16_pass1_neon
>> +        mov             r12, #32
>> +        vmov.s16        q2, #0
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> +        vld1.16         {d\i}, [r2,:64]
>> +        vst1.16         {d4},  [r2,:64], r12
>> +.endr
>> +
>> +        \txfm\()16
>> +
>> +        @ Do four 4x4 transposes. Originally, d16-d31 contain the
>> +        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
>> +        @ contain the transposed 4x4 blocks.
>> +        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
>> +
>> +        @ Store the transposed 4x4 blocks horizontally.
>> +        cmp             r3,  #12
>> +        beq             1f
>> +.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
>> +        vst1.16         {d\i}, [r0,:64]!
>> +.endr
>> +        bx              lr
>> +1:
>> +        @ Special case: For the last input column (r3 == 12),
>> +        @ which would be stored as the last row in the temp buffer,
>> +        @ don't store the first 4x4 block, but keep it in registers
>> +        @ for the first slice of the second pass (where it is the
>> +        @ last 4x4 block).
>> +        add             r0,  r0,  #8
>> +.irp i, 20, 24, 28
>> +        vst1.16         {d\i}, [r0,:64]!
>> +.endr
>> +        add             r0,  r0,  #8
>> +.irp i, 21, 25, 29
>> +        vst1.16         {d\i}, [r0,:64]!
>> +.endr
>> +        add             r0,  r0,  #8
>> +.irp i, 22, 26, 30
>> +        vst1.16         {d\i}, [r0,:64]!
>> +.endr
>> +        add             r0,  r0,  #8
>> +.irp i, 23, 27, 31
>> +        vst1.16         {d\i}, [r0,:64]!
>> +.endr
>> +        vmov            d28, d16
>> +        vmov            d29, d17
>> +        vmov            d30, d18
>> +        vmov            d31, d19
>> +        bx              lr
>> +endfunc
>> +
>> +@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
>> +@ load the destination pixels (from a similar 4x16 slice), add and store back.
>> +@ r0 = dst
>> +@ r1 = dst stride
>> +@ r2 = src (temp buffer)
>> +@ r3 = slice offset
>> +function \txfm\()16_1d_4x16_pass2_neon
>> +        mov             r12, #32
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
>> +        vld1.16         {d\i}, [r2,:64], r12
>> +.endr
>> +        cmp             r3,  #0
>> +        beq             1f
>> +.irp i, 28, 29, 30, 31
>> +        vld1.16         {d\i}, [r2,:64], r12
>> +.endr
>> +1:
>> +
>> +        \txfm\()16
>> +
>> +.macro load_add_store coef0, coef1, coef2, coef3
>> +        vrshr.s16       \coef0, \coef0, #6
>> +        vrshr.s16       \coef1, \coef1, #6
>> +
>> +        vld1.32         {d4[]},   [r0,:32], r1
>> +        vld1.32         {d4[1]},  [r0,:32], r1
>> +        vrshr.s16       \coef2, \coef2, #6
>> +        vrshr.s16       \coef3, \coef3, #6
>> +        vld1.32         {d5[]},   [r0,:32], r1
>> +        vld1.32         {d5[1]},  [r0,:32], r1
>> +        vaddw.u8        \coef0, \coef0, d4
>> +        vld1.32         {d6[]},   [r0,:32], r1
>> +        vld1.32         {d6[1]},  [r0,:32], r1
>> +        vaddw.u8        \coef1, \coef1, d5
>> +        vld1.32         {d7[]},   [r0,:32], r1
>> +        vld1.32         {d7[1]},  [r0,:32], r1
>> +
>> +        vqmovun.s16     d4,  \coef0
>> +        vqmovun.s16     d5,  \coef1
>> +        sub             r0,  r0,  r1, lsl #3
>
> could use an additional register

Done, but by using two registers for loading/storing alternating rows,
which gave a larger speedup on all cores (but requires two subs in between
instead)

>> +        vaddw.u8        \coef2, \coef2, d6
>> +        vaddw.u8        \coef3, \coef3, d7
>> +        vst1.32         {d4[0]},  [r0,:32], r1
>> +        vst1.32         {d4[1]},  [r0,:32], r1
>> +        vqmovun.s16     d6,  \coef2
>> +        vst1.32         {d5[0]},  [r0,:32], r1
>> +        vst1.32         {d5[1]},  [r0,:32], r1
>> +        vqmovun.s16     d7,  \coef3
>> +
>> +        vst1.32         {d6[0]},  [r0,:32], r1
>> +        vst1.32         {d6[1]},  [r0,:32], r1
>> +        vst1.32         {d7[0]},  [r0,:32], r1
>> +        vst1.32         {d7[1]},  [r0,:32], r1
>> +.endm
>> +        load_add_store  q8,  q9,  q10, q11
>> +        load_add_store  q12, q13, q14, q15
>> +.purgem load_add_store
>> +
>> +        bx              lr
>> +endfunc
>> +.endm
>> +
>> +itxfm16_1d_funcs idct
>> +itxfm16_1d_funcs iadst
>> +
>> +.macro itxfm_func16x16 txfm1, txfm2
>> +function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>> +.ifc \txfm1,idct
>> +.ifc \txfm2,idct
>> +        cmp             r3,  #1
>> +        beq             idct16x16_dc_add_neon
>> +.endif
>> +.endif
>> +1:
>
> unused label

Removed

>> +        push            {r4-r7,lr}
>> +.ifc \txfm1,iadst
>> +        vpush           {q4-q7}
>> +.else
>> +.ifc \txfm2,iadst
>> +        vpush           {q4-q7}
>> +.endif
>> +.endif
>> +        mov             r7,  sp
>> +
>> +        @ Align the stack, allocate a temp buffer
>> +T       mov             r12, sp
>> +T       bic             r12, r12, #15
>> +T       sub             r12, r12, #512
>> +T       mov             sp,  r12
>> +A       bic             sp,  sp,  #15
>> +A       sub             sp,  sp,  #512
>> +
>> +        mov             r4,  r0
>> +        mov             r5,  r1
>> +        mov             r6,  r2
>> +
>> +.ifc \txfm1,idct
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {q0-q1}, [r12,:128]
>> +.endif
>> +
>> +.irp i, 0, 4, 8, 12
>> +        add             r0,  sp,  #(\i*32)
>> +        add             r2,  r6,  #(\i*2)
>> +        mov             r3,  #\i
>> +        bl              \txfm1\()16_1d_4x16_pass1_neon
>> +.endr
>> +.ifc \txfm2,idct
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {q0-q1}, [r12,:128]
>> +.endif
>> +.irp i, 0, 4, 8, 12
>> +        add             r0,  r4,  #(\i)
>> +        mov             r1,  r5
>> +        add             r2,  sp,  #(\i*2)
>> +        mov             r3,  #\i
>> +        bl              \txfm2\()16_1d_4x16_pass2_neon
>> +.endr
>> +
>> +        mov             sp,  r7
>> +.ifc \txfm1,iadst
>> +        vpop            {q4-q7}
>> +.else
>> +.ifc \txfm2,iadst
>> +        vpop            {q4-q7}
>> +.endif
>> +.endif
>> +        pop             {r4-r7,pc}
>> +endfunc
>> +.endm
>> +
>> +itxfm_func16x16 idct,  idct
>> +itxfm_func16x16 iadst, idct
>> +itxfm_func16x16 idct,  iadst
>> +itxfm_func16x16 iadst, iadst
>> +
>> +
>> +function idct32x32_dc_add_neon
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {d0}, [r12,:64]
>> +
>> +        vmov.i16        q2, #0
>> +
>> +        vld1.16         {d16[]},   [r2]
>
> alignment

Done

>> +        vmull.s16       q8,  d16, d0[0]
>> +        vrshrn.s32      d16, q8,  #14
>> +        vmull.s16       q8,  d16, d0[0]
>> +        vrshrn.s32      d16, q8,  #14
>> +        vdup.16         q8,  d16[0]
>> +        vst1.16         {d4[0]}, [r2]
>
> ditto

Done

>> +
>> +        vrshr.s16       q8,  q8,  #6
>> +
>> +        mov             r12, #32
>> +1:
>> +        @ Loop to add the constant from q8 into all 32x32 outputs
>> +        vld1.8          {q2-q3},  [r0,:128]
>> +        vaddw.u8        q10, q8,  d4
>> +        vaddw.u8        q11, q8,  d5
>> +        vaddw.u8        q12, q8,  d6
>> +        vaddw.u8        q13, q8,  d7
>> +        vqmovun.s16     d4,  q10
>> +        vqmovun.s16     d5,  q11
>> +        vqmovun.s16     d6,  q12
>> +        vqmovun.s16     d7,  q13
>> +        vst1.8          {q2-q3},  [r0,:128], r1
>> +        subs            r12, r12, #1
>> +        bne             1b
>> +
>> +        bx              lr
>> +endfunc
>> +
>> +.macro idct32_odd
>> +        movrel          r12, idct_coeffs
>> +        add             r12, r12, #32
>> +        vld1.16         {q0-q1}, [r12,:128]
>> +
>> +        mbutterfly      d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a
>> +        mbutterfly      d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a
>> +        mbutterfly      d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a
>> +        mbutterfly      d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a
>> +        mbutterfly      d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a
>> +        mbutterfly      d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a
>> +        mbutterfly      d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a
>> +        mbutterfly      d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a
>> +
>> +        sub             r12, r12, #32
>> +        vld1.16         {q0}, [r12,:128]
>> +
>> +        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
>> +        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
>> +        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
>> +        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
>> +        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
>> +        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
>> +        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
>> +        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
>> +
>> +        mbutterfly      d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a
>> +        mbutterfly_neg  d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = t18a
>> +        mbutterfly      d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a
>> +        mbutterfly_neg  d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = t22a
>> +
>> +        butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
>> +        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
>> +        butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
>> +        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
>> +        butterfly       d4,  d28, d28, d30 @ d4  = t24a, d28 = t27a
>> +        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
>> +        butterfly       d7,  d29, d29, d31 @ d7  = t31a, d29 = t28a
>> +        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
>> +
>> +        mbutterfly      d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a
>> +        mbutterfly      d29, d5,  d0[1], d0[2], q12, q15 @ d29 = t19,  d5  = t28
>> +        mbutterfly_neg  d28, d6,  d0[1], d0[2], q12, q15 @ d28 = t27,  d6  = t20
>> +        mbutterfly_neg  d26, d21, d0[1], d0[2], q12, q15 @ d26 = t26a, d21 = t21a
>> +
>> +        butterfly       d31, d24, d7,  d4  @ d31 = t31,  d24 = t24
>> +        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
>> +        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
>> +        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
>> +        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
>> +        butterfly_r     d27, d28, d5,  d28 @ d27 = t27a, d28 = t28a
>> +        butterfly       d4,  d26, d20, d26 @ d4  = t29,  d26 = t26
>> +        butterfly       d19, d20, d29, d6  @ d19 = t19a, d20 = t20
>> +        vmov            d29, d4            @ d29 = t29
>> +
>> +        mbutterfly0     d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27,  d20 = t20
>> +        mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a
>> +        mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
>> +        mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
>> +.endm
>> +
>> +@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
>> +@ We don't have register space to do a single pass IDCT of 4x32 though,
>> +@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
>> +@ a normal IDCT16 with every other input component (the even ones, with
>> +@ each output written twice), followed by a separate 16-point IDCT
>> +@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
>> +@ r0 = dst (temp buffer)
>> +@ r1 = unused
>> +@ r2 = src
>> +function idct32_1d_4x32_pass1_neon
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {q0-q1}, [r12,:128]
>> +
>> +        @ Double stride of the input, since we only read every other line
>> +        mov             r12, #128
>> +        vmov.s16        d4, #0
>> +
>> +        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> +        vld1.16         {d\i}, [r2,:64]
>> +        vst1.16         {d4},  [r2,:64], r12
>> +.endr
>> +
>> +        idct16
>> +
>> +        @ Do four 4x4 transposes. Originally, d16-d31 contain the
>> +        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
>> +        @ contain the transposed 4x4 blocks.
>> +        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
>> +        @ Store the registers a, b, c, d horizontally, followed
>> +        @ by the same registers d, c, b, a mirrored.
>> +.macro store_rev a, b, c, d
>> +.irp i, \a, \b, \c, \d
>> +        vst1.16         {d\i}, [r0,:64]!
>> +        vrev64.16       d\i, d\i
>> +.endr
>> +.irp i, \d, \c, \b, \a
>> +        vst1.16         {d\i}, [r0,:64]!
>> +.endr
>> +.endm
>> +        store_rev       16, 20, 24, 28
>> +        store_rev       17, 21, 25, 29
>> +        store_rev       18, 22, 26, 30
>> +        store_rev       19, 23, 27, 31
>> +        sub             r0,  r0,  #256
>> +.purgem store_rev
>> +
>> +        @ Move r2 back to the start of the input, and move
>> +        @ to the first odd row
>> +        sub             r2,  r2,  r12, lsl #4
>> +        add             r2,  r2,  #64
>> +
>> +        vmov.s16        d4, #0
>> +        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> +        vld1.16         {d\i}, [r2,:64]
>> +        vst1.16         {d4},  [r2,:64], r12
>> +.endr
>> +
>> +        idct32_odd
>> +
>> +        transpose16_q_2x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
>> +
>> +        @ Store the registers a, b, c, d horizontally,
>> +        @ adding into the output first, and then mirrored, subtracted
>> +        @ from the output.
>> +.macro store_rev a, b, c, d
>> +.irp i, \a, \b, \c, \d
>> +        vld1.16         {d4},  [r0,:64]
>> +        vadd.s16        d4, d4, d\i
>> +        vst1.16         {d4},  [r0,:64]!
>> +        vrev64.16       d\i, d\i
>> +.endr
>> +.irp i, \d, \c, \b, \a
>> +        vld1.16         {d4},  [r0,:64]
>> +        vsub.s16        d4, d4, d\i
>> +        vst1.16         {d4},  [r0,:64]!
>> +.endr
>> +.endm
>> +
>> +        store_rev 31, 27, 23, 19
>> +        store_rev 30, 26, 22, 18
>> +        store_rev 29, 25, 21, 17
>> +        store_rev 28, 24, 20, 16
>> +.purgem store_rev
>> +        bx              lr
>> +endfunc
>> +
>> +@ This is mostly the same as 4x32_pass1, but without the transpose,
>> +@ and it uses the source as temp buffer between the two idct passes, and
>> +@ adds into the destination.
>> +@ r0 = dst
>> +@ r1 = dst stride
>> +@ r2 = src (temp buffer)
>> +function idct32_1d_4x32_pass2_neon
>> +        movrel          r12, idct_coeffs
>> +        vld1.16         {q0-q1}, [r12,:128]
>> +
>> +        mov             r12, #128
>> +        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> +        vld1.16         {d\i}, [r2,:64], r12
>> +.endr
>> +        sub             r2,  r2,  r12, lsl #4
>> +
>> +        idct16
>> +
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> +        vst1.16         {d\i}, [r2,:64], r12
>> +.endr
>> +
>> +        sub             r2,  r2,  r12, lsl #4
>> +        add             r2,  r2,  #64
>> +
>> +        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> +        vld1.16         {d\i}, [r2,:64], r12
>> +.endr
>> +        sub             r2,  r2,  r12, lsl #4
>> +        sub             r2,  r2,  #64
>> +
>> +        idct32_odd
>> +
>> +        mov             r12,  #128
>> +.macro load_acc_store a, b, c, d, neg=0
>> +        vld1.16         {d4},  [r2,:64], r12
>> +        vld1.16         {d5},  [r2,:64], r12
>> +.if \neg == 0
>> +        vadd.s16        d4, d4, d\a
>> +        vld1.16         {d6},  [r2,:64], r12
>> +        vadd.s16        d5, d5, d\b
>> +        vld1.16         {d7},  [r2,:64], r12
>> +        vadd.s16        d6, d6, d\c
>> +        vadd.s16        d7, d7, d\d
>> +.else
>> +        vsub.s16        d4, d4, d\a
>> +        vld1.16         {d6},  [r2,:64], r12
>> +        vsub.s16        d5, d5, d\b
>> +        vld1.16         {d7},  [r2,:64], r12
>> +        vsub.s16        d6, d6, d\c
>> +        vsub.s16        d7, d7, d\d
>> +.endif
>> +        vld1.32         {d2[]},   [r0,:32], r1
>> +        vld1.32         {d2[1]},  [r0,:32], r1
>> +        vrshr.s16       q2, q2, #6
>> +        vld1.32         {d3[]},   [r0,:32], r1
>> +        vrshr.s16       q3, q3, #6
>> +        vld1.32         {d3[1]},  [r0,:32], r1
>> +        sub             r0,  r0,  r1, lsl #2
>> +        vaddw.u8        q2,  q2,  d2
>> +        vaddw.u8        q3,  q3,  d3
>> +        vqmovun.s16     d4,  q2
>> +        vqmovun.s16     d5,  q3
>> +        vst1.32         {d4[0]},  [r0,:32], r1
>> +        vst1.32         {d4[1]},  [r0,:32], r1
>> +        vst1.32         {d5[0]},  [r0,:32], r1
>> +        vst1.32         {d5[1]},  [r0,:32], r1
>> +.endm
>> +        load_acc_store  31, 30, 29, 28
>> +        load_acc_store  27, 26, 25, 24
>> +        load_acc_store  23, 22, 21, 20
>> +        load_acc_store  19, 18, 17, 16
>> +        sub             r2,  r2,  r12
>> +        neg             r12, r12
>> +        load_acc_store  16, 17, 18, 19, 1
>> +        load_acc_store  20, 21, 22, 23, 1
>> +        load_acc_store  24, 25, 26, 27, 1
>> +        load_acc_store  28, 29, 30, 31, 1
>> +.purgem load_acc_store
>> +        bx              lr
>> +endfunc
>> +
>> +function ff_vp9_idct_idct_32x32_add_neon, export=1
>> +        cmp             r3,  #1
>> +        beq             idct32x32_dc_add_neon
>> +1:
>
> unused label
>
>> +        push            {r4-r7,lr}
>> +        vpush           {q4-q7}
>> +        mov             r7,  sp
>> +
>> +        @ Align the stack, allocate a temp buffer
>> +T       mov             r12, sp
>> +T       bic             r12, r12, #15
>> +T       sub             r12, r12, #2048
>> +T       mov             sp,  r12
>> +A       bic             sp,  sp,  #15
>> +A       sub             sp,  sp,  #2048
>> +
>> +        mov             r4,  r0
>> +        mov             r5,  r1
>> +        mov             r6,  r2
>> +
>> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
>> +        add             r0,  sp,  #(\i*64)
>> +        add             r2,  r6,  #(\i*2)
>> +        bl              idct32_1d_4x32_pass1_neon
>> +.endr
>> +.irp i, 0, 4, 8, 12, 16, 20, 24, 28
>> +        add             r0,  r4,  #(\i)
>> +        mov             r1,  r5
>> +        add             r2,  sp,  #(\i*2)
>> +        bl              idct32_1d_4x32_pass2_neon
>> +.endr
>> +
>> +        mov             sp,  r7
>> +        vpop            {q4-q7}
>> +        pop             {r4-r7,pc}
>> +endfunc
>
> patch ok with nits fixed. instruction rescheduling can be done in a
> separate commit.

Ok, thanks! Will push in a little while.

// Martin

Patch

diff --git a/libavcodec/arm/Makefile b/libavcodec/arm/Makefile
index 2638230..01630ac 100644
--- a/libavcodec/arm/Makefile
+++ b/libavcodec/arm/Makefile
@@ -139,4 +139,5 @@  NEON-OBJS-$(CONFIG_RV40_DECODER)       += arm/rv34dsp_neon.o            \
                                           arm/rv40dsp_neon.o
 NEON-OBJS-$(CONFIG_VORBIS_DECODER)     += arm/vorbisdsp_neon.o
 NEON-OBJS-$(CONFIG_VP6_DECODER)        += arm/vp6dsp_neon.o
-NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9mc_neon.o
+NEON-OBJS-$(CONFIG_VP9_DECODER)        += arm/vp9itxfm_neon.o           \
+                                          arm/vp9mc_neon.o
diff --git a/libavcodec/arm/vp9dsp_init_arm.c b/libavcodec/arm/vp9dsp_init_arm.c
index db8c683..2ba2644 100644
--- a/libavcodec/arm/vp9dsp_init_arm.c
+++ b/libavcodec/arm/vp9dsp_init_arm.c
@@ -94,7 +94,7 @@  define_8tap_2d_funcs(8)
 define_8tap_2d_funcs(4)
 
 
-av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
+static av_cold void vp9dsp_mc_init_arm(VP9DSPContext *dsp)
 {
     int cpu_flags = av_get_cpu_flags();
 
@@ -138,3 +138,52 @@  av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
         init_mc_funcs_dirs(4, 4);
     }
 }
+
+#define define_itxfm(type_a, type_b, sz)                                   \
+void ff_vp9_##type_a##_##type_b##_##sz##x##sz##_add_neon(uint8_t *_dst,    \
+                                                         ptrdiff_t stride, \
+                                                         int16_t *_block, int eob)
+
+#define define_itxfm_funcs(sz)      \
+    define_itxfm(idct,  idct,  sz); \
+    define_itxfm(iadst, idct,  sz); \
+    define_itxfm(idct,  iadst, sz); \
+    define_itxfm(iadst, iadst, sz)
+
+define_itxfm_funcs(4);
+define_itxfm_funcs(8);
+define_itxfm_funcs(16);
+define_itxfm(idct, idct, 32);
+define_itxfm(iwht, iwht, 4);
+
+
+static av_cold void vp9dsp_itxfm_init_arm(VP9DSPContext *dsp)
+{
+    int cpu_flags = av_get_cpu_flags();
+
+    if (have_neon(cpu_flags)) {
+#define init_itxfm(tx, sz)                                             \
+    dsp->itxfm_add[tx][DCT_DCT]   = ff_vp9_idct_idct_##sz##_add_neon;  \
+    dsp->itxfm_add[tx][DCT_ADST]  = ff_vp9_iadst_idct_##sz##_add_neon; \
+    dsp->itxfm_add[tx][ADST_DCT]  = ff_vp9_idct_iadst_##sz##_add_neon; \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_iadst_iadst_##sz##_add_neon
+
+#define init_idct(tx, nm)           \
+    dsp->itxfm_add[tx][DCT_DCT]   = \
+    dsp->itxfm_add[tx][ADST_DCT]  = \
+    dsp->itxfm_add[tx][DCT_ADST]  = \
+    dsp->itxfm_add[tx][ADST_ADST] = ff_vp9_##nm##_add_neon
+
+        init_itxfm(TX_4X4, 4x4);
+        init_itxfm(TX_8X8, 8x8);
+        init_itxfm(TX_16X16, 16x16);
+        init_idct(TX_32X32, idct_idct_32x32);
+        init_idct(4, iwht_iwht_4x4);
+    }
+}
+
+av_cold void ff_vp9dsp_init_arm(VP9DSPContext *dsp)
+{
+    vp9dsp_mc_init_arm(dsp);
+    vp9dsp_itxfm_init_arm(dsp);
+}
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
new file mode 100644
index 0000000..96dc3a9
--- /dev/null
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -0,0 +1,1166 @@ 
+/*
+ * Copyright (c) 2016 Google Inc.
+ *
+ * This file is part of Libav.
+ *
+ * Libav is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * Libav is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with Libav; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ */
+
+#include "libavutil/arm/asm.S"
+#include "neon.S"
+
+const itxfm4_coeffs, align=4
+        .short  11585, 6270, 15137, 0
+iadst4_coeffs:
+        .short  5283, 15212, 9929, 13377
+endconst
+
+const iadst8_coeffs, align=4
+        .short  16305, 1606, 14449, 7723, 10394, 12665, 4756, 15679
+endconst
+
+const idct_coeffs, align=4
+        .short  11585, 6270, 15137, 3196, 16069, 13623, 9102, 1606
+        .short  16305, 12665, 10394, 7723, 14449, 15679, 4756, 0
+        .short  804, 16364, 12140, 11003, 7005, 14811, 15426, 5520
+        .short  3981, 15893, 14053, 8423, 9760, 13160, 16207, 2404
+endconst
+
+const iadst16_coeffs, align=4
+        .short  16364, 804, 15893, 3981, 14811, 7005, 13160, 9760
+        .short  11003, 12140, 8423, 14053, 5520, 15426, 2404, 16207
+endconst
+
+@ Do four 4x4 transposes, using q registers for the subtransposes that don't
+@ need to address the individual d registers.
+@ r0,r1 == rq0, r2,r3 == rq1, etc
+.macro transpose16_q_2x_4x4 rq0, rq1, rq2, rq3, rq4, rq5, rq6, rq7, r0, r1, r2, r3, r4, r5, r6, r7, r8, r9, r10, r11, r12, r13, r14, r15
+        vtrn.32          \rq0, \rq1
+        vtrn.32          \rq2, \rq3
+        vtrn.32          \rq4, \rq5
+        vtrn.32          \rq6, \rq7
+        vtrn.16          \r0,  \r1
+        vtrn.16          \r2,  \r3
+        vtrn.16          \r4,  \r5
+        vtrn.16          \r6,  \r7
+        vtrn.16          \r8,  \r9
+        vtrn.16          \r10, \r11
+        vtrn.16          \r12, \r13
+        vtrn.16          \r14, \r15
+.endm
+
+@ out1 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out2 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ in/out are d registers
+.macro mbutterfly0 out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4, neg=0
+        vadd.s16        \tmpd1, \in1,  \in2
+        vsub.s16        \tmpd2, \in1,  \in2
+        vmull.s16       \tmpq3, \tmpd1, d0[0]
+        vmull.s16       \tmpq4, \tmpd2, d0[0]
+.if \neg > 0
+        vneg.s32        \tmpq3, \tmpq3
+.endif
+        vrshrn.s32      \out1, \tmpq3, #14
+        vrshrn.s32      \out2, \tmpq4, #14
+.endm
+@ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
+@ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
+@ Same as mbutterfly0, but with input being 2 q registers, output
+@ being 4 d registers.
+@ This works with either 4 or 6 temporary q registers (tmpq5, tmpq6 optional).
+.macro dmbutterfly0 out1, out2, out3, out4, in1, in2, tmpq1, tmpq2, tmpd11, tmpd12, tmpd21, tmpd22, tmpq3, tmpq4, tmpq5, tmpq6
+        vadd.s16        \tmpq1, \in1,  \in2
+        vsub.s16        \tmpq2, \in1,  \in2
+        vmull.s16       \tmpq3, \tmpd11, d0[0]
+        vmull.s16       \tmpq4, \tmpd12, d0[0]
+.ifb \tmpq5
+        vrshrn.s32      \out1, \tmpq3, #14
+        vrshrn.s32      \out2, \tmpq4, #14
+        vmull.s16       \tmpq3, \tmpd21, d0[0]
+        vmull.s16       \tmpq4, \tmpd22, d0[0]
+        vrshrn.s32      \out3, \tmpq3, #14
+        vrshrn.s32      \out4, \tmpq4, #14
+.else
+        vmull.s16       \tmpq5, \tmpd21, d0[0]
+        vmull.s16       \tmpq6, \tmpd22, d0[0]
+        vrshrn.s32      \out1, \tmpq3, #14
+        vrshrn.s32      \out2, \tmpq4, #14
+        vrshrn.s32      \out3, \tmpq5, #14
+        vrshrn.s32      \out4, \tmpq6, #14
+.endif
+.endm
+@ out1 = in1 * coef1 - in2 * coef2
+@ out2 = in1 * coef2 + in2 * coef1
+@ out are 2 q registers, in are 2 d registers
+.macro mbutterfly_l out1, out2, in1, in2, coef1, coef2
+        vmull.s16       \out1, \in1, \coef1
+        vmlsl.s16       \out1, \in2, \coef2
+        vmull.s16       \out2, \in1, \coef2
+        vmlal.s16       \out2, \in2, \coef1
+.endm
+@ out1,out2 = in1,in2 * coef1 - in3,in4 * coef2
+@ out3,out4 = in1,in2 * coef2 + in3,in4 * coef1
+@ out are 4 q registers, in are 4 d registers
+.macro dmbutterfly_l out1, out2, out3, out4, in1, in2, in3, in4, coef1, coef2
+        vmull.s16       \out1, \in1, \coef1
+        vmull.s16       \out2, \in2, \coef1
+        vmull.s16       \out3, \in1, \coef2
+        vmull.s16       \out4, \in2, \coef2
+        vmlsl.s16       \out1, \in3, \coef2
+        vmlsl.s16       \out2, \in4, \coef2
+        vmlal.s16       \out3, \in3, \coef1
+        vmlal.s16       \out4, \in4, \coef1
+.endm
+@ in1 = (in1 * coef1 - in2 * coef2 + (1 << 13)) >> 14
+@ in2 = (in1 * coef2 + in2 * coef1 + (1 << 13)) >> 14
+@ in are 2 d registers, tmp are 2 q registers
+.macro mbutterfly in1, in2, coef1, coef2, tmp1, tmp2, neg=0
+        mbutterfly_l    \tmp1, \tmp2, \in1, \in2, \coef1, \coef2
+.if \neg > 0
+        vneg.s32        \tmp2, \tmp2
+.endif
+        vrshrn.s32      \in1, \tmp1,  #14
+        vrshrn.s32      \in2, \tmp2,  #14
+.endm
+@ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
+@ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
+@ inout are 4 d registers, tmp are 4 q registers
+.macro dmbutterfly inout1, inout2, inout3, inout4, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        dmbutterfly_l   \tmp1, \tmp2, \tmp3, \tmp4, \inout1, \inout2, \inout3, \inout4, \coef1, \coef2
+        vrshrn.s32      \inout1, \tmp1,  #14
+        vrshrn.s32      \inout2, \tmp2,  #14
+        vrshrn.s32      \inout3, \tmp3,  #14
+        vrshrn.s32      \inout4, \tmp4,  #14
+.endm
+.macro mbutterfly_neg in1, in2, coef1, coef2, tmp1, tmp2
+        mbutterfly      \in1, \in2, \coef1, \coef2, \tmp1, \tmp2, 1
+.endm
+@ out1 = in1 + in2
+@ out2 = in1 - in2
+.macro butterfly out1, out2, in1, in2
+        vadd.s16        \out1, \in1, \in2
+        vsub.s16        \out2, \in1, \in2
+.endm
+@ out1 = in1 - in2
+@ out2 = in1 + in2
+.macro butterfly_r out1, out2, in1, in2
+        vsub.s16        \out1, \in1, \in2
+        vadd.s16        \out2, \in1, \in2
+.endm
+@ out1 = (in1 + in2 + (1 << 13)) >> 14
+@ out2 = (in1 - in2 + (1 << 13)) >> 14
+@ out are 2 d registers, in are 2 q registers, tmp are 2 q registers
+.macro butterfly_n out1, out2, in1, in2, tmp1, tmp2
+        vadd.s32        \tmp1, \in1, \in2
+        vsub.s32        \tmp2, \in1, \in2
+        vrshrn.s32      \out1, \tmp1,  #14
+        vrshrn.s32      \out2, \tmp2,  #14
+.endm
+@ out1,out2 = (in1,in2 + in3,in4 + (1 << 13)) >> 14
+@ out3,out4 = (in1,in2 - in3,in4 + (1 << 13)) >> 14
+@ out are 4 d registers, in are 4 q registers, tmp are 4 q registers
+.macro dbutterfly_n out1, out2, out3, out4, in1, in2, in3, in4, tmp1, tmp2, tmp3, tmp4
+        vadd.s32        \tmp1, \in1, \in3
+        vadd.s32        \tmp2, \in2, \in4
+        vsub.s32        \tmp3, \in1, \in3
+        vsub.s32        \tmp4, \in2, \in4
+        vrshrn.s32      \out1, \tmp1,  #14
+        vrshrn.s32      \out2, \tmp2,  #14
+        vrshrn.s32      \out3, \tmp3,  #14
+        vrshrn.s32      \out4, \tmp4,  #14
+.endm
+
+
+.macro iwht4 c0, c1, c2, c3
+        vadd.i16        \c0,  \c0,  \c1
+        vsub.i16        d17,  \c2,  \c3
+        vsub.i16        d16,  \c0,  d17
+        vshr.s16        d16,  d16,  #1
+        vsub.i16        \c2,  d16,  \c1
+        vsub.i16        \c1,  d16,  \c3
+        vadd.i16        \c3,  d17,  \c2
+        vsub.i16        \c0,  \c0,  \c1
+.endm
+
+.macro idct4 c0, c1, c2, c3
+        vadd.i16        d16,  \c0,  \c2
+        vsub.i16        d17,  \c0,  \c2
+        vmull.s16       q11,  \c1,  d0[1]
+        vmull.s16       q12,  \c3,  d0[2]
+        vmull.s16       q13,  \c1,  d0[2]
+        vmull.s16       q14,  \c3,  d0[1]
+        vmull.s16       q9,   d16,  d0[0]
+        vmull.s16       q10,  d17,  d0[0]
+        vadd.i32        q13,  q13,  q14
+        vsub.i32        q11,  q11,  q12
+        vrshrn.s32      d16,  q9,   #14
+        vrshrn.s32      d19,  q13,  #14
+        vrshrn.s32      d17,  q10,  #14
+        vrshrn.s32      d18,  q11,  #14
+        vadd.i16        \c0,  d16,  d19
+        vadd.i16        \c1,  d17,  d18
+        vsub.i16        \c2,  d17,  d18
+        vsub.i16        \c3,  d16,  d19
+.endm
+
+.macro iadst4 c0, c1, c2, c3
+        vmull.s16       q10,  \c0,  d1[0]
+        vmlal.s16       q10,  \c2,  d1[1]
+        vmlal.s16       q10,  \c3,  d1[2]
+        vmull.s16       q11,  \c0,  d1[2]
+        vmlsl.s16       q11,  \c2,  d1[0]
+        vsub.s16        \c0,  \c0,  \c2
+        vmlsl.s16       q11,  \c3,  d1[1]
+        vadd.s16        \c0,  \c0,  \c3
+        vmull.s16       q13,  \c1,  d1[3]
+        vmull.s16       q12,  \c0,  d1[3]
+        vadd.s32        q14,  q10,  q13
+        vadd.s32        q1,   q11,  q13
+        vrshrn.s32      \c0,  q14,  #14
+        vadd.s32        q10,  q10,  q11
+        vrshrn.s32      \c1,  q1,   #14
+        vsub.s32        q10,  q10,  q13
+        vrshrn.s32      \c2,  q12,  #14
+        vrshrn.s32      \c3,  q10,  #14
+.endm
+
+@ The public functions in this file have got the following signature:
+@ void itxfm_add(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
+
+.macro itxfm_func4x4 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_4x4_add_neon, export=1
+.ifc \txfm1,\txfm2
+.ifc \txfm1,idct
+        movrel          r12, itxfm4_coeffs
+        vld1.16         {d0}, [r12,:64]
+.endif
+.ifc \txfm1,iadst
+        movrel          r12, iadst4_coeffs
+        vld1.16         {d1}, [r12,:64]
+.endif
+.else
+        movrel          r12, itxfm4_coeffs
+        vld1.16         {q0}, [r12,:128]
+.endif
+
+        vmov.i16        q15, #0
+.ifc \txfm1,idct
+.ifc \txfm2,idct
+        cmp             r3,  #1
+        bne             1f
+        @ DC-only for idct/idct
+        vld1.16         {d4[]},   [r2]
+        vmull.s16       q2,  d4,  d0[0]
+        vrshrn.s32      d4,  q2,  #14
+        vmull.s16       q2,  d4,  d0[0]
+        vrshrn.s32      d4,  q2,  #14
+        vst1.16         {d30[0]}, [r2]
+        vdup.16         q2,  d4[0]
+        vmov            q3,  q2
+        b               2f
+.endif
+.endif
+
+1:
+        vld1.16         {d4-d7},  [r2,:128]
+        vst1.16         {q15}, [r2,:128]!
+
+.ifc \txfm1,iwht
+        vshr.s16        q2,  q2,  #2
+        vshr.s16        q3,  q3,  #2
+.endif
+
+        \txfm1\()4      d4,  d5,  d6,  d7
+
+        vst1.16         {q15}, [r2,:128]!
+        @ Transpose 4x4 with 16 bit elements
+        vtrn.16         d4,  d5
+        vtrn.16         d6,  d7
+        vtrn.32         d4,  d6
+        vtrn.32         d5,  d7
+
+        \txfm2\()4      d4,  d5,  d6,  d7
+2:
+        vld1.32         {d0[]},   [r0,:32], r1
+        vld1.32         {d0[1]},  [r0,:32], r1
+.ifnc \txfm1,iwht
+        vrshr.s16       q2,  q2,  #4
+        vrshr.s16       q3,  q3,  #4
+.endif
+        vaddw.u8        q2,  q2,  d0
+        vld1.32         {d1[]},   [r0,:32], r1
+        vld1.32         {d1[1]},  [r0,:32], r1
+        vqmovun.s16     d0,  q2
+        sub             r0,  r0,  r1, lsl #2
+
+        vaddw.u8        q3,  q3,  d1
+        vst1.32         {d0[0]},  [r0,:32], r1
+        vqmovun.s16     d1,  q3
+
+        vst1.32         {d0[1]},  [r0,:32], r1
+        vst1.32         {d1[0]},  [r0,:32], r1
+        vst1.32         {d1[1]},  [r0,:32], r1
+
+        bx              lr
+endfunc
+.endm
+
+itxfm_func4x4 idct,  idct
+itxfm_func4x4 iadst, idct
+itxfm_func4x4 idct,  iadst
+itxfm_func4x4 iadst, iadst
+itxfm_func4x4 iwht,  iwht
+
+
+@ 8-point IDCT, operating lane-wise on q registers.
+@ In:  q8-q15 = the eight inputs
+@ Out: q8-q15 = out[0] .. out[7], in order (see the trailing comments on
+@      the final butterflies)
+@ Uses q2-q5 as scratch. The constants are taken from lanes of d0/d1, so
+@ the caller must have loaded idct_coeffs into q0 beforehand (the 8x8
+@ function below does this).
+.macro idct8
+        @ First stage: rotations producing t0a-t7a
+        dmbutterfly0    d16, d17, d24, d25, q8,  q12, q2, q4, d4, d5, d8, d9, q3, q2, q5, q4 @ q8 = t0a, q12 = t1a
+        dmbutterfly     d20, d21, d28, d29, d0[1], d0[2], q2,  q3,  q4,  q5 @ q10 = t2a, q14 = t3a
+        dmbutterfly     d18, d19, d30, d31, d0[3], d1[0], q2,  q3,  q4,  q5 @ q9  = t4a, q15 = t7a
+        dmbutterfly     d26, d27, d22, d23, d1[1], d1[2], q2,  q3,  q4,  q5 @ q13 = t5a, q11 = t6a
+
+        @ Second stage: plain butterflies, results partly in the scratch
+        @ registers q2-q5
+        butterfly       q2,  q14, q8,  q14 @ q2 = t0, q14 = t3
+        butterfly       q3,  q10, q12, q10 @ q3 = t1, q10 = t2
+        butterfly       q4,  q13, q9,  q13 @ q4 = t4, q13 = t5a
+        butterfly       q5,  q11, q15, q11 @ q5 = t7, q11 = t6a
+
+        butterfly       q8,  q15, q2,  q5  @ q8 = out[0], q15 = out[7]
+
+        @ q2 and q5 are free again after the butterfly above and are
+        @ reused for t6/t5
+        dmbutterfly0    d4,  d5,  d10, d11, q11, q13, q9,  q13, d18, d19, d26, d27, q2,  q5, q11, q12 @ q2 = t6, q5 = t5
+
+        butterfly       q11, q12, q14, q4  @ q11 = out[3], q12 = out[4]
+        butterfly       q9,  q14, q3,  q2  @ q9 = out[1],  q14 = out[6]
+        butterfly_r     q13, q10, q10, q5  @ q13 = out[5], q10 = out[2]
+.endm
+
+@ 8-point inverse ADST, the counterpart of idct8.
+@ In:  q8-q15 = the eight inputs
+@ Out: q8-q15 = out[0] .. out[7], in order
+@ Uses q2-q7 as scratch (which is why the 8x8 function pushes q4-q7
+@ whenever iadst is involved). Constants are read from d2/d3 (q1, loaded
+@ from iadst8_coeffs by the 8x8 function) and from d0[1]/d0[2], which
+@ belong to idct_coeffs in q0.
+@ out[7], out[3], out[1] and out[5] are first produced negated and then
+@ corrected with vneg.
+.macro iadst8
+        dmbutterfly_l   q4,  q5,  q2,  q3,  d30, d31, d16, d17, d2[1], d2[0] @ q4,q5  = t1a, q2,q3 = t0a
+        dmbutterfly_l   q8,  q15, q6,  q7,  d22, d23, d24, d25, d3[1], d3[0] @ q8,q15 = t5a, q6,q7 = t4a
+
+        dbutterfly_n    d22, d23, d4,  d5,  q2,  q3,  q6,  q7,  q11, q12, q2,  q3 @ q11 = t0, q2 = t4
+
+        dbutterfly_n    d24, d25, d6,  d7,  q4,  q5,  q8,  q15, q12, q3,  q6,  q7 @ q12 = t1, q3 = t5
+
+        dmbutterfly_l   q6,  q7,  q4,  q5,  d26, d27, d20, d21, d2[3], d2[2] @ q6,q7 = t3a, q4,q5 = t2a
+        dmbutterfly_l   q10, q13, q8,  q15, d18, d19, d28, d29, d3[3], d3[2] @ q10,q13 = t7a, q8,q15 = t6a
+
+        dbutterfly_n    d18, d19, d8,  d9,  q4,  q5,  q8,  q15, q9,  q14, q4, q5 @ q9 = t2, q4 = t6
+        dbutterfly_n    d16, d17, d12, d13, q6,  q7,  q10, q13, q8,  q15, q6, q7 @ q8 = t3, q6 = t7
+
+        butterfly       q15, q12, q12, q8 @ q15 = -out[7], q12 = t3
+        vneg.s16        q15, q15          @ q15 = out[7]
+        butterfly       q8,  q9,  q11, q9 @ q8 = out[0], q9 = t2
+
+        @ From here on the constants come from idct_coeffs (d0[1], d0[2])
+        dmbutterfly_l   q10, q11, q5,  q7,  d4,  d5,  d6,  d7,  d0[1], d0[2] @ q10,q11 = t5a, q5,q7 = t4a
+        dmbutterfly_l   q2,  q3,  q13, q14, d12, d13, d8,  d9,  d0[2], d0[1] @ q2,q3 = t6a, q13,q14 = t7a
+
+        dbutterfly_n    d28, d29, d8,  d9,  q10, q11, q13, q14, q4,  q6,  q10, q11 @ q14 = out[6], q4 = t7
+
+        dmbutterfly0    d22, d23, d24, d25, q9,  q12, q6, q13, d12, d13, d26, d27, q9, q10 @ q11 = -out[3], q12 = out[4]
+        vneg.s16        q11, q11      @ q11 = out[3]
+
+        dbutterfly_n    d18, d19, d4,  d5,  q5,  q7,  q2,  q3,  q9, q10, q2,  q3 @ q9 = -out[1], q2 = t6
+        vneg.s16        q9,  q9       @ q9 = out[1]
+
+        dmbutterfly0    d20, d21, d26, d27, q2,  q4,  q3, q5,  d6,  d7,  d10, d11, q6,  q7 @ q10 = out[2], q13 = -out[5]
+        vneg.s16        q13, q13      @ q13 = out[5]
+.endm
+
+
+@ Generate ff_vp9_<txfm1>_<txfm2>_8x8_add_neon.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = coefficients (cleared once they have been read)
+@ r3 = compared against 1 to select the DC-only shortcut (idct/idct only)
+.macro itxfm_func8x8 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
+        @ iadst8 borrows a few constants from idct_coeffs, so q0 is
+        @ loaded for every combination.
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+.ifc \txfm1\()_\txfm2,idct_idct
+        @ idct alone gets by with fewer scratch registers; q4-q5
+        @ are the only callee-saved ones touched.
+        vpush           {q4-q5}
+.else
+        @ As soon as iadst takes part, its coefficients go into q1
+        @ and q4-q7 are all used as scratch.
+        movrel          r12, iadst8_coeffs
+        vld1.16         {q1}, [r12,:128]
+        vpush           {q4-q7}
+.endif
+
+        @ q2-q3 = 0, stored over the coefficients after reading them
+        vmov.i16        q2, #0
+        vmov.i16        q3, #0
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        cmp             r3,  #1
+        bne             1f
+        @ DC-only for idct/idct: scale the first coefficient twice by
+        @ d0[0] and replicate the result into all of q8-q15.
+        vld1.16         {d16[]},   [r2]
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vdup.16         q8,  d16[0]
+.irp i, 9, 10, 11, 12, 13, 14, 15
+        vmov            q\i, q8
+.endr
+        vst1.16         {d4[0]}, [r2]
+        b               2f
+.endif
+1:
+        @ Load all 64 coefficients into q8-q15, then rewind r2 and
+        @ overwrite them with zeros.
+        vld1.16         {q8-q9},    [r2,:128]!
+        vld1.16         {q10-q11},  [r2,:128]!
+        vld1.16         {q12-q13},  [r2,:128]!
+        vld1.16         {q14-q15},  [r2,:128]!
+        sub             r2,  r2,  #128
+        vst1.16         {q2-q3}, [r2,:128]!
+        vst1.16         {q2-q3}, [r2,:128]!
+        vst1.16         {q2-q3}, [r2,:128]!
+        vst1.16         {q2-q3}, [r2,:128]!
+
+        @ First 1-D transform
+        \txfm1\()8
+
+        @ Transpose 8x8 with 16 bit elements
+        vswp            d17, d24
+        vswp            d19, d26
+        vswp            d21, d28
+        vswp            d23, d30
+        transpose16_4x4 q8, q9, q10, q11, q12, q13, q14, q15
+
+        @ Second 1-D transform
+        \txfm2\()8
+2:
+        @ Add into the destination: round each row by 5 bits, widen-add
+        @ the existing pixels (loaded into d4-d11) and narrow with
+        @ unsigned saturation. Loads, arithmetic and stores are
+        @ interleaved.
+        vld1.8          {d4},  [r0,:64], r1
+        vrshr.s16       q8,  q8,  #5
+        vld1.8          {d5},  [r0,:64], r1
+        vrshr.s16       q9,  q9,  #5
+        vld1.8          {d6},  [r0,:64], r1
+        vrshr.s16       q10, q10, #5
+        vaddw.u8        q8,  q8,  d4
+        vld1.8          {d7},  [r0,:64], r1
+        vrshr.s16       q11, q11, #5
+        vaddw.u8        q9,  q9,  d5
+        vld1.8          {d8},  [r0,:64], r1
+        vrshr.s16       q12, q12, #5
+        vaddw.u8        q10, q10, d6
+        vqmovun.s16     d4,  q8
+        vld1.8          {d9},  [r0,:64], r1
+        vrshr.s16       q13, q13, #5
+        vaddw.u8        q11, q11, d7
+        vqmovun.s16     d5,  q9
+        vld1.8          {d10}, [r0,:64], r1
+        vrshr.s16       q14, q14, #5
+        vaddw.u8        q12, q12, d8
+        vqmovun.s16     d6,  q10
+        vld1.8          {d11}, [r0,:64], r1
+        vrshr.s16       q15, q15, #5
+        vaddw.u8        q13, q13, d9
+        vqmovun.s16     d7,  q11
+        @ All eight rows are loaded; step r0 back to the first one
+        sub             r0,  r0,  r1, lsl #3
+
+
+        vst1.8          {d4},  [r0,:64], r1
+        vaddw.u8        q14, q14, d10
+        vst1.8          {d5},  [r0,:64], r1
+        vqmovun.s16     d8,  q12
+        vst1.8          {d6},  [r0,:64], r1
+        vaddw.u8        q15, q15, d11
+        vst1.8          {d7},  [r0,:64], r1
+        vqmovun.s16     d9,  q13
+        vst1.8          {d8},  [r0,:64], r1
+        vqmovun.s16     d10, q14
+        vst1.8          {d9},  [r0,:64], r1
+        vqmovun.s16     d11, q15
+
+        vst1.8          {d10}, [r0,:64], r1
+        vst1.8          {d11}, [r0,:64], r1
+
+        @ Restore exactly what was pushed on entry
+.ifc \txfm1\()_\txfm2,idct_idct
+        vpop            {q4-q5}
+.else
+        vpop            {q4-q7}
+.endif
+        bx              lr
+endfunc
+.endm
+
+itxfm_func8x8 idct,  idct
+itxfm_func8x8 iadst, idct
+itxfm_func8x8 idct,  iadst
+itxfm_func8x8 iadst, iadst
+
+
+@ DC-only 16x16 idct/idct: a single constant derived from the first
+@ coefficient is added to all 16x16 destination pixels.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = coefficients (only the first one is read, and then cleared)
+function idct16x16_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        @ Fetch the DC coefficient into every lane of d16 and clear it
+        @ in memory straight away.
+        vld1.16         {d16[]},   [r2]
+        vmov.i16        q2, #0
+        vst1.16         {d4[0]}, [r2]
+
+        @ Multiply by d0[0] with a rounding narrowing shift by 14, twice
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vdup.16         q8,  d16[0]
+
+        @ Final rounding of the value that gets added to the pixels
+        vrshr.s16       q8,  q8,  #6
+
+        mov             r12, #16
+1:
+        @ One row of 16 pixels per iteration: widen-add q8, then narrow
+        @ back to 8 bit with unsigned saturation. The NEON instructions
+        @ leave the flags set by subs untouched for the bne.
+        vld1.8          {q3},  [r0,:128]
+        subs            r12, r12, #1
+        vaddw.u8        q10, q8,  d6
+        vaddw.u8        q11, q8,  d7
+        vqmovun.s16     d6,  q10
+        vqmovun.s16     d7,  q11
+        vst1.8          {q3},  [r0,:128], r1
+        bne             1b
+
+        bx              lr
+endfunc
+
+@ 16-point IDCT on d registers (four 16 bit lanes each).
+@ In:  d16-d31 = the 16 inputs; q0-q1 must hold idct_coeffs (the lanes
+@      d0[1] .. d3[2] are used as constants)
+@ Out: d16-d31 = out[0] .. out[15], in order
+@ Uses q2-q3 (d4-d7) as additional scratch.
+.macro idct16
+        @ Input rotations producing t0a-t15a
+        mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
+        mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
+        mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
+        mbutterfly      d26, d22, d1[1], d1[2], q2,  q3  @ d26 = t5a,  d22 = t6a
+        mbutterfly      d17, d31, d1[3], d2[0], q2,  q3  @ d17 = t8a,  d31 = t15a
+        mbutterfly      d25, d23, d2[1], d2[2], q2,  q3  @ d25 = t9a,  d23 = t14a
+        mbutterfly      d21, d27, d2[3], d3[0], q2,  q3  @ d21 = t10a, d27 = t13a
+        mbutterfly      d29, d19, d3[1], d3[2], q2,  q3  @ d29 = t11a, d19 = t12a
+
+        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
+        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
+        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
+        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
+        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
+        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
+        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
+        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
+
+        @ q9 and q15 no longer hold live values here and serve as
+        @ temporaries for the multiplications
+        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15 @ d22 = t6a, d26 = t5a
+        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15 @ d23 = t9a, d25 = t14a
+        mbutterfly_neg  d27, d21, d0[1], d0[2], q9,  q15 @ d27 = t13a, d21 = t10a
+
+        butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = t7a
+        butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = t6
+        butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = t5
+        butterfly       d5,  d6,  d28, d6                @ d5  = t3a,  d6  = t4
+        butterfly       d20, d28, d16, d24               @ d20 = t8a,  d28 = t11a
+        butterfly       d24, d21, d23, d21               @ d24 = t9,   d21 = t10
+        butterfly       d23, d27, d25, d27               @ d23 = t14,  d27 = t13
+        butterfly       d25, d29, d29, d17               @ d25 = t15a, d29 = t12a
+
+        mbutterfly0     d27, d21, d27, d21, d16, d30, q8, q15 @ d27 = t13a, d21 = t10a
+        mbutterfly0     d29, d28, d29, d28, d16, d30, q8, q15 @ d29 = t12,  d28 = t11
+
+        @ Shuffle t11/t12/t13a into the registers the final butterflies
+        @ below expect them in
+        vswp            d27, d29                         @ d27 = t12, d29 = t13a
+        vswp            d28, d27                         @ d28 = t12, d27 = t11
+        butterfly       d16, d31, d18, d25               @ d16 = out[0], d31 = out[15]
+        butterfly       d17, d30, d19, d23               @ d17 = out[1], d30 = out[14]
+        butterfly_r     d25, d22, d22, d24               @ d25 = out[9], d22 = out[6]
+        butterfly       d23, d24, d7,  d20               @ d23 = out[7], d24 = out[8]
+        butterfly       d18, d29, d4,  d29               @ d18 = out[2], d29 = out[13]
+        butterfly       d19, d28, d5,  d28               @ d19 = out[3], d28 = out[12]
+        @ t10a has to move out of d21, which receives out[5] below
+        vmov            d4,  d21                         @ d4  = t10a
+        butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = out[11]
+        butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = out[10]
+.endm
+
+@ 16-point inverse ADST on d registers (four 16 bit lanes each).
+@ In:  d16-d31 = the 16 inputs
+@ Out: d16-d31 = out[0] .. out[15], in order
+@ The macro loads its own constants: iadst16_coeffs into q0-q1 at the
+@ start, and later idct_coeffs into q0. It clobbers r12 and q0-q7; q1 in
+@ particular ends up holding intermediate values (d2/d3), so code that
+@ needs idct_coeffs in q0-q1 afterwards has to reload them.
+.macro iadst16
+        movrel          r12, iadst16_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+        mbutterfly_l    q3,  q2,  d31, d16, d0[1], d0[0] @ q3  = t1,   q2  = t0
+        mbutterfly_l    q5,  q4,  d23, d24, d2[1], d2[0] @ q5  = t9,   q4  = t8
+        butterfly_n     d31, d24, q3,  q5,  q6,  q5      @ d31 = t1a,  d24 = t9a
+        mbutterfly_l    q7,  q6,  d29, d18, d0[3], d0[2] @ q7  = t3,   q6  = t2
+        butterfly_n     d16, d23, q2,  q4,  q3,  q4      @ d16 = t0a,  d23 = t8a
+
+        mbutterfly_l    q3,  q2,  d21, d26, d2[3], d2[2] @ q3  = t11,  q2  = t10
+        butterfly_n     d29, d26, q7,  q3,  q4,  q3      @ d29 = t3a,  d26 = t11a
+        mbutterfly_l    q5,  q4,  d27, d20, d1[1], d1[0] @ q5  = t5,   q4  = t4
+        butterfly_n     d18, d21, q6,  q2,  q3,  q2      @ d18 = t2a,  d21 = t10a
+
+        mbutterfly_l    q7,  q6,  d19, d28, d3[1], d3[0] @ q7  = t13,  q6  = t12
+        butterfly_n     d20, d28, q5,  q7,  q2,  q7      @ d20 = t5a,  d28 = t13a
+        mbutterfly_l    q3,  q2,  d25, d22, d1[3], d1[2] @ q3  = t7,   q2  = t6
+        butterfly_n     d27, d19, q4,  q6,  q5,  q6      @ d27 = t4a,  d19 = t12a
+
+        mbutterfly_l    q5,  q4,  d17, d30, d3[3], d3[2] @ q5  = t15,  q4  = t14
+        @ The last iadst16 constant has been consumed above; the
+        @ remaining stages take their constants from idct_coeffs.
+        movrel          r12, idct_coeffs
+        vld1.16         {q0}, [r12,:128]
+        butterfly_n     d22, d30, q3,  q5,  q6,  q5      @ d22 = t7a,  d30 = t15a
+        mbutterfly_l    q7,  q6,  d23, d24, d0[3], d1[0] @ q7  = t9,   q6  = t8
+        butterfly_n     d25, d17, q2,  q4,  q3,  q4      @ d25 = t6a,  d17 = t14a
+
+        mbutterfly_l    q2,  q3,  d28, d19, d1[0], d0[3] @ q2  = t12,  q3  = t13
+        butterfly_n     d23, d19, q6,  q2,  q4,  q2      @ d23 = t8a,  d19 = t12a
+        mbutterfly_l    q5,  q4,  d21, d26, d1[1], d1[2] @ q5  = t11,  q4  = t10
+        butterfly_r     d4,  d27, d16, d27               @ d4  = t4,   d27 = t0
+        butterfly_n     d24, d28, q7,  q3,  q6,  q3      @ d24 = t9a,  d28 = t13a
+
+        mbutterfly_l    q6,  q7,  d30, d17, d1[2], d1[1] @ q6  = t14,  q7  = t15
+        butterfly_r     d5,  d20, d31, d20               @ d5  = t5,   d20 = t1
+        butterfly_n     d21, d17, q4,  q6,  q3,  q6      @ d21 = t10a, d17 = t14a
+        butterfly_n     d26, d30, q5,  q7,  q4,  q7      @ d26 = t11a, d30 = t15a
+
+        butterfly_r     d6,  d25, d18, d25               @ d6  = t6,   d25 = t2
+        butterfly_r     d7,  d22, d29, d22               @ d7  = t7,   d22 = t3
+
+        mbutterfly_l    q5,  q4,  d19, d28, d0[1], d0[2] @ q5  = t13,  q4  = t12
+        mbutterfly_l    q6,  q7,  d30, d17, d0[2], d0[1] @ q6  = t14,  q7  = t15
+
+        butterfly_n     d18, d30, q4,  q6,  q8,  q6      @ d18 = out[2],   d30 = t14a
+        butterfly_n     d29, d17, q5,  q7,  q6,  q7      @ d29 = -out[13], d17 = t15a
+        vneg.s16        d29, d29                         @ d29 = out[13]
+
+        mbutterfly_l    q5,  q4,  d4,  d5,  d0[1], d0[2] @ q5  = t5a,  q4  = t4a
+        mbutterfly_l    q6,  q7,  d7,  d6,  d0[2], d0[1] @ q6  = t6a,  q7  = t7a
+
+        @ out[0] and -out[1] are parked in d2/d3 (q1, whose iadst16
+        @ constants are no longer needed) until the end of the macro
+        butterfly       d2,  d6,  d27, d25               @ d2 = out[0], d6 = t2a
+        butterfly       d3,  d7,  d23, d21               @ d3 =-out[1], d7 = t10
+
+        butterfly_n     d19, d31, q4,  q6,  q2,  q4      @ d19 = -out[3],  d31 = t6
+        vneg.s16        d19, d19                         @ d19 = out[3]
+        butterfly_n     d28, d16, q5,  q7,  q2,  q5      @ d28 = out[12],  d16 = t7
+
+        @ Likewise -out[15] and out[14] wait in d5/d4
+        butterfly       d5,  d8,  d20, d22               @ d5 =-out[15],d8 = t3a
+        butterfly       d4,  d9,  d24, d26               @ d4 = out[14],d9 = t11
+
+        mbutterfly0     d23, d24, d6,  d8,  d10, d11, q6,  q7, 1 @ d23 = out[7], d24 = out[8]
+        mbutterfly0     d20, d27, d16, d31, d10, d11, q6,  q7    @ d20 = out[4], d27 = out[11]
+        mbutterfly0     d22, d25, d9,  d7,  d10, d11, q6,  q7    @ d22 = out[6], d25 = out[9]
+        mbutterfly0     d21, d26, d30, d17, d10, d11, q6,  q7, 1 @ d21 = out[5], d26 = out[10]
+
+        @ Move the parked outputs into their final registers, fixing
+        @ the sign where needed
+        vneg.s16        d31, d5                          @ d31 = out[15]
+        vneg.s16        d17, d3                          @ d17 = out[1]
+
+        vmov            d16, d2
+        vmov            d30, d4
+.endm
+
+@ Generate the two 1-D helper functions <txfm>16_1d_4x16_pass1_neon and
+@ <txfm>16_1d_4x16_pass2_neon used by the 16x16 functions.
+.macro itxfm16_1d_funcs txfm
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ transpose into a horizontal 16x4 slice and store.
+@ The source coefficients are overwritten with zeros as they are read.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+@ r3 = slice offset
+function \txfm\()16_1d_4x16_pass1_neon
+        @ r12 = source row stride in bytes (16 coefficients of 16 bit)
+        mov             r12, #32
+        @ d4 = zeros for clearing the source; .i16 is the data type
+        @ used for zeroing vectors everywhere else in this file
+        vmov.i16        q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        \txfm\()16
+
+        @ Do four 4x4 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+        @ contain the transposed 4x4 blocks.
+        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the transposed 4x4 blocks horizontally.
+        cmp             r3,  #12
+        beq             1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        bx              lr
+1:
+        @ Special case: For the last input column (r3 == 12),
+        @ which would be stored as the last row in the temp buffer,
+        @ don't store the first 4x4 block, but keep it in registers
+        @ for the first slice of the second pass (where it is the
+        @ last 4x4 block).
+        @ Each "add r0, r0, #8" skips the 8 bytes that the omitted
+        @ register of that row would have occupied.
+        add             r0,  r0,  #8
+.irp i, 20, 24, 28
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 21, 25, 29
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 22, 26, 30
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        @ Hand the unsaved block over in d28-d31, which is where the
+        @ second pass expects its last four input rows.
+        vmov            d28, d16
+        vmov            d29, d17
+        vmov            d30, d18
+        vmov            d31, d19
+        bx              lr
+endfunc
+
+@ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
+@ load the destination pixels (from a similar 4x16 slice), add and store back.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+@ r3 = slice offset
+function \txfm\()16_1d_4x16_pass2_neon
+        @ r12 = temp buffer row stride in bytes
+        mov             r12, #32
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        @ For the first slice (r3 == 0) the last four rows are not in
+        @ the temp buffer; they are still in d28-d31, left there by the
+        @ final first-pass call.
+        cmp             r3,  #0
+        beq             1f
+.irp i, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+1:
+
+        \txfm\()16
+
+        @ Round four q registers (eight rows of four coefficients) by
+        @ 6 bits, add them to eight destination rows of four pixels each
+        @ and store the result back with unsigned saturation.
+        @ d4-d7 hold the destination pixels, two rows per register.
+.macro load_add_store coef0, coef1, coef2, coef3
+        vrshr.s16       \coef0, \coef0, #6
+        vrshr.s16       \coef1, \coef1, #6
+
+        vld1.32         {d4[]},   [r0,:32], r1
+        vld1.32         {d4[1]},  [r0,:32], r1
+        vrshr.s16       \coef2, \coef2, #6
+        vrshr.s16       \coef3, \coef3, #6
+        vld1.32         {d5[]},   [r0,:32], r1
+        vld1.32         {d5[1]},  [r0,:32], r1
+        vaddw.u8        \coef0, \coef0, d4
+        vld1.32         {d6[]},   [r0,:32], r1
+        vld1.32         {d6[1]},  [r0,:32], r1
+        vaddw.u8        \coef1, \coef1, d5
+        vld1.32         {d7[]},   [r0,:32], r1
+        vld1.32         {d7[1]},  [r0,:32], r1
+
+        vqmovun.s16     d4,  \coef0
+        vqmovun.s16     d5,  \coef1
+        @ Step r0 back over the eight rows just loaded
+        sub             r0,  r0,  r1, lsl #3
+        vaddw.u8        \coef2, \coef2, d6
+        vaddw.u8        \coef3, \coef3, d7
+        vst1.32         {d4[0]},  [r0,:32], r1
+        vst1.32         {d4[1]},  [r0,:32], r1
+        vqmovun.s16     d6,  \coef2
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r0,:32], r1
+        vqmovun.s16     d7,  \coef3
+
+        vst1.32         {d6[0]},  [r0,:32], r1
+        vst1.32         {d6[1]},  [r0,:32], r1
+        vst1.32         {d7[0]},  [r0,:32], r1
+        vst1.32         {d7[1]},  [r0,:32], r1
+.endm
+        load_add_store  q8,  q9,  q10, q11
+        load_add_store  q12, q13, q14, q15
+.purgem load_add_store
+
+        bx              lr
+endfunc
+.endm
+
+itxfm16_1d_funcs idct
+itxfm16_1d_funcs iadst
+
+@ Generate ff_vp9_<txfm1>_<txfm2>_16x16_add_neon.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = coefficients
+@ r3 = compared against 1 to select the DC-only routine (idct/idct only)
+.macro itxfm_func16x16 txfm1, txfm2
+function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
+.ifc \txfm1\()_\txfm2,idct_idct
+        @ Nothing has been pushed yet, so the DC-only routine can be
+        @ branched to directly and returns to our caller.
+        cmp             r3,  #1
+        beq             idct16x16_dc_add_neon
+.endif
+1:
+        push            {r4-r7,lr}
+.ifnc \txfm1\()_\txfm2,idct_idct
+        @ iadst16 uses q4-q7 as scratch
+        vpush           {q4-q7}
+.endif
+        @ r7 = stack pointer to restore on exit
+        mov             r7,  sp
+
+        @ Keep the arguments in registers that survive the calls below
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r12, sp
+T       bic             r12, r12, #15
+T       sub             r12, r12, #512
+T       mov             sp,  r12
+A       bic             sp,  sp,  #15
+A       sub             sp,  sp,  #512
+
+.ifc \txfm1,idct
+        @ idct16 expects its constants in q0-q1 (iadst16 loads its own)
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+.endif
+
+        @ First pass: source slices at column offsets 0, 4, 8, 12 are
+        @ transformed into the temp buffer.
+.irp i, 0, 4, 8, 12
+        mov             r3,  #\i
+        add             r2,  r6,  #(\i*2)
+        add             r0,  sp,  #(\i*32)
+        bl              \txfm1\()16_1d_4x16_pass1_neon
+.endr
+.ifc \txfm2,idct
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+.endif
+        @ Second pass: temp buffer slices are transformed and added
+        @ into the destination.
+.irp i, 0, 4, 8, 12
+        mov             r3,  #\i
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        add             r0,  r4,  #(\i)
+        bl              \txfm2\()16_1d_4x16_pass2_neon
+.endr
+
+        @ Drop the temp buffer and undo the alignment
+        mov             sp,  r7
+.ifnc \txfm1\()_\txfm2,idct_idct
+        vpop            {q4-q7}
+.endif
+        pop             {r4-r7,pc}
+endfunc
+.endm
+
+itxfm_func16x16 idct,  idct
+itxfm_func16x16 iadst, idct
+itxfm_func16x16 idct,  iadst
+itxfm_func16x16 iadst, iadst
+
+
+@ DC-only 32x32 idct/idct: a single constant derived from the first
+@ coefficient is added to all 32x32 destination pixels.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = coefficients (only the first one is read, and then cleared)
+function idct32x32_dc_add_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {d0}, [r12,:64]
+
+        @ Fetch the DC coefficient into every lane of d16 and clear it
+        @ in memory straight away (q2 is reused for pixels in the loop).
+        vld1.16         {d16[]},   [r2]
+        vmov.i16        q2, #0
+        vst1.16         {d4[0]}, [r2]
+
+        @ Multiply by d0[0] with a rounding narrowing shift by 14, twice
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vmull.s16       q8,  d16, d0[0]
+        vrshrn.s32      d16, q8,  #14
+        vdup.16         q8,  d16[0]
+
+        @ Final rounding of the value that gets added to the pixels
+        vrshr.s16       q8,  q8,  #6
+
+        mov             r12, #32
+1:
+        @ One row of 32 pixels per iteration: widen-add q8, then narrow
+        @ back to 8 bit with unsigned saturation. The NEON instructions
+        @ leave the flags set by subs untouched for the bne.
+        vld1.8          {q2-q3},  [r0,:128]
+        subs            r12, r12, #1
+        vaddw.u8        q10, q8,  d4
+        vaddw.u8        q11, q8,  d5
+        vaddw.u8        q12, q8,  d6
+        vaddw.u8        q13, q8,  d7
+        vqmovun.s16     d4,  q10
+        vqmovun.s16     d5,  q11
+        vqmovun.s16     d6,  q12
+        vqmovun.s16     d7,  q13
+        vst1.8          {q2-q3},  [r0,:128], r1
+        bne             1b
+
+        bx              lr
+endfunc
+
+@ The odd half of the 32-point IDCT, on d registers.
+@ In:  d16-d31 = the odd-indexed inputs (the 4x32 pass functions load
+@      IN(1), IN(3) ... IN(31) into them)
+@ Out: d16-d31 = the values t16 .. t31 (some with an 'a' suffix), in
+@      order; see the trailing comments of the last writes to each
+@ Clobbers r12, q0-q3: q0-q1 are first loaded from idct_coeffs + 32,
+@ after which q0 is replaced by the start of idct_coeffs.
+.macro idct32_odd
+        movrel          r12, idct_coeffs
+        add             r12, r12, #32
+        vld1.16         {q0-q1}, [r12,:128]
+
+        mbutterfly      d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a
+        mbutterfly      d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a
+        mbutterfly      d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a
+        mbutterfly      d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a
+        mbutterfly      d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a
+        mbutterfly      d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a
+        mbutterfly      d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a
+        mbutterfly      d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a
+
+        @ The remaining stages use constants from the start of
+        @ idct_coeffs again
+        sub             r12, r12, #32
+        vld1.16         {q0}, [r12,:128]
+
+        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
+        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
+        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
+        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
+        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
+        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
+        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
+        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+        @ q8/q9 hold no live values at this point and serve as
+        @ temporaries
+        mbutterfly      d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a
+        mbutterfly_neg  d27, d20, d0[3], d1[0], q8, q9 @ d27 = t29a, d20 = t18a
+        mbutterfly      d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a
+        mbutterfly_neg  d25, d22, d1[1], d1[2], q8, q9 @ d25 = t25a, d22 = t22a
+
+        butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
+        butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
+        butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
+        butterfly       d19, d21, d22, d21 @ d19 = t22,  d21 = t21
+        butterfly       d4,  d28, d28, d30 @ d4  = t24a, d28 = t27a
+        butterfly       d23, d26, d25, d26 @ d23 = t25,  d26 = t26
+        butterfly       d7,  d29, d29, d31 @ d7  = t31a, d29 = t28a
+        butterfly       d22, d27, d24, d27 @ d22 = t30,  d27 = t29
+
+        mbutterfly      d27, d20, d0[1], d0[2], q12, q15 @ d27 = t18a, d20 = t29a
+        mbutterfly      d29, d5,  d0[1], d0[2], q12, q15 @ d29 = t19,  d5  = t28
+        mbutterfly_neg  d28, d6,  d0[1], d0[2], q12, q15 @ d28 = t27,  d6  = t20
+        mbutterfly_neg  d26, d21, d0[1], d0[2], q12, q15 @ d26 = t26a, d21 = t21a
+
+        butterfly       d31, d24, d7,  d4  @ d31 = t31,  d24 = t24
+        butterfly       d30, d25, d22, d23 @ d30 = t30a, d25 = t25a
+        butterfly_r     d23, d16, d16, d18 @ d23 = t23,  d16 = t16
+        butterfly_r     d22, d17, d17, d19 @ d22 = t22a, d17 = t17a
+        butterfly       d18, d21, d27, d21 @ d18 = t18,  d21 = t21
+        butterfly_r     d27, d28, d5,  d28 @ d27 = t27a, d28 = t28a
+        butterfly       d4,  d26, d20, d26 @ d4  = t29,  d26 = t26
+        butterfly       d19, d20, d29, d6  @ d19 = t19a, d20 = t20
+        @ t29 was computed into scratch because d29 was still an input
+        @ of the butterfly above
+        vmov            d29, d4            @ d29 = t29
+
+        mbutterfly0     d27, d20, d27, d20, d4, d6, q2, q3 @ d27 = t27,  d20 = t20
+        mbutterfly0     d26, d21, d26, d21, d4, d6, q2, q3 @ d26 = t26a, d21 = t21a
+        mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
+        mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
+.endm
+
+@ Do a 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
+@ We don't have register space to do a single pass IDCT of 4x32 though,
+@ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
+@ a normal IDCT16 with every other input component (the even ones, with
+@ each output written twice), followed by a separate 16-point IDCT
+@ of the odd inputs, added/subtracted onto the outputs of the first idct16.
+@ The source coefficients are overwritten with zeros as they are read.
+@ r0 = dst (temp buffer)
+@ r1 = unused
+@ r2 = src
+function idct32_1d_4x32_pass1_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+        @ Double stride of the input, since we only read every other line
+        mov             r12, #128
+        @ d4 = zeros for clearing the source; .i16 is the data type
+        @ used for zeroing vectors everywhere else in this file
+        vmov.i16        d4, #0
+
+        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        idct16
+
+        @ Do four 4x4 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+        @ contain the transposed 4x4 blocks.
+        transpose16_q_2x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+        @ Store the registers a, b, c, d horizontally, followed
+        @ by the same registers d, c, b, a mirrored.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+        vst1.16         {d\i}, [r0,:64]!
+        vrev64.16       d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+.endm
+        store_rev       16, 20, 24, 28
+        store_rev       17, 21, 25, 29
+        store_rev       18, 22, 26, 30
+        store_rev       19, 23, 27, 31
+        @ Rewind r0 over the 4 * 64 bytes just written, so that the odd
+        @ half below can accumulate onto them
+        sub             r0,  r0,  #256
+.purgem store_rev
+
+        @ Move r2 back to the start of the input, and move
+        @ to the first odd row
+        sub             r2,  r2,  r12, lsl #4
+        add             r2,  r2,  #64
+
+        @ d4 was used as scratch by idct16 and has to be zeroed again
+        vmov.i16        d4, #0
+        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        idct32_odd
+
+        @ Same transposes as above, but with the register list reversed
+        transpose16_q_2x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
+
+        @ Store the registers a, b, c, d horizontally,
+        @ adding into the output first, and then mirrored, subtracted
+        @ from the output.
+.macro store_rev a, b, c, d
+.irp i, \a, \b, \c, \d
+        vld1.16         {d4},  [r0,:64]
+        vadd.s16        d4, d4, d\i
+        vst1.16         {d4},  [r0,:64]!
+        vrev64.16       d\i, d\i
+.endr
+.irp i, \d, \c, \b, \a
+        vld1.16         {d4},  [r0,:64]
+        vsub.s16        d4, d4, d\i
+        vst1.16         {d4},  [r0,:64]!
+.endr
+.endm
+
+        store_rev 31, 27, 23, 19
+        store_rev 30, 26, 22, 18
+        store_rev 29, 25, 21, 17
+        store_rev 28, 24, 20, 16
+.purgem store_rev
+        bx              lr
+endfunc
+
+@ This is mostly the same as 4x32_pass1, but without the transpose;
+@ it uses the source as temp buffer between the two idct passes, and
+@ adds the result into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = src (temp buffer)
+function idct32_1d_4x32_pass2_neon
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+        @ Stride of two rows, since only every other row is read
+        mov             r12, #128
+        @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        @ Back to the first even row
+        sub             r2,  r2,  r12, lsl #4
+
+        idct16
+
+        @ Park the idct16 outputs in the rows the even inputs came from
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vst1.16         {d\i}, [r2,:64], r12
+.endr
+
+        @ Back to the start, then on to the first odd row
+        sub             r2,  r2,  r12, lsl #4
+        add             r2,  r2,  #64
+
+        @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        @ Point r2 at the first parked idct16 output again
+        sub             r2,  r2,  r12, lsl #4
+        sub             r2,  r2,  #64
+
+        idct32_odd
+
+        @ idct32_odd used r12 as a pointer, so the stride is set up again
+        mov             r12,  #128
+        @ Load four parked idct16 outputs (stepping r2 by r12), add
+        @ (neg == 0) or subtract (neg != 0) the registers d\a-d\d,
+        @ round by 6 bits, add onto four destination rows of four pixels
+        @ and store them back with unsigned saturation.
+        @ Clobbers q1-q3.
+.macro load_acc_store a, b, c, d, neg=0
+        vld1.16         {d4},  [r2,:64], r12
+        vld1.16         {d5},  [r2,:64], r12
+.if \neg == 0
+        vadd.s16        d4, d4, d\a
+        vld1.16         {d6},  [r2,:64], r12
+        vadd.s16        d5, d5, d\b
+        vld1.16         {d7},  [r2,:64], r12
+        vadd.s16        d6, d6, d\c
+        vadd.s16        d7, d7, d\d
+.else
+        vsub.s16        d4, d4, d\a
+        vld1.16         {d6},  [r2,:64], r12
+        vsub.s16        d5, d5, d\b
+        vld1.16         {d7},  [r2,:64], r12
+        vsub.s16        d6, d6, d\c
+        vsub.s16        d7, d7, d\d
+.endif
+        vld1.32         {d2[]},   [r0,:32], r1
+        vld1.32         {d2[1]},  [r0,:32], r1
+        vrshr.s16       q2, q2, #6
+        vld1.32         {d3[]},   [r0,:32], r1
+        vrshr.s16       q3, q3, #6
+        vld1.32         {d3[1]},  [r0,:32], r1
+        @ Step r0 back over the four rows just loaded
+        sub             r0,  r0,  r1, lsl #2
+        vaddw.u8        q2,  q2,  d2
+        vaddw.u8        q3,  q3,  d3
+        vqmovun.s16     d4,  q2
+        vqmovun.s16     d5,  q3
+        vst1.32         {d4[0]},  [r0,:32], r1
+        vst1.32         {d4[1]},  [r0,:32], r1
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r0,:32], r1
+.endm
+        @ First 16 output rows: idct16 outputs in ascending order plus
+        @ the odd half taken from d31 down to d16
+        load_acc_store  31, 30, 29, 28
+        load_acc_store  27, 26, 25, 24
+        load_acc_store  23, 22, 21, 20
+        load_acc_store  19, 18, 17, 16
+        @ Last 16 output rows: step back onto the last idct16 output and
+        @ walk the parked values in reverse (negative stride), this time
+        @ subtracting d16 up to d31
+        sub             r2,  r2,  r12
+        neg             r12, r12
+        load_acc_store  16, 17, 18, 19, 1
+        load_acc_store  20, 21, 22, 23, 1
+        load_acc_store  24, 25, 26, 27, 1
+        load_acc_store  28, 29, 30, 31, 1
+.purgem load_acc_store
+        bx              lr
+endfunc
+
+@ 32x32 idct/idct with add into the destination.
+@ r0 = dst
+@ r1 = dst stride
+@ r2 = coefficients
+@ r3 = compared against 1 to select the DC-only routine
+function ff_vp9_idct_idct_32x32_add_neon, export=1
+        @ Nothing has been pushed yet, so the DC-only routine can be
+        @ branched to directly and returns to our caller.
+        cmp             r3,  #1
+        beq             idct32x32_dc_add_neon
+1:
+        push            {r4-r7,lr}
+        vpush           {q4-q7}
+        @ r7 = stack pointer to restore on exit
+        mov             r7,  sp
+
+        @ Keep the arguments in registers that survive the calls below
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r12, sp
+T       bic             r12, r12, #15
+T       sub             r12, r12, #2048
+T       mov             sp,  r12
+A       bic             sp,  sp,  #15
+A       sub             sp,  sp,  #2048
+
+        @ First pass: eight source slices, four columns apart, are
+        @ transformed into the temp buffer.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r2,  r6,  #(\i*2)
+        add             r0,  sp,  #(\i*64)
+        bl              idct32_1d_4x32_pass1_neon
+.endr
+        @ Second pass: temp buffer slices are transformed and added
+        @ into the destination.
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        add             r0,  r4,  #(\i)
+        bl              idct32_1d_4x32_pass2_neon
+.endr
+
+        @ Drop the temp buffer and undo the alignment
+        mov             sp,  r7
+        vpop            {q4-q7}
+        pop             {r4-r7,pc}
+endfunc