[11/11] aarch64: vp9itxfm: Do full, separate functions for half/quarter idct16 and idct32

Message ID 1479906058-22747-11-git-send-email-martin@martin.st
State Superseded

Commit Message

Martin Storsjö Nov. 23, 2016, 1 p.m.
This work is sponsored by, and copyright, Google.

This avoids having to fill the temp buffer with zeros for the
skipped slices, and leads to slightly more straightforward code for
these cases, instead of riddling the common code with special case
branches or macro .ifs. (In the 16x16 case, the special case pass
functions are written out as separate functions instead of being
templated from the same macro.)

The code size increases from 18548 bytes to 24580 bytes.
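
For reference, the special casing now happens once at the entry points: the
eob (w3) is compared against the same thresholds that were previously checked
inside the pass functions, and execution branches to a fully separate
quarter/half function. A sketch of the 32x32 entry, as in the patch below:

function ff_vp9_idct_idct_32x32_add_neon, export=1
        cmp             w3,  #1
        b.eq            idct32x32_dc_add_neon
        cmp             w3,  #34
        b.le            idct32x32_quarter_add_neon
        cmp             w3,  #135
        b.le            idct32x32_half_add_neon
        // ... full 32x32 path follows

For 32x32, the quarter/half pass functions are generated from the shared
macro via "idct32_funcs _quarter" and "idct32_funcs _half" (selected with
.ifb/.ifc on the suffix); for 16x16, they are written out as separate
functions.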

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:       236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:       714.2
vp9_inv_dct_dct_16x16_sub8_add_neon:       926.8
vp9_inv_dct_dct_16x16_sub12_add_neon:     1402.3
vp9_inv_dct_dct_16x16_sub16_add_neon:     1405.9
vp9_inv_dct_dct_32x32_sub1_add_neon:       554.1
vp9_inv_dct_dct_32x32_sub4_add_neon:      3958.8
vp9_inv_dct_dct_32x32_sub8_add_neon:      3958.8
vp9_inv_dct_dct_32x32_sub12_add_neon:     5461.1
vp9_inv_dct_dct_32x32_sub16_add_neon:     5467.4
vp9_inv_dct_dct_32x32_sub20_add_neon:     7175.4
vp9_inv_dct_dct_32x32_sub24_add_neon:     7172.5
vp9_inv_dct_dct_32x32_sub28_add_neon:     8136.8
vp9_inv_dct_dct_32x32_sub32_add_neon:     8135.9

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:       236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:       644.0
vp9_inv_dct_dct_16x16_sub8_add_neon:       854.0
vp9_inv_dct_dct_16x16_sub12_add_neon:     1393.8
vp9_inv_dct_dct_16x16_sub16_add_neon:     1392.6
vp9_inv_dct_dct_32x32_sub1_add_neon:       556.6
vp9_inv_dct_dct_32x32_sub4_add_neon:      3684.3
vp9_inv_dct_dct_32x32_sub8_add_neon:      3682.6
vp9_inv_dct_dct_32x32_sub12_add_neon:     5316.3
vp9_inv_dct_dct_32x32_sub16_add_neon:     5315.9
vp9_inv_dct_dct_32x32_sub20_add_neon:     7146.4
vp9_inv_dct_dct_32x32_sub24_add_neon:     7151.5
vp9_inv_dct_dct_32x32_sub28_add_neon:     8118.8
vp9_inv_dct_dct_32x32_sub32_add_neon:     8117.5

---
This reverts parts of the previous commit (changing some register uses to
a different register); if both are to be applied, they should be squashed
together. (Similarly for review, it is much easier to squash the two and
review the end result.) They are presented sequentially as two steps, to
show the effect of each alternative on runtime and code size.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 737 +++++++++++++++++++++++--------------
 1 file changed, 458 insertions(+), 279 deletions(-)

Patch

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index d74245f..78041d3 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -710,6 +710,51 @@  endfunc
         st1             {v2.8h},  [\src], \inc
 .endm
 
+.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
+        srshr           \coef0, \coef0, #6
+        ld1             {v2.8b},  [x0], x1
+        srshr           \coef1, \coef1, #6
+        ld1             {v3.8b},  [x3], x1
+        srshr           \coef2, \coef2, #6
+        ld1             {v4.8b},  [x0], x1
+        srshr           \coef3, \coef3, #6
+        uaddw           \coef0, \coef0, v2.8b
+        ld1             {v5.8b},  [x3], x1
+        uaddw           \coef1, \coef1, v3.8b
+        srshr           \coef4, \coef4, #6
+        ld1             {v6.8b},  [x0], x1
+        srshr           \coef5, \coef5, #6
+        ld1             {v7.8b},  [x3], x1
+        sqxtun          v2.8b,  \coef0
+        srshr           \coef6, \coef6, #6
+        sqxtun          v3.8b,  \coef1
+        srshr           \coef7, \coef7, #6
+        uaddw           \coef2, \coef2, v4.8b
+        ld1             {\tmp1},  [x0], x1
+        uaddw           \coef3, \coef3, v5.8b
+        ld1             {\tmp2},  [x3], x1
+        sqxtun          v4.8b,  \coef2
+        sub             x0,  x0,  x1, lsl #2
+        sub             x3,  x3,  x1, lsl #2
+        sqxtun          v5.8b,  \coef3
+        uaddw           \coef4, \coef4, v6.8b
+        st1             {v2.8b},  [x0], x1
+        uaddw           \coef5, \coef5, v7.8b
+        st1             {v3.8b},  [x3], x1
+        sqxtun          v6.8b,  \coef4
+        st1             {v4.8b},  [x0], x1
+        sqxtun          v7.8b,  \coef5
+        st1             {v5.8b},  [x3], x1
+        uaddw           \coef6, \coef6, \tmp1
+        st1             {v6.8b},  [x0], x1
+        uaddw           \coef7, \coef7, \tmp2
+        st1             {v7.8b},  [x3], x1
+        sqxtun          \tmp1,  \coef6
+        sqxtun          \tmp2,  \coef7
+        st1             {\tmp1},  [x0], x1
+        st1             {\tmp2},  [x3], x1
+.endm
+
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 // transpose into a horizontal 16x8 slice and store.
 // x0 = dst (temp buffer)
@@ -728,37 +773,12 @@  function \txfm\()16_1d_8x16_pass1_neon
 
         mov             x9, #32
         movi            v2.8h, #0
-
-.ifc \txfm,idct
-        cmp             w3, #10
-        b.le            3f
-        cmp             w3, #38
-        b.le            4f
-.endif
-
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         load_clear      \i,  x2,  x9
 .endr
 
         bl              \txfm\()16
-.ifc \txfm,idct
-        b               5f
 
-3:
-.irp i, 16, 17, 18, 19
-        load_clear      \i,  x2,  x9
-.endr
-        bl              idct16_quarter
-        b               5f
-
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        load_clear      \i,  x2,  x9
-.endr
-        bl              idct16_half
-.endif
-
-5:
         // Do two 8x8 transposes. Originally, v16-v31 contain the
         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
         // transposed 8x8 blocks.
@@ -812,92 +832,25 @@  endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
-// w3 = eob
-// x13 = slice offset
+// x3 = slice offset
 function \txfm\()16_1d_8x16_pass2_neon
         mov             x14, x30
         mov             x9, #32
-.ifc \txfm,idct
-        cmp             w3, #10
-        b.le            3f
-        cmp             w3, #38
-        b.le            4f
-.endif
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         load            \i,  x2,  x9
 .endr
-        cbz             x13, 1f
+        cbz             x3,  1f
 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
         load            \i,  x2,  x9
 .endr
 1:
 
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
         bl              \txfm\()16
-.ifc \txfm,idct
-        b               5f
-3:
-.irp i, 16, 17, 18, 19
-        load            \i,  x2,  x9
-.endr
-        bl              idct16_quarter
-        b               5f
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        load            \i,  x2,  x9
-.endr
-        bl              idct16_half
-.endif
 
-5:
-        add             x8,  x0,  x1
-        lsl             x1,  x1,  #1
-.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
-        srshr           \coef0, \coef0, #6
-        ld1             {v2.8b},  [x0], x1
-        srshr           \coef1, \coef1, #6
-        ld1             {v3.8b},  [x8], x1
-        srshr           \coef2, \coef2, #6
-        ld1             {v4.8b},  [x0], x1
-        srshr           \coef3, \coef3, #6
-        uaddw           \coef0, \coef0, v2.8b
-        ld1             {v5.8b},  [x8], x1
-        uaddw           \coef1, \coef1, v3.8b
-        srshr           \coef4, \coef4, #6
-        ld1             {v6.8b},  [x0], x1
-        srshr           \coef5, \coef5, #6
-        ld1             {v7.8b},  [x8], x1
-        sqxtun          v2.8b,  \coef0
-        srshr           \coef6, \coef6, #6
-        sqxtun          v3.8b,  \coef1
-        srshr           \coef7, \coef7, #6
-        uaddw           \coef2, \coef2, v4.8b
-        ld1             {\tmp1},  [x0], x1
-        uaddw           \coef3, \coef3, v5.8b
-        ld1             {\tmp2},  [x8], x1
-        sqxtun          v4.8b,  \coef2
-        sub             x0,  x0,  x1, lsl #2
-        sub             x8,  x8,  x1, lsl #2
-        sqxtun          v5.8b,  \coef3
-        uaddw           \coef4, \coef4, v6.8b
-        st1             {v2.8b},  [x0], x1
-        uaddw           \coef5, \coef5, v7.8b
-        st1             {v3.8b},  [x8], x1
-        sqxtun          v6.8b,  \coef4
-        st1             {v4.8b},  [x0], x1
-        sqxtun          v7.8b,  \coef5
-        st1             {v5.8b},  [x8], x1
-        uaddw           \coef6, \coef6, \tmp1
-        st1             {v6.8b},  [x0], x1
-        uaddw           \coef7, \coef7, \tmp2
-        st1             {v7.8b},  [x8], x1
-        sqxtun          \tmp1,  \coef6
-        sqxtun          \tmp2,  \coef7
-        st1             {\tmp1},  [x0], x1
-        st1             {\tmp2},  [x8], x1
-.endm
         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
-.purgem load_add_store
 
         br              x14
 endfunc
@@ -916,6 +869,10 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             w3,  #1
         b.eq            idct16x16_dc_add_neon
+        cmp             w3,  #10
+        b.eq            idct16x16_quarter_add_neon
+        cmp             w3,  #38
+        b.eq            idct16x16_half_add_neon
 .endif
         mov             x15, x30
         // iadst16 requires clobbering v8-v15, but idct16 doesn't need to.
@@ -936,7 +893,6 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifnc \txfm1\()_\txfm2,idct_idct
         movrel          x11, iadst16_coeffs
         mov             x7,  #0
-        mov             w3,  #256
 .else
         movrel          x12, min_eob_idct_idct_16
 .endif
@@ -960,7 +916,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         add             x0,  x4,  #(\i)
         mov             x1,  x5
         add             x2,  sp,  #(\i*2)
-        mov             x13, #\i
+        mov             x3,  #\i
         bl              \txfm2\()16_1d_8x16_pass2_neon
 .endr
 
@@ -980,6 +936,163 @@  itxfm_func16x16 iadst, idct
 itxfm_func16x16 idct,  iadst
 itxfm_func16x16 iadst, iadst
 
+function idct16_1d_8x16_pass1_quarter_neon
+        mov             x14, x30
+        mov             x9, #32
+        movi            v2.8h, #0
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_quarter
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        // The first 8x8 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        // Since only a 4x4 part of the input was nonzero,
+        // this means that only 4 rows are nonzero after transposing, and
+        // the second pass only reads the topmost 4 rows. Therefore only
+        // store the topmost 4 rows.
+.irp i, 24, 25, 26, 27
+        add             x0,  x0,  #16
+        store           \i,  x0,  #16
+.endr
+        br              x14
+endfunc
+
+function idct16_1d_8x16_pass2_quarter_neon
+        mov             x14, x30
+        mov             x9, #32
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_quarter
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        br              x14
+endfunc
+
+function idct16_1d_8x16_pass1_half_neon
+        mov             x14, x30
+        mov             x9, #32
+        movi            v2.8h, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+
+        bl              idct16_half
+
+        // Do two 8x8 transposes. Originally, v16-v31 contain the
+        // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
+        // transposed 8x8 blocks.
+        transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
+        transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
+
+        // Store the transposed 8x8 blocks horizontally.
+        // The first 8x8 block is kept in registers for the second pass,
+        // store the rest in the temp buffer.
+        // Since only an 8x8 part of the input was nonzero,
+        // this means that only 8 rows are nonzero after transposing, and
+        // the second pass only reads the topmost 8 rows. Therefore only
+        // store the topmost 8 rows.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        add             x0,  x0,  #16
+        store           \i,  x0,  #16
+.endr
+        br              x14
+endfunc
+
+function idct16_1d_8x16_pass2_half_neon
+        mov             x14, x30
+        mov             x9, #32
+        cbz             x3,  1f
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+1:
+
+        add             x3,  x0,  x1
+        lsl             x1,  x1,  #1
+        bl              idct16_half
+
+        load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
+        load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
+
+        br              x14
+endfunc
+
+function idct16x16_quarter_add_neon, export=1
+        mov             x15, x30
+
+        sub             sp,  sp,  #512
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+        ld1             {v0.8h,v1.8h}, [x10]
+
+.irp i, 0
+        add             x0,  sp,  #(\i*32)
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*2)
+        bl              idct16_1d_8x16_pass1_quarter_neon
+.endr
+.irp i, 0, 8
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        mov             x3,  #\i
+        bl              idct16_1d_8x16_pass2_quarter_neon
+.endr
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
+
+function idct16x16_half_add_neon, export=1
+        mov             x15, x30
+
+        sub             sp,  sp,  #512
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+        movrel          x10, idct_coeffs
+        ld1             {v0.8h,v1.8h}, [x10]
+
+.irp i, 0
+        add             x0,  sp,  #(\i*32)
+        mov             x1,  #\i
+        add             x2,  x6,  #(\i*2)
+        bl              idct16_1d_8x16_pass1_half_neon
+.endr
+.irp i, 0, 8
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        mov             x3,  #\i
+        bl              idct16_1d_8x16_pass2_half_neon
+.endr
+
+        add             sp,  sp,  #512
+        br              x15
+endfunc
 
 function idct32x32_dc_add_neon
         movrel          x4, idct_coeffs
@@ -1160,6 +1273,85 @@  function idct32_odd_quarter
 endfunc
 
 
+// Store the registers a, b horizontally, followed by the
+// same registers b, a mirrored.
+.macro store_rev1 a, b
+        // There's no rev128 instruction, but we reverse each 64 bit
+        // half, and then flip them using an ext with 8 bytes offset.
+        rev64           v1.8h, v\b\().8h
+        st1             {v\a\().8h},  [x0], #16
+        rev64           v0.8h, v\a\().8h
+        ext             v1.16b, v1.16b, v1.16b, #8
+        st1             {v\b\().8h},  [x0], #16
+        ext             v0.16b, v0.16b, v0.16b, #8
+        st1             {v1.8h},  [x0], #16
+        st1             {v0.8h},  [x0], #16
+.endm
+
+// Store the registers a, b horizontally,
+// adding into the output first, and the mirrored,
+// subtracted from the output.
+.macro store_rev2 a, b
+        ld1             {v4.8h},  [x0]
+        rev64           v1.8h, v\b\().8h
+        add             v4.8h, v4.8h, v\a\().8h
+        rev64           v0.8h, v\a\().8h
+        st1             {v4.8h},  [x0], #16
+        ext             v1.16b, v1.16b, v1.16b, #8
+        ld1             {v5.8h},  [x0]
+        ext             v0.16b, v0.16b, v0.16b, #8
+        add             v5.8h, v5.8h, v\b\().8h
+        st1             {v5.8h},  [x0], #16
+        ld1             {v6.8h},  [x0]
+        sub             v6.8h, v6.8h, v1.8h
+        st1             {v6.8h},  [x0], #16
+        ld1             {v7.8h},  [x0]
+        sub             v7.8h, v7.8h, v0.8h
+        st1             {v7.8h},  [x0], #16
+.endm
+
+.macro load_acc_store a, b, c, d, neg=0
+        ld1             {v4.8h},  [x2], x9
+        ld1             {v5.8h},  [x2], x9
+.if \neg == 0
+        add             v4.8h, v4.8h, v\a\().8h
+        ld1             {v6.8h},  [x2], x9
+        add             v5.8h, v5.8h, v\b\().8h
+        ld1             {v7.8h},  [x2], x9
+        add             v6.8h, v6.8h, v\c\().8h
+        add             v7.8h, v7.8h, v\d\().8h
+.else
+        sub             v4.8h, v4.8h, v\a\().8h
+        ld1             {v6.8h},  [x2], x9
+        sub             v5.8h, v5.8h, v\b\().8h
+        ld1             {v7.8h},  [x2], x9
+        sub             v6.8h, v6.8h, v\c\().8h
+        sub             v7.8h, v7.8h, v\d\().8h
+.endif
+        ld1             {v0.8b}, [x0], x1
+        ld1             {v1.8b}, [x0], x1
+        srshr           v4.8h, v4.8h, #6
+        ld1             {v2.8b}, [x0], x1
+        srshr           v5.8h, v5.8h, #6
+        uaddw           v4.8h, v4.8h, v0.8b
+        ld1             {v3.8b}, [x0], x1
+        srshr           v6.8h, v6.8h, #6
+        uaddw           v5.8h, v5.8h, v1.8b
+        srshr           v7.8h, v7.8h, #6
+        sub             x0,  x0,  x1, lsl #2
+        uaddw           v6.8h, v6.8h, v2.8b
+        sqxtun          v4.8b, v4.8h
+        uaddw           v7.8h, v7.8h, v3.8b
+        sqxtun          v5.8b, v5.8h
+        st1             {v4.8b}, [x0], x1
+        sqxtun          v6.8b, v6.8h
+        st1             {v5.8b}, [x0], x1
+        sqxtun          v7.8b, v7.8h
+        st1             {v6.8b}, [x0], x1
+        st1             {v7.8b}, [x0], x1
+.endm
+
+.macro idct32_funcs suffix
 // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
 // a normal IDCT16 with every other input component (the even ones, with
@@ -1171,149 +1363,102 @@  endfunc
 // w3 = eob
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass1_neon
+function idct32_1d_8x32_pass1\suffix\()_neon
         // Check if this whole input slice is zero
+.ifb \suffix
         cmp             w3,  w1
         b.le            1f
+.endif
 
         mov             x14, x30
         ld1             {v0.8h,v1.8h}, [x10]
 
         // Double stride of the input, since we only read every other line
         mov             x9,  #128
-        movi            v4.8h, #0
-
-        cmp             w3,  #4
-        b.le            3f
-        cmp             w3,  #135
-        b.le            4f
+        movi            v2.8h, #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x2]
-        st1             {v4.8h},  [x2], x9
+        load_clear      \i, x2, x9
 .endr
-
-        bl              idct16
-        sub             x2,  x2,  x9, lsl #4
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
-        ld1             {v\i\().8h}, [x2]
-        st1             {v4.8h},  [x2], x9
+        load_clear      \i, x2, x9
 .endr
-        bl              idct16_quarter
-        sub             x2,  x2,  x9, lsl #2
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        ld1             {v\i\().8h}, [x2]
-        st1             {v4.8h},  [x2], x9
+        load_clear      \i, x2, x9
 .endr
-        bl              idct16_half
-        sub             x2,  x2,  x9, lsl #3
+.endif
+
+        bl              idct16\suffix
 
-5:
         // Do two 8x8 transposes. Originally, v16-v31 contain the
         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
         // two transposed 8x8 blocks.
         transpose_8x8H  v16, v17, v18, v19, v20, v21, v22, v23, v2, v3
         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 
-        // Store the registers a, b horizontally, followed by the
-        // same registers b, a mirrored.
-.macro store_rev a, b
-        // There's no rev128 instruction, but we reverse each 64 bit
-        // half, and then flip them using an ext with 8 bytes offset.
-        rev64           v1.8h, v\b\().8h
-        st1             {v\a\().8h},  [x0], #16
-        rev64           v0.8h, v\a\().8h
-        ext             v1.16b, v1.16b, v1.16b, #8
-        st1             {v\b\().8h},  [x0], #16
-        ext             v0.16b, v0.16b, v0.16b, #8
-        st1             {v1.8h},  [x0], #16
-        st1             {v0.8h},  [x0], #16
-.endm
-        store_rev       16, 24
-        store_rev       17, 25
-        store_rev       18, 26
-        store_rev       19, 27
-        store_rev       20, 28
-        store_rev       21, 29
-        store_rev       22, 30
-        store_rev       23, 31
+        store_rev1      16, 24
+        store_rev1      17, 25
+        store_rev1      18, 26
+        store_rev1      19, 27
+        store_rev1      20, 28
+        store_rev1      21, 29
+        store_rev1      22, 30
+        store_rev1      23, 31
         sub             x0,  x0,  #512
-.purgem store_rev
 
-        // Move x2 to the first odd row
+        // Move x2 back to the start of the input, and move
+        // to the first odd row
+.ifb \suffix
+        sub             x2,  x2,  x9, lsl #4
+.endif
+.ifc \suffix,_quarter
+        sub             x2,  x2,  x9, lsl #2
+.endif
+.ifc \suffix,_half
+        sub             x2,  x2,  x9, lsl #3
+.endif
         add             x2,  x2,  #64
 
-        movi            v4.8h, #0
-
-        cmp             w3,  #34
-        b.le            3f
-        cmp             w3,  #135
-        b.le            4f
-
+        movi            v2.8h, #0
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x2]
-        st1             {v4.8h},  [x2], x9
+        load_clear      \i, x2, x9
 .endr
-
-        bl              idct32_odd
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
-        ld1             {v\i\().8h}, [x2]
-        st1             {v4.8h},  [x2], x9
+        load_clear      \i, x2, x9
 .endr
-        bl              idct32_odd_quarter
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        ld1             {v\i\().8h}, [x2]
-        st1             {v4.8h},  [x2], x9
+        load_clear      \i, x2, x9
 .endr
-        bl              idct32_odd_half
+.endif
+
+        bl              idct32_odd\suffix
 
-5:
         transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
         transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
 
-        // Store the registers a, b horizontally,
-        // adding into the output first, and the mirrored,
-        // subtracted from the output.
-.macro store_rev a, b
-        ld1             {v4.8h},  [x0]
-        rev64           v1.8h, v\b\().8h
-        add             v4.8h, v4.8h, v\a\().8h
-        rev64           v0.8h, v\a\().8h
-        st1             {v4.8h},  [x0], #16
-        ext             v1.16b, v1.16b, v1.16b, #8
-        ld1             {v5.8h},  [x0]
-        ext             v0.16b, v0.16b, v0.16b, #8
-        add             v5.8h, v5.8h, v\b\().8h
-        st1             {v5.8h},  [x0], #16
-        ld1             {v6.8h},  [x0]
-        sub             v6.8h, v6.8h, v1.8h
-        st1             {v6.8h},  [x0], #16
-        ld1             {v7.8h},  [x0]
-        sub             v7.8h, v7.8h, v0.8h
-        st1             {v7.8h},  [x0], #16
-.endm
-
-        store_rev       31, 23
-        store_rev       30, 22
-        store_rev       29, 21
-        store_rev       28, 20
-        store_rev       27, 19
-        store_rev       26, 18
-        store_rev       25, 17
-        store_rev       24, 16
-.purgem store_rev
+        store_rev2      31, 23
+        store_rev2      30, 22
+        store_rev2      29, 21
+        store_rev2      28, 20
+        store_rev2      27, 19
+        store_rev2      26, 18
+        store_rev2      25, 17
+        store_rev2      24, 16
         br              x14
 
+.ifb \suffix
 1:
         // Write zeros to the temp buffer for pass 2
         movi            v16.8h,  #0
@@ -1324,6 +1469,7 @@  function idct32_1d_8x32_pass1_neon
         st1             {v16.8h-v19.8h},  [x0], #64
 .endr
         ret
+.endif
 endfunc
 
 // This is mostly the same as 8x32_pass1, but without the transpose,
@@ -1334,116 +1480,63 @@  endfunc
 // x2 = src (temp buffer)
 // x10 = idct_coeffs
 // x11 = idct_coeffs + 32
-function idct32_1d_8x32_pass2_neon
+function idct32_1d_8x32_pass2\suffix\()_neon
         mov             x14, x30
         ld1             {v0.8h,v1.8h}, [x10]
 
         mov             x9, #128
-
-        cmp             w3,  #34
-        b.le            3f
-        cmp             w3,  #135
-        b.le            4f
-
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x2], x9
+        load            \i, x2, x9
 .endr
         sub             x2,  x2,  x9, lsl #4
-
-        bl              idct16
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
-        ld1             {v\i\().8h}, [x2], x9
+        load            \i, x2, x9
 .endr
         sub             x2,  x2,  x9, lsl #2
-        bl              idct16_quarter
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        ld1             {v\i\().8h}, [x2], x9
+        load            \i, x2, x9
 .endr
         sub             x2,  x2,  x9, lsl #3
-        bl              idct16_half
+.endif
+
+        bl              idct16\suffix
 
-5:
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        st1             {v\i\().8h}, [x2], x9
+        store           \i, x2, x9
 .endr
 
         sub             x2,  x2,  x9, lsl #4
         add             x2,  x2,  #64
 
-        cmp             w3,  #34
-        b.le            3f
-        cmp             w3,  #135
-        b.le            4f
-
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
-        ld1             {v\i\().8h}, [x2], x9
+        load            \i, x2, x9
 .endr
         sub             x2,  x2,  x9, lsl #4
-
-        bl              idct32_odd
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
-        ld1             {v\i\().8h}, [x2], x9
+        load            \i, x2, x9
 .endr
         sub             x2,  x2,  x9, lsl #2
-        bl              idct32_odd_quarter
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        ld1             {v\i\().8h}, [x2], x9
+        load            \i, x2, x9
 .endr
         sub             x2,  x2,  x9, lsl #3
-        bl              idct32_odd_half
-
-5:
-        sub             x2,  x2,  #64
-.macro load_acc_store a, b, c, d, neg=0
-        ld1             {v4.8h},  [x2], x9
-        ld1             {v5.8h},  [x2], x9
-.if \neg == 0
-        add             v4.8h, v4.8h, v\a\().8h
-        ld1             {v6.8h},  [x2], x9
-        add             v5.8h, v5.8h, v\b\().8h
-        ld1             {v7.8h},  [x2], x9
-        add             v6.8h, v6.8h, v\c\().8h
-        add             v7.8h, v7.8h, v\d\().8h
-.else
-        sub             v4.8h, v4.8h, v\a\().8h
-        ld1             {v6.8h},  [x2], x9
-        sub             v5.8h, v5.8h, v\b\().8h
-        ld1             {v7.8h},  [x2], x9
-        sub             v6.8h, v6.8h, v\c\().8h
-        sub             v7.8h, v7.8h, v\d\().8h
 .endif
-        ld1             {v0.8b}, [x0], x1
-        ld1             {v1.8b}, [x0], x1
-        srshr           v4.8h, v4.8h, #6
-        ld1             {v2.8b}, [x0], x1
-        srshr           v5.8h, v5.8h, #6
-        uaddw           v4.8h, v4.8h, v0.8b
-        ld1             {v3.8b}, [x0], x1
-        srshr           v6.8h, v6.8h, #6
-        uaddw           v5.8h, v5.8h, v1.8b
-        srshr           v7.8h, v7.8h, #6
-        sub             x0,  x0,  x1, lsl #2
-        uaddw           v6.8h, v6.8h, v2.8b
-        sqxtun          v4.8b, v4.8h
-        uaddw           v7.8h, v7.8h, v3.8b
-        sqxtun          v5.8b, v5.8h
-        st1             {v4.8b}, [x0], x1
-        sqxtun          v6.8b, v6.8h
-        st1             {v5.8b}, [x0], x1
-        sqxtun          v7.8b, v7.8h
-        st1             {v6.8b}, [x0], x1
-        st1             {v7.8b}, [x0], x1
-.endm
+        sub             x2,  x2,  #64
+
+        bl              idct32_odd\suffix
+
         load_acc_store  31, 30, 29, 28
         load_acc_store  27, 26, 25, 24
         load_acc_store  23, 22, 21, 20
@@ -1454,9 +1547,13 @@  function idct32_1d_8x32_pass2_neon
         load_acc_store  20, 21, 22, 23, 1
         load_acc_store  24, 25, 26, 27, 1
         load_acc_store  28, 29, 30, 31, 1
-.purgem load_acc_store
         br              x14
 endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
 
 const min_eob_idct_idct_32, align=4
         .short  0, 34, 135, 336
@@ -1465,6 +1562,10 @@  endconst
 function ff_vp9_idct_idct_32x32_add_neon, export=1
         cmp             w3,  #1
         b.eq            idct32x32_dc_add_neon
+        cmp             w3,  #34
+        b.le            idct32x32_quarter_add_neon
+        cmp             w3,  #135
+        b.le            idct32x32_half_add_neon
 
         movrel          x10, idct_coeffs
         add             x11, x10, #32
@@ -1505,3 +1606,81 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
 
         br              x15
 endfunc
+
+function idct32x32_quarter_add_neon
+        movrel          x10, idct_coeffs
+        add             x11, x10, #32
+
+        mov             x15, x30
+
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #2048
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+.irp i, 0
+        add             x0,  sp,  #(\i*64)
+        add             x2,  x6,  #(\i*2)
+        bl              idct32_1d_8x32_pass1_quarter_neon
+.endr
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_quarter_neon
+.endr
+
+        add             sp,  sp,  #2048
+
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+
+        br              x15
+endfunc
+
+function idct32x32_half_add_neon
+        movrel          x10, idct_coeffs
+        add             x11, x10, #32
+
+        mov             x15, x30
+
+        stp             d14, d15, [sp, #-0x10]!
+        stp             d12, d13, [sp, #-0x10]!
+        stp             d10, d11, [sp, #-0x10]!
+        stp             d8,  d9,  [sp, #-0x10]!
+
+        sub             sp,  sp,  #2048
+
+        mov             x4,  x0
+        mov             x5,  x1
+        mov             x6,  x2
+
+.irp i, 0, 8
+        add             x0,  sp,  #(\i*64)
+        add             x2,  x6,  #(\i*2)
+        bl              idct32_1d_8x32_pass1_half_neon
+.endr
+.irp i, 0, 8, 16, 24
+        add             x0,  x4,  #(\i)
+        mov             x1,  x5
+        add             x2,  sp,  #(\i*2)
+        bl              idct32_1d_8x32_pass2_half_neon
+.endr
+
+        add             sp,  sp,  #2048
+
+        ldp             d8,  d9,  [sp], 0x10
+        ldp             d10, d11, [sp], 0x10
+        ldp             d12, d13, [sp], 0x10
+        ldp             d14, d15, [sp], 0x10
+
+        br              x15
+endfunc