[07/11] arm: vp9itxfm: Do full, separate functions for half/quarter idct16 and idct32

Message ID 1479906058-22747-7-git-send-email-martin@martin.st
State Superseded

Commit Message

Martin Storsjö Nov. 23, 2016, 1 p.m.
This work is sponsored by, and copyright, Google.

This avoids having to fill the temp buffer with zeros for the
skipped slices, and gives slightly more straightforward code for these
cases, instead of riddling the common code with special-case branches or
macro .ifs. (For the 16x16 case, the special-case pass functions are
written out separately instead of being templated from the same macro.)

The code size increases from 15000 bytes to 19864 bytes.
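
As a rough C-level sketch of the dispatch this sets up (not part of the
patch; the function names are hypothetical stand-ins for the NEON entry
points, while the eob thresholds match the ones used in the assembly below;
the 32x32 path dispatches analogously on eob <= 34 and eob <= 135):

    #include <stdint.h>
    #include <stddef.h>

    /* Hypothetical prototypes standing in for the specialized NEON functions.
     * Each one runs its own first and second pass, loading only the rows it
     * needs and never writing zero rows into the temp buffer. */
    void idct16x16_dc_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);
    void idct16x16_quarter_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);
    void idct16x16_half_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);
    void idct16x16_full_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs);

    void idct16x16_add(uint8_t *dst, ptrdiff_t stride, int16_t *coeffs, int eob)
    {
        if (eob == 1)            /* DC coefficient only */
            idct16x16_dc_add(dst, stride, coeffs);
        else if (eob <= 10)      /* nonzero coefficients roughly confined to the top-left 4x4 */
            idct16x16_quarter_add(dst, stride, coeffs);
        else if (eob <= 38)      /* nonzero coefficients roughly confined to the top-left 8x8 */
            idct16x16_half_add(dst, stride, coeffs);
        else
            idct16x16_full_add(dst, stride, coeffs);
    }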

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:       271.5    188.7    211.7    235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:      1336.5   1012.5   1225.9    860.7
vp9_inv_dct_dct_16x16_sub8_add_neon:      2023.2   1768.8   1868.1   1358.0
vp9_inv_dct_dct_16x16_sub12_add_neon:     2947.1   2228.9   2304.8   1795.7
vp9_inv_dct_dct_16x16_sub16_add_neon:     3247.9   2536.7   2547.0   2036.1
vp9_inv_dct_dct_32x32_sub1_add_neon:       751.5    456.7    863.5    553.9
vp9_inv_dct_dct_32x32_sub4_add_neon:      8019.6   5868.0   6632.6   5134.4
vp9_inv_dct_dct_32x32_sub8_add_neon:      8808.1   6966.8   7198.0   5690.6
vp9_inv_dct_dct_32x32_sub12_add_neon:    11291.5  10146.7   9628.8   7566.7
vp9_inv_dct_dct_32x32_sub16_add_neon:    12159.2  11004.2  10373.3   8237.7
vp9_inv_dct_dct_32x32_sub20_add_neon:    15230.9  13467.6  11841.1   9748.8
vp9_inv_dct_dct_32x32_sub24_add_neon:    16361.5  14854.5  12677.6  10505.0
vp9_inv_dct_dct_32x32_sub28_add_neon:    17497.8  15833.3  13493.0  11254.0
vp9_inv_dct_dct_32x32_sub32_add_neon:    18591.8  17348.5  14355.5  12001.7

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:       271.5    188.7    211.7    235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:      1209.5    863.9   1034.7    764.7
vp9_inv_dct_dct_16x16_sub8_add_neon:      1915.8   1590.9   1739.0   1281.7
vp9_inv_dct_dct_16x16_sub12_add_neon:     2850.5   2204.3   2292.1   1779.8
vp9_inv_dct_dct_16x16_sub16_add_neon:     3240.1   2490.6   2555.8   2009.9
vp9_inv_dct_dct_32x32_sub1_add_neon:       751.5    458.9    863.5    553.9
vp9_inv_dct_dct_32x32_sub4_add_neon:      7566.3   5721.3   6043.8   4920.7
vp9_inv_dct_dct_32x32_sub8_add_neon:      8366.1   6786.5   6594.1   5476.2
vp9_inv_dct_dct_32x32_sub12_add_neon:    10980.0   9885.5   9237.7   7436.5
vp9_inv_dct_dct_32x32_sub16_add_neon:    11917.3  11156.8   9963.0   8113.0
vp9_inv_dct_dct_32x32_sub20_add_neon:    15201.3  13632.9  11844.2   9819.9
vp9_inv_dct_dct_32x32_sub24_add_neon:    16333.8  14541.2  12654.5  10580.7
vp9_inv_dct_dct_32x32_sub28_add_neon:    17459.1  16165.8  13450.0  11325.3
vp9_inv_dct_dct_32x32_sub32_add_neon:    18612.2  17386.7  14281.6  12065.8

---
This reverts parts of the previous commit (changing some register uses back
to another register); if both are to be applied, they should be squashed
together. (Similarly for review, it is much easier to squash the two and
review the end result.) They are presented sequentially as two steps to show
the effect of each alternative on runtime and code size.
---
 libavcodec/arm/vp9itxfm_neon.S | 532 +++++++++++++++++++++++++++++------------
 1 file changed, 374 insertions(+), 158 deletions(-)

Patch

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 99a5e1f..b6c23c8 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -745,6 +745,42 @@  function iadst16
         bx              lr
 endfunc
 
+.macro load_add_store coef0, coef1, coef2, coef3
+        vrshr.s16       \coef0, \coef0, #6
+        vrshr.s16       \coef1, \coef1, #6
+
+        vld1.32         {d4[]},   [r0,:32], r1
+        vld1.32         {d4[1]},  [r3,:32], r1
+        vrshr.s16       \coef2, \coef2, #6
+        vrshr.s16       \coef3, \coef3, #6
+        vld1.32         {d5[]},   [r0,:32], r1
+        vld1.32         {d5[1]},  [r3,:32], r1
+        vaddw.u8        \coef0, \coef0, d4
+        vld1.32         {d6[]},   [r0,:32], r1
+        vld1.32         {d6[1]},  [r3,:32], r1
+        vaddw.u8        \coef1, \coef1, d5
+        vld1.32         {d7[]},   [r0,:32], r1
+        vld1.32         {d7[1]},  [r3,:32], r1
+
+        vqmovun.s16     d4,  \coef0
+        vqmovun.s16     d5,  \coef1
+        sub             r0,  r0,  r1, lsl #2
+        sub             r3,  r3,  r1, lsl #2
+        vaddw.u8        \coef2, \coef2, d6
+        vaddw.u8        \coef3, \coef3, d7
+        vst1.32         {d4[0]},  [r0,:32], r1
+        vst1.32         {d4[1]},  [r3,:32], r1
+        vqmovun.s16     d6,  \coef2
+        vst1.32         {d5[0]},  [r0,:32], r1
+        vst1.32         {d5[1]},  [r3,:32], r1
+        vqmovun.s16     d7,  \coef3
+
+        vst1.32         {d6[0]},  [r0,:32], r1
+        vst1.32         {d6[1]},  [r3,:32], r1
+        vst1.32         {d7[0]},  [r0,:32], r1
+        vst1.32         {d7[1]},  [r3,:32], r1
+.endm
+
 .macro itxfm16_1d_funcs txfm
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
 @ transpose into a horizontal 16x4 slice and store.
@@ -763,40 +799,13 @@  function \txfm\()16_1d_4x16_pass1_neon
 
         mov             r12, #32
         vmov.s16        q2, #0
-
-.ifc \txfm,idct
-        cmp             r3,  #10
-        ble             3f
-        cmp             r3,  #38
-        ble             4f
-.endif
-
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
 
         bl              \txfm\()16
-.ifc \txfm,idct
-        b               5f
-
-3:
-.irp i, 16, 17, 18, 19
-        vld1.16         {d\i}, [r2,:64]
-        vst1.16         {d4},  [r2,:64], r12
-.endr
-        bl              idct16_quarter
-        b               5f
-
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        vld1.16         {d\i}, [r2,:64]
-        vst1.16         {d4},  [r2,:64], r12
-.endr
-        bl              idct16_half
-.endif
 
-5:
         @ Do four 4x4 transposes. Originally, d16-d31 contain the
         @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
         @ contain the transposed 4x4 blocks.
@@ -855,84 +864,26 @@  endfunc
 @ r0 = dst
 @ r1 = dst stride
 @ r2 = src (temp buffer)
-@ r3 = eob
-@ r9 = slice offset
+@ r3 = slice offset
 function \txfm\()16_1d_4x16_pass2_neon
         push            {lr}
         mov             r12, #32
-.ifc \txfm,idct
-        cmp             r3,  #10
-        ble             3f
-        cmp             r3,  #38
-        ble             4f
-.endif
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
         vld1.16         {d\i}, [r2,:64], r12
 .endr
-        cmp             r9,  #0
+        cmp             r3,  #0
         beq             1f
 .irp i, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64], r12
 .endr
 1:
 
-        bl              \txfm\()16
-.ifc \txfm,idct
-        b               5f
-3:
-.irp i, 16, 17, 18, 19
-        vld1.16         {d\i}, [r2,:64], r12
-.endr
-        bl              idct16_quarter
-        b               5f
-4:
-.irp i, 16, 17, 18, 19, 20, 21, 22, 23
-        vld1.16         {d\i}, [r2,:64], r12
-.endr
-        bl              idct16_half
-.endif
-
-5:
-        add             r8,  r0,  r1
+        add             r3,  r0,  r1
         lsl             r1,  r1,  #1
-.macro load_add_store coef0, coef1, coef2, coef3
-        vrshr.s16       \coef0, \coef0, #6
-        vrshr.s16       \coef1, \coef1, #6
-
-        vld1.32         {d4[]},   [r0,:32], r1
-        vld1.32         {d4[1]},  [r8,:32], r1
-        vrshr.s16       \coef2, \coef2, #6
-        vrshr.s16       \coef3, \coef3, #6
-        vld1.32         {d5[]},   [r0,:32], r1
-        vld1.32         {d5[1]},  [r8,:32], r1
-        vaddw.u8        \coef0, \coef0, d4
-        vld1.32         {d6[]},   [r0,:32], r1
-        vld1.32         {d6[1]},  [r8,:32], r1
-        vaddw.u8        \coef1, \coef1, d5
-        vld1.32         {d7[]},   [r0,:32], r1
-        vld1.32         {d7[1]},  [r8,:32], r1
-
-        vqmovun.s16     d4,  \coef0
-        vqmovun.s16     d5,  \coef1
-        sub             r0,  r0,  r1, lsl #2
-        sub             r8,  r8,  r1, lsl #2
-        vaddw.u8        \coef2, \coef2, d6
-        vaddw.u8        \coef3, \coef3, d7
-        vst1.32         {d4[0]},  [r0,:32], r1
-        vst1.32         {d4[1]},  [r8,:32], r1
-        vqmovun.s16     d6,  \coef2
-        vst1.32         {d5[0]},  [r0,:32], r1
-        vst1.32         {d5[1]},  [r8,:32], r1
-        vqmovun.s16     d7,  \coef3
+        bl              \txfm\()16
 
-        vst1.32         {d6[0]},  [r0,:32], r1
-        vst1.32         {d6[1]},  [r8,:32], r1
-        vst1.32         {d7[0]},  [r0,:32], r1
-        vst1.32         {d7[1]},  [r8,:32], r1
-.endm
         load_add_store  q8,  q9,  q10, q11
         load_add_store  q12, q13, q14, q15
-.purgem load_add_store
 
         pop             {pc}
 endfunc
@@ -951,12 +902,15 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             r3,  #1
         beq             idct16x16_dc_add_neon
+        cmp             r3,  #10
+        ble             idct16x16_quarter_add_neon
+        cmp             r3,  #38
+        ble             idct16x16_half_add_neon
 .endif
         push            {r4-r9,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpush           {q4-q7}
         mov             r9,  #0
-        mov             r3,  #256
 .else
         movrel          r8,  min_eob_idct_idct_16
 .endif
@@ -994,7 +948,7 @@  A       and             r7,  sp,  #15
         add             r0,  r4,  #(\i)
         mov             r1,  r5
         add             r2,  sp,  #(\i*2)
-        mov             r9,  #\i
+        mov             r3,  #\i
         bl              \txfm2\()16_1d_4x16_pass2_neon
 .endr
 
@@ -1012,6 +966,211 @@  itxfm_func16x16 idct,  iadst
 itxfm_func16x16 iadst, iadst
 .ltorg
 
+function idct16_1d_4x16_pass1_quarter_neon
+        push            {lr}
+        mov             r12, #32
+        vmov.s16        q2, #0
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        bl              idct16_quarter
+
+        @ Do four 4x4 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+        @ contain the transposed 4x4 blocks.
+        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the transposed 4x4 blocks horizontally.
+        @ The first 4x4 block is kept in registers for the second pass,
+        @ store the rest in the temp buffer.
+        add             r0,  r0,  #8
+.irp i, 20, 24, 28
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 21, 25, 29
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 22, 26, 30
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        pop             {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_quarter_neon
+        push            {lr}
+        cmp             r3,  #0
+        mov             r12, #32
+        beq             1f
+        @ Only load the top 4 lines, and only do it for the later slices.
+        @ For the first slice, d16-d19 is kept in registers from the first pass.
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+1:
+
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+        bl              idct16_quarter
+
+        load_add_store  q8,  q9,  q10, q11
+        load_add_store  q12, q13, q14, q15
+
+        pop             {pc}
+endfunc
+
+function idct16_1d_4x16_pass1_half_neon
+        push            {lr}
+        mov             r12, #32
+        vmov.s16        q2, #0
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+
+        bl              idct16_half
+
+        @ Do four 4x4 transposes. Originally, d16-d31 contain the
+        @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
+        @ contain the transposed 4x4 blocks.
+        transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
+
+        @ Store the transposed 4x4 blocks horizontally.
+        cmp             r1,  #4
+        beq             1f
+.irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        pop             {pc}
+1:
+        @ Special case: For the second input column (r1 == 4),
+        @ which would be stored as the second row in the temp buffer,
+        @ don't store the first 4x4 block, but keep it in registers
+        @ for the first slice of the second pass (where it is the
+        @ second 4x4 block).
+        add             r0,  r0,  #8
+.irp i, 20, 24, 28
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 21, 25, 29
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 22, 26, 30
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        add             r0,  r0,  #8
+.irp i, 23, 27, 31
+        vst1.16         {d\i}, [r0,:64]!
+.endr
+        vmov            d20, d16
+        vmov            d21, d17
+        vmov            d22, d18
+        vmov            d23, d19
+        pop             {pc}
+endfunc
+
+function idct16_1d_4x16_pass2_half_neon
+        push            {lr}
+        mov             r12, #32
+        cmp             r3,  #0
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        beq             1f
+.irp i, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+1:
+
+        add             r3,  r0,  r1
+        lsl             r1,  r1,  #1
+        bl              idct16_half
+
+        load_add_store  q8,  q9,  q10, q11
+        load_add_store  q12, q13, q14, q15
+
+        pop             {pc}
+endfunc
+.purgem load_add_store
+
+function idct16x16_quarter_add_neon
+        push            {r4-r9,lr}
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #512
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+.irp i, 0
+        add             r0,  sp,  #(\i*32)
+        mov             r1,  #\i
+        add             r2,  r6,  #(\i*2)
+        bl              idct16_1d_4x16_pass1_quarter_neon
+.endr
+.irp i, 0, 4, 8, 12
+        add             r0,  r4,  #(\i)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        mov             r3,  #\i
+        bl              idct16_1d_4x16_pass2_quarter_neon
+.endr
+
+        add             sp,  sp,  r7
+        pop             {r4-r9,pc}
+endfunc
+
+function idct16x16_half_add_neon
+        push            {r4-r9,lr}
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #512
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+        movrel          r12, idct_coeffs
+        vld1.16         {q0-q1}, [r12,:128]
+
+.irp i, 0, 4
+        add             r0,  sp,  #(\i*32)
+        mov             r1,  #\i
+        add             r2,  r6,  #(\i*2)
+        bl              idct16_1d_4x16_pass1_half_neon
+.endr
+.irp i, 0, 4, 8, 12
+        add             r0,  r4,  #(\i)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        mov             r3,  #\i
+        bl              idct16_1d_4x16_pass2_half_neon
+.endr
+
+        add             sp,  sp,  r7
+        pop             {r4-r9,pc}
+endfunc
 
 function idct32x32_dc_add_neon
         movrel          r12, idct_coeffs
@@ -1198,6 +1357,7 @@  function idct32_odd_quarter
         idct32_end
 endfunc
 
+.macro idct32_funcs suffix
 @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
 @ We don't have register space to do a single pass IDCT of 4x32 though,
 @ but the 32-point IDCT can be decomposed into two 16-point IDCTs;
@@ -1208,7 +1368,7 @@  endfunc
 @ r1 = min eob
 @ r2 = src
 @ r3 = eob
-function idct32_1d_4x32_pass1_neon
+function idct32_1d_4x32_pass1\suffix\()_neon
         @ Check if this whole input slice is zero
         cmp             r3,  r1
         ble             1f
@@ -1221,37 +1381,28 @@  function idct32_1d_4x32_pass1_neon
         mov             r12, #128
         vmov.s16        d4, #0
 
-        cmp             r3,  #34
-        ble             3f
-        cmp             r3,  #135
-        ble             4f
-
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
-
-        bl              idct16
-        sub             r2,  r2,  r12, lsl #4
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
-        bl              idct16_quarter
-        sub             r2,  r2,  r12, lsl #2
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
-        bl              idct16_half
-        sub             r2,  r2,  r12, lsl #3
+.endif
+
+        bl              idct16\suffix
 
-5:
         @ Do four 4x4 transposes. Originally, d16-d31 contain the
         @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
         @ contain the transposed 4x4 blocks.
@@ -1274,39 +1425,42 @@  function idct32_1d_4x32_pass1_neon
         sub             r0,  r0,  #256
 .purgem store_rev
 
-        @ Move r2 to the first odd row
+        @ Move r2 back to the start of the input, and move
+        @ to the first odd row
+.ifb \suffix
+        sub             r2,  r2,  r12, lsl #4
+.endif
+.ifc \suffix,_quarter
+        sub             r2,  r2,  r12, lsl #2
+.endif
+.ifc \suffix,_half
+        sub             r2,  r2,  r12, lsl #3
+.endif
         add             r2,  r2,  #64
 
         vmov.s16        d4, #0
-
-        cmp             r3,  #34
-        ble             3f
-        cmp             r3,  #135
-        ble             4f
-
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
-
-        bl              idct32_odd
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
-        bl              idct32_odd_quarter
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
-        bl              idct32_odd_half
+.endif
+
+        bl              idct32_odd\suffix
 
-5:
         transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
 
         @ Store the registers a, b, c, d horizontally,
@@ -1350,42 +1504,34 @@  endfunc
 @ r0 = dst
 @ r1 = dst stride
 @ r2 = src (temp buffer)
-function idct32_1d_4x32_pass2_neon
+function idct32_1d_4x32_pass2\suffix\()_neon
         push            {lr}
         movrel          r12, idct_coeffs
         vld1.16         {q0-q1}, [r12,:128]
 
         mov             r12, #128
-
-        cmp             r3,  #34
-        ble             3f
-        cmp             r3,  #135
-        ble             4f
-
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #4
-
-        bl              idct16
-        b               5f
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #2
-        bl              idct16_quarter
-        b               5f
-
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #3
-        bl              idct16_half
+.endif
+
+        bl              idct16\suffix
 
-5:
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vst1.16         {d\i}, [r2,:64], r12
 .endr
@@ -1393,36 +1539,29 @@  function idct32_1d_4x32_pass2_neon
         sub             r2,  r2,  r12, lsl #4
         add             r2,  r2,  #64
 
-        cmp             r3,  #34
-        ble             3f
-        cmp             r3,  #135
-        ble             4f
-
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
+.ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #4
-
-        bl              idct32_odd
-        b               5f
-
-3:
+.endif
+.ifc \suffix,_quarter
 .irp i, 16, 17, 18, 19
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #2
-        bl              idct32_odd_quarter
-        b               5f
-4:
+.endif
+.ifc \suffix,_half
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #3
-        bl              idct32_odd_half
-
-5:
+.endif
         sub             r2,  r2,  #64
+
+        bl              idct32_odd\suffix
+
         mov             r12, #128
 .macro load_acc_store a, b, c, d, neg=0
         vld1.16         {d4},  [r2,:64], r12
@@ -1471,6 +1610,11 @@  function idct32_1d_4x32_pass2_neon
 .purgem load_acc_store
         pop             {pc}
 endfunc
+.endm
+
+idct32_funcs
+idct32_funcs _quarter
+idct32_funcs _half
 
 const min_eob_idct_idct_32, align=4
         .short  0, 9, 34, 70, 135, 240, 336, 448
@@ -1479,6 +1623,10 @@  endconst
 function ff_vp9_idct_idct_32x32_add_neon, export=1
         cmp             r3,  #1
         beq             idct32x32_dc_add_neon
+        cmp             r3,  #34
+        ble             idct32x32_quarter_add_neon
+        cmp             r3,  #135
+        ble             idct32x32_half_add_neon
         push            {r4-r8,lr}
         vpush           {q4-q7}
         movrel          r8,  min_eob_idct_idct_32
@@ -1511,3 +1659,71 @@  A       and             r7,  sp,  #15
         vpop            {q4-q7}
         pop             {r4-r8,pc}
 endfunc
+
+function idct32x32_quarter_add_neon
+        push            {r4-r8,lr}
+        vpush           {q4-q7}
+        movrel          r8,  min_eob_idct_idct_32
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #2048
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+.irp i, 0, 4
+        add             r0,  sp,  #(\i*64)
+        ldrh            r1,  [r8, #(\i/2)]
+        add             r2,  r6,  #(\i*2)
+        bl              idct32_1d_4x32_pass1_quarter_neon
+.endr
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r0,  r4,  #(\i)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        bl              idct32_1d_4x32_pass2_quarter_neon
+.endr
+
+        add             sp,  sp,  r7
+        vpop            {q4-q7}
+        pop             {r4-r8,pc}
+endfunc
+
+function idct32x32_half_add_neon
+        push            {r4-r8,lr}
+        vpush           {q4-q7}
+        movrel          r8,  min_eob_idct_idct_32
+
+        @ Align the stack, allocate a temp buffer
+T       mov             r7,  sp
+T       and             r7,  r7,  #15
+A       and             r7,  sp,  #15
+        add             r7,  r7,  #2048
+        sub             sp,  sp,  r7
+
+        mov             r4,  r0
+        mov             r5,  r1
+        mov             r6,  r2
+
+.irp i, 0, 4, 8, 12
+        add             r0,  sp,  #(\i*64)
+        ldrh            r1,  [r8, #(\i/2)]
+        add             r2,  r6,  #(\i*2)
+        bl              idct32_1d_4x32_pass1_half_neon
+.endr
+.irp i, 0, 4, 8, 12, 16, 20, 24, 28
+        add             r0,  r4,  #(\i)
+        mov             r1,  r5
+        add             r2,  sp,  #(\i*2)
+        bl              idct32_1d_4x32_pass2_half_neon
+.endr
+
+        add             sp,  sp,  r7
+        vpop            {q4-q7}
+        pop             {r4-r8,pc}
+endfunc