[2/5] arm: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible (alternative 1)

Message ID 1480584422-24237-2-git-send-email-martin@martin.st
State Superseded

Commit Message

Martin Storsjö Dec. 1, 2016, 9:26 a.m.
This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/arm/vp9itxfm_neon.o
from 12388 to 15064 bytes.

Before:                              Cortex A7       A8       A9      A53
vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.7    211.9    235.8
vp9_inv_dct_dct_16x16_sub2_add_neon:    2056.7   1521.2   1734.8   1262.0
vp9_inv_dct_dct_16x16_sub4_add_neon:    2060.8   1608.5   1735.7   1262.0
vp9_inv_dct_dct_16x16_sub8_add_neon:    2444.9   1801.6   2007.8   1508.5
vp9_inv_dct_dct_16x16_sub12_add_neon:   2902.1   2116.7   2285.1   1751.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
vp9_inv_dct_dct_32x32_sub1_add_neon:     752.0    456.7    866.0    553.9
vp9_inv_dct_dct_32x32_sub2_add_neon:   11042.7   8127.5   8582.7   6822.8
vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
vp9_inv_dct_dct_32x32_sub8_add_neon:   11908.0   9281.8   9381.9   7562.4
vp9_inv_dct_dct_32x32_sub12_add_neon:  13015.2  10791.1  10220.3   8318.9
vp9_inv_dct_dct_32x32_sub16_add_neon:  14150.3  11886.2  11032.6   9064.8
vp9_inv_dct_dct_32x32_sub20_add_neon:  15165.7  12993.8  11847.0   9816.7
vp9_inv_dct_dct_32x32_sub24_add_neon:  16280.8  15111.2  12658.6  10576.8
vp9_inv_dct_dct_32x32_sub28_add_neon:  17412.6  15549.4  13462.7  11325.6
vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.5    211.5    236.1
vp9_inv_dct_dct_16x16_sub2_add_neon:    1448.2    994.0   1191.3    836.0
vp9_inv_dct_dct_16x16_sub4_add_neon:    1437.0    991.0   1191.6    836.0
vp9_inv_dct_dct_16x16_sub8_add_neon:    2114.5   1757.9   1855.3   1335.3
vp9_inv_dct_dct_16x16_sub12_add_neon:   2862.7   2141.5   2293.3   1772.7
vp9_inv_dct_dct_16x16_sub16_add_neon:   3299.6   2419.1   2552.7   2033.0
vp9_inv_dct_dct_32x32_sub1_add_neon:     753.0    457.5    864.3    554.8
vp9_inv_dct_dct_32x32_sub2_add_neon:    7867.8   5978.6   6594.6   5109.9
vp9_inv_dct_dct_32x32_sub4_add_neon:    7871.0   5772.5   6582.2   5108.5
vp9_inv_dct_dct_32x32_sub8_add_neon:    8694.8   6925.7   7125.7   5671.4
vp9_inv_dct_dct_32x32_sub12_add_neon:  11250.3   9654.7   9557.6   7540.5
vp9_inv_dct_dct_32x32_sub16_add_neon:  12129.5  11061.1  10295.0   8220.7
vp9_inv_dct_dct_32x32_sub20_add_neon:  15218.4  13580.8  11841.3   9739.9
vp9_inv_dct_dct_32x32_sub24_add_neon:  16343.5  15097.0  12629.2  10496.6
vp9_inv_dct_dct_32x32_sub28_add_neon:  17482.2  15516.4  13476.0  11261.0
vp9_inv_dct_dct_32x32_sub32_add_neon:  18586.7  16817.5  14289.3  12019.0

---
If we hadn't made the core transforms standalone functions
in the previous patch, the code size would increase to around 21 KB (which
isn't too bad), but the idct32 pass1/2 functions would bloat up so much
that they would require literal pools within the functions themselves.
---
 libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++---
 1 file changed, 331 insertions(+), 20 deletions(-)
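
As a rough C-level sketch of the idea (illustrative only - the real code is
the NEON assembly below, and the dispatch helper here is made up, although
the thresholds and the idct16_quarter/idct16_half names come from the
patch): each 1D pass now dispatches on the eob value passed in r3 and falls
back to a cheaper transform when only the top-left corner of the
coefficient block can be nonzero. The thresholds shown are the ones used
for the 16x16 passes; the 32x32 passes compare against 34 and 135 in the
same way.

    #include <stdio.h>

    /* Hypothetical helper; only the thresholds and names come from the patch. */
    static const char *idct16_variant(int eob)
    {
        if (eob <= 10)
            return "idct16_quarter"; /* eob <= 10: only the first 4 input rows are loaded */
        if (eob <= 38)
            return "idct16_half";    /* eob <= 38: only the first 8 input rows are loaded */
        return "idct16";             /* full transform */
    }

    int main(void)
    {
        static const int eobs[] = { 1, 10, 38, 256 };
        for (int i = 0; i < 4; i++)
            printf("eob %3d -> %s\n", eobs[i], idct16_variant(eobs[i]));
        return 0;
    }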

Comments

Janne Grunau Feb. 3, 2017, 12:49 p.m. | #1
On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> This increases the code size of libavcodec/arm/vp9itxfm_neon.o
> from 12388 to 15064 bytes.
> 
> Before:                              Cortex A7       A8       A9      A53
> vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.7    211.9    235.8
> vp9_inv_dct_dct_16x16_sub2_add_neon:    2056.7   1521.2   1734.8   1262.0
> vp9_inv_dct_dct_16x16_sub4_add_neon:    2060.8   1608.5   1735.7   1262.0
> vp9_inv_dct_dct_16x16_sub8_add_neon:    2444.9   1801.6   2007.8   1508.5
> vp9_inv_dct_dct_16x16_sub12_add_neon:   2902.1   2116.7   2285.1   1751.7
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
> vp9_inv_dct_dct_32x32_sub1_add_neon:     752.0    456.7    866.0    553.9
> vp9_inv_dct_dct_32x32_sub2_add_neon:   11042.7   8127.5   8582.7   6822.8
> vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
> vp9_inv_dct_dct_32x32_sub8_add_neon:   11908.0   9281.8   9381.9   7562.4
> vp9_inv_dct_dct_32x32_sub12_add_neon:  13015.2  10791.1  10220.3   8318.9
> vp9_inv_dct_dct_32x32_sub16_add_neon:  14150.3  11886.2  11032.6   9064.8
> vp9_inv_dct_dct_32x32_sub20_add_neon:  15165.7  12993.8  11847.0   9816.7
> vp9_inv_dct_dct_32x32_sub24_add_neon:  16280.8  15111.2  12658.6  10576.8
> vp9_inv_dct_dct_32x32_sub28_add_neon:  17412.6  15549.4  13462.7  11325.6
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9
> 
> After:
> vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.5    211.5    236.1
> vp9_inv_dct_dct_16x16_sub2_add_neon:    1448.2    994.0   1191.3    836.0
> vp9_inv_dct_dct_16x16_sub4_add_neon:    1437.0    991.0   1191.6    836.0
> vp9_inv_dct_dct_16x16_sub8_add_neon:    2114.5   1757.9   1855.3   1335.3
> vp9_inv_dct_dct_16x16_sub12_add_neon:   2862.7   2141.5   2293.3   1772.7
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3299.6   2419.1   2552.7   2033.0
> vp9_inv_dct_dct_32x32_sub1_add_neon:     753.0    457.5    864.3    554.8
> vp9_inv_dct_dct_32x32_sub2_add_neon:    7867.8   5978.6   6594.6   5109.9
> vp9_inv_dct_dct_32x32_sub4_add_neon:    7871.0   5772.5   6582.2   5108.5
> vp9_inv_dct_dct_32x32_sub8_add_neon:    8694.8   6925.7   7125.7   5671.4
> vp9_inv_dct_dct_32x32_sub12_add_neon:  11250.3   9654.7   9557.6   7540.5
> vp9_inv_dct_dct_32x32_sub16_add_neon:  12129.5  11061.1  10295.0   8220.7
> vp9_inv_dct_dct_32x32_sub20_add_neon:  15218.4  13580.8  11841.3   9739.9
> vp9_inv_dct_dct_32x32_sub24_add_neon:  16343.5  15097.0  12629.2  10496.6
> vp9_inv_dct_dct_32x32_sub28_add_neon:  17482.2  15516.4  13476.0  11261.0
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18586.7  16817.5  14289.3  12019.0
> 
> ---
> If we wouldn't have made the core transforms standalone functions
> in the previous patch, the code size would increase to around 21 KB (which
> isn't too bad), but the idct32 pass1/2 functions would bloat up so much
> that they would require literal pools within the functions themselves.
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++---
>  1 file changed, 331 insertions(+), 20 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index 22e63e5..bd3f678 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -74,6 +74,14 @@ endconst
>          vrshrn.s32      \out2, \tmpq4, #14
>  .endm
>  
> +@ Same as mbutterfly0 above, but treating the input in in2 as zero,
> +@ writing the same output into both out1 and out2.
> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
> +        vmull.s16       \tmpq3, \in1, d0[0]
> +        vrshrn.s32      \out1,  \tmpq3, #14
> +        vmov            \out2,  \out1

if you haven't already tried it, doing the vrshrn twice could be faster
since it has fewer dependencies (the vmov has to wait for the result of the
first vrshrn)

> +.endm
> +
>  @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
>  @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
>  @ Same as mbutterfly0, but with input being 2 q registers, output
> @@ -137,6 +145,23 @@ endconst
>          vrshrn.s32      \inout2, \tmp2,  #14
>  .endm
>  
> +@ Same as mbutterfly above, but treating the input in inout2 as zero
> +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
> +        vmull.s16       \tmp1,   \inout1, \coef1
> +        vmull.s16       \tmp2,   \inout1, \coef2
> +        vrshrn.s32      \inout1, \tmp1,   #14
> +        vrshrn.s32      \inout2, \tmp2,   #14
> +.endm
> +
> +@ Same as mbutterfly above, but treating the input in inout1 as zero
> +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
> +        vmull.s16       \tmp1,   \inout2, \coef2
> +        vmull.s16       \tmp2,   \inout2, \coef1
> +        vneg.s32        \tmp1,   \tmp1
> +        vrshrn.s32      \inout2, \tmp2,   #14
> +        vrshrn.s32      \inout1, \tmp1,   #14
> +.endm
> +
>  @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
>  @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
>  @ inout are 4 d registers, tmp are 4 q registers
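For reference, in scalar terms the mbutterfly macro and the new _h1 variant
above compute roughly the following. This is only a sketch; the real macros
work on vectors of Q14 fixed-point values:

    #include <assert.h>
    #include <stdint.h>

    static int16_t rnd14(int32_t v) { return (int16_t)((v + (1 << 13)) >> 14); }

    /* mbutterfly: rotate (a, b) by a pair of Q14 coefficients, in place. */
    static void mbutterfly(int16_t *a, int16_t *b, int c1, int c2)
    {
        int32_t t1 = *a * c1 - *b * c2;
        int32_t t2 = *a * c2 + *b * c1;
        *a = rnd14(t1);
        *b = rnd14(t2);
    }

    /* mbutterfly_h1: the same rotation when *b is known to be zero; the
     * subtraction and two of the four multiplications disappear. */
    static void mbutterfly_h1(int16_t *a, int16_t *b, int c1, int c2)
    {
        *b = rnd14(*a * c2);
        *a = rnd14(*a * c1);
    }

    int main(void)
    {
        int16_t a1 = 1234, b1 = 0, a2 = 1234, b2 = 0;
        mbutterfly(&a1, &b1, 11585, 6270);    /* example Q14 coefficients */
        mbutterfly_h1(&a2, &b2, 11585, 6270);
        assert(a1 == a2 && b1 == b2);
        return 0;
    }

The mbutterfly_h2 variant handles the case where the first input is zero
(which needs one extra negation), and mbutterfly0_h exploits that with
in2 == 0 both (in1 + in2) and (in1 - in2) reduce to in1, so the same value
is written to both outputs.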
> @@ -534,7 +559,7 @@ function idct16x16_dc_add_neon
>  endfunc
>  .ltorg
>  
> -function idct16
> +.macro idct16_full
>          mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
>          mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
>          mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
> @@ -556,7 +581,10 @@ function idct16
>          mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
>          mbutterfly      d23, d25, d0[1], d0[2], q9,  q15        @ d23 = t9a,  d25 = t14a
>          mbutterfly      d27, d21, d0[1], d0[2], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
> +        idct16_end
> +.endm
>  
> +.macro idct16_end
>          butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = t7a
>          butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = t6
>          butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = t5
> @@ -581,6 +609,66 @@ function idct16
>          butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = out[11]
>          butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = out[10]
>          bx              lr
> +.endm
> +
> +function idct16
> +        idct16_full
> +endfunc
> +
> +function idct16_half
> +        mbutterfly0_h   d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
> +        mbutterfly_h1   d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
> +        mbutterfly_h1   d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
> +        mbutterfly_h2   d26, d22, d1[1], d1[2], q2,  q3  @ d26 = t5a,  d22 = t6a
> +        mbutterfly_h1   d17, d31, d1[3], d2[0], q2,  q3  @ d17 = t8a,  d31 = t15a
> +        mbutterfly_h2   d25, d23, d2[1], d2[2], q2,  q3  @ d25 = t9a,  d23 = t14a
> +        mbutterfly_h1   d21, d27, d2[3], d3[0], q2,  q3  @ d21 = t10a, d27 = t13a
> +        mbutterfly_h2   d29, d19, d3[1], d3[2], q2,  q3  @ d29 = t11a, d19 = t12a
> +
> +        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
> +        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
> +        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
> +        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
> +        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
> +        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
> +        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
> +        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
> +
> +        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
> +        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15        @ d23 = t9a,  d25 = t14a
> +        mbutterfly      d27, d21, d0[1], d0[2], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
> +        idct16_end
> +endfunc
> +
> +function idct16_quarter
> +        vmull.s16       q12, d19, d3[2]
> +        vmull.s16       q2,  d17, d1[3]
> +        vmull.s16       q3,  d18, d1[0]
> +        vmull.s16       q15, d18, d0[3]
> +        vneg.s32        q12, q12
> +        vmull.s16       q14, d17, d2[0]
> +        vmull.s16       q13, d19, d3[1]
> +        vmull.s16       q11, d16, d0[0]
> +        vrshrn.s32      d24, q12, #14
> +        vrshrn.s32      d16, q2,  #14
> +        vrshrn.s32      d7,  q3,  #14
> +        vrshrn.s32      d6,  q15, #14
> +        vrshrn.s32      d29, q14, #14
> +        vrshrn.s32      d17, q13, #14
> +        vrshrn.s32      d28, q11, #14
> +
> +        mbutterfly_l    q10, q11, d17, d24, d0[1], d0[2]
> +        mbutterfly_l    q9,  q15, d29, d16, d0[1], d0[2]
> +        vneg.s32        q11, q11
> +        vrshrn.s32      d27, q10, #14
> +        vrshrn.s32      d21, q11, #14
> +        vrshrn.s32      d23, q9,  #14
> +        vrshrn.s32      d25, q15, #14
> +        vmov            d4,  d28
> +        vmov            d5,  d28
> +        mbutterfly0     d22, d26, d7,  d6,  d18, d30, q9,  q15
> +        vmov            d20, d28
> +        idct16_end
>  endfunc
>  
>  function iadst16
> @@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
>  
>          mov             r12, #32
>          vmov.s16        q2, #0
> +
> +.ifc \txfm,idct
> +        cmp             r3,  #10
> +        ble             3f
> +        cmp             r3,  #38
> +        ble             4f
> +.endif

I'd test only for less or equal 38 here

> +
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vld1.16         {d\i}, [r2,:64]
>          vst1.16         {d4},  [r2,:64], r12
>  .endr
>  
>          bl              \txfm\()16
> +.ifc \txfm,idct
> +        b               5f

cmp             r3,  #10

> +
> +3:
> +.irp i, 16, 17, 18, 19
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +        bl              idct16_quarter
> +        b               5f

remove this

> +
> +4:
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12

.if \i == 19
blle idct16_half
ble  5f
.endif

Saves a little binary space; not sure if it's worth it.

> +.endr
> +        bl              idct16_half
> +.endif
>  
> +5:
>          @ Do four 4x4 transposes. Originally, d16-d31 contain the
>          @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
>          @ contain the transposed 4x4 blocks.
> @@ -721,58 +836,80 @@ endfunc
>  @ r0 = dst
>  @ r1 = dst stride
>  @ r2 = src (temp buffer)
> -@ r3 = slice offset
> +@ r3 = eob
> +@ r8 = slice offset
>  function \txfm\()16_1d_4x16_pass2_neon
>          push            {lr}
>          mov             r12, #32
> +.ifc \txfm,idct
> +        cmp             r3,  #10
> +        ble             3f
> +        cmp             r3,  #38
> +        ble             4f

same applies here

> +.endif
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
>          vld1.16         {d\i}, [r2,:64], r12
>  .endr
> -        cmp             r3,  #0
> +        cmp             r8,  #0
>          beq             1f
>  .irp i, 28, 29, 30, 31
>          vld1.16         {d\i}, [r2,:64], r12
>  .endr
>  1:
>  
> -        add             r3,  r0,  r1
> -        lsl             r1,  r1,  #1
>          bl              \txfm\()16
> +.ifc \txfm,idct
> +        b               5f
> +3:
> +.irp i, 16, 17, 18, 19
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        bl              idct16_quarter
> +        b               5f
> +4:
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        bl              idct16_half
> +.endif
>  
> +5:
> +        add             r8,  r0,  r1
> +        lsl             r1,  r1,  #1
>  .macro load_add_store coef0, coef1, coef2, coef3
>          vrshr.s16       \coef0, \coef0, #6
>          vrshr.s16       \coef1, \coef1, #6
>  
>          vld1.32         {d4[]},   [r0,:32], r1
> -        vld1.32         {d4[1]},  [r3,:32], r1
> +        vld1.32         {d4[1]},  [r8,:32], r1
>          vrshr.s16       \coef2, \coef2, #6
>          vrshr.s16       \coef3, \coef3, #6
>          vld1.32         {d5[]},   [r0,:32], r1
> -        vld1.32         {d5[1]},  [r3,:32], r1
> +        vld1.32         {d5[1]},  [r8,:32], r1
>          vaddw.u8        \coef0, \coef0, d4
>          vld1.32         {d6[]},   [r0,:32], r1
> -        vld1.32         {d6[1]},  [r3,:32], r1
> +        vld1.32         {d6[1]},  [r8,:32], r1
>          vaddw.u8        \coef1, \coef1, d5
>          vld1.32         {d7[]},   [r0,:32], r1
> -        vld1.32         {d7[1]},  [r3,:32], r1
> +        vld1.32         {d7[1]},  [r8,:32], r1
>  
>          vqmovun.s16     d4,  \coef0
>          vqmovun.s16     d5,  \coef1
>          sub             r0,  r0,  r1, lsl #2
> -        sub             r3,  r3,  r1, lsl #2
> +        sub             r8,  r8,  r1, lsl #2
>          vaddw.u8        \coef2, \coef2, d6
>          vaddw.u8        \coef3, \coef3, d7
>          vst1.32         {d4[0]},  [r0,:32], r1
> -        vst1.32         {d4[1]},  [r3,:32], r1
> +        vst1.32         {d4[1]},  [r8,:32], r1
>          vqmovun.s16     d6,  \coef2
>          vst1.32         {d5[0]},  [r0,:32], r1
> -        vst1.32         {d5[1]},  [r3,:32], r1
> +        vst1.32         {d5[1]},  [r8,:32], r1
>          vqmovun.s16     d7,  \coef3
>  
>          vst1.32         {d6[0]},  [r0,:32], r1
> -        vst1.32         {d6[1]},  [r3,:32], r1
> +        vst1.32         {d6[1]},  [r8,:32], r1
>          vst1.32         {d7[0]},  [r0,:32], r1
> -        vst1.32         {d7[1]},  [r3,:32], r1
> +        vst1.32         {d7[1]},  [r8,:32], r1
>  .endm
>          load_add_store  q8,  q9,  q10, q11
>          load_add_store  q12, q13, q14, q15
> @@ -799,6 +936,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>          push            {r4-r8,lr}
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpush           {q4-q7}
> +        mov             r3,  #256
>  .else
>          movrel          r8,  min_eob_idct_idct_16 + 2
>  .endif
> @@ -859,7 +997,7 @@ A       and             r7,  sp,  #15
>          add             r0,  r4,  #(\i)
>          mov             r1,  r5
>          add             r2,  sp,  #(\i*2)
> -        mov             r3,  #\i
> +        mov             r8,  #\i
>          bl              \txfm2\()16_1d_4x16_pass2_neon
>  .endr
>  
> @@ -913,7 +1051,7 @@ function idct32x32_dc_add_neon
>          bx              lr
>  endfunc
>  
> -function idct32_odd
> +.macro idct32_odd_full
>          movrel          r12, idct_coeffs
>          add             r12, r12, #32
>          vld1.16         {q0-q1}, [r12,:128]
> @@ -943,7 +1081,10 @@ function idct32_odd
>          mbutterfly      d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
>          mbutterfly      d21, d26, d1[1], d1[2], q8, q9        @ d21 = t21a, d26 = t26a
>          mbutterfly      d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
> +        idct32_end
> +.endm
>  
> +.macro idct32_end
>          butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
>          butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
>          butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
> @@ -973,6 +1114,91 @@ function idct32_odd
>          mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
>          mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
>          bx              lr
> +.endm
> +
> +function idct32_odd
> +        idct32_odd_full
> +endfunc
> +
> +function idct32_odd_half
> +        movrel          r12, idct_coeffs
> +        add             r12, r12, #32
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        mbutterfly_h1   d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a
> +        mbutterfly_h2   d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a
> +        mbutterfly_h1   d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a
> +        mbutterfly_h2   d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a
> +        mbutterfly_h1   d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a
> +        mbutterfly_h2   d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a
> +        mbutterfly_h1   d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a
> +        mbutterfly_h2   d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a
> +
> +        sub             r12, r12, #32
> +        vld1.16         {q0}, [r12,:128]
> +
> +        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
> +        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
> +        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
> +        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
> +        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
> +        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
> +        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
> +        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
> +
> +        mbutterfly      d23, d24, d0[3], d1[0], q8, q9        @ d23 = t17a, d24 = t30a
> +        mbutterfly      d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
> +        mbutterfly      d21, d26, d1[1], d1[2], q8, q9        @ d21 = t21a, d26 = t26a
> +        mbutterfly      d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
> +
> +        idct32_end
> +endfunc
> +
> +function idct32_odd_quarter
> +        movrel          r12, idct_coeffs
> +        add             r12, r12, #32
> +        vld1.16         {q0-q1}, [r12,:128]
> +
> +        vmull.s16       q2,  d16, d0[0]
> +        vmull.s16       q14, d19, d1[3]
> +        vmull.s16       q15, d16, d0[1]
> +        vmull.s16       q11, d17, d3[2]
> +        vmull.s16       q3,  d17, d3[3]
> +        vmull.s16       q13, d19, d1[2]
> +        vmull.s16       q10, d18, d2[0]
> +        vmull.s16       q12, d18, d2[1]
> +
> +        sub             r12, r12, #32
> +        vld1.16         {q0}, [r12,:128]
> +
> +        vneg.s32        q14, q14
> +        vneg.s32        q3,  q3
> +
> +        vrshrn.s32      d4,  q2,  #14
> +        vrshrn.s32      d5,  q14, #14
> +        vrshrn.s32      d29, q15, #14
> +        vrshrn.s32      d28, q11, #14
> +        vrshrn.s32      d7,  q3,  #14
> +        vrshrn.s32      d31, q13, #14
> +        vrshrn.s32      d6,  q10, #14
> +        vrshrn.s32      d30, q12, #14
> +
> +        mbutterfly_l    q8,  q9,  d29, d4,  d0[3], d1[0]
> +        mbutterfly_l    q13, q10, d31, d5,  d0[3], d1[0]
> +        vrshrn.s32      d23, q8,  #14
> +        vrshrn.s32      d24, q9,  #14
> +        vneg.s32        q10, q10
> +        vrshrn.s32      d27, q13, #14
> +        vrshrn.s32      d20, q10, #14
> +        mbutterfly_l    q8,  q9,  d30, d6,  d1[1], d1[2]
> +        vrshrn.s32      d21, q8,  #14
> +        vrshrn.s32      d26, q9,  #14
> +        mbutterfly_l    q8,  q9,  d28, d7,  d1[1], d1[2]
> +        vrshrn.s32      d25, q8,  #14
> +        vneg.s32        q9,  q9
> +        vrshrn.s32      d22, q9,  #14
> +
> +        idct32_end
>  endfunc
>  
>  @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
> @@ -994,6 +1220,11 @@ function idct32_1d_4x32_pass1_neon
>          mov             r12, #128
>          vmov.s16        d4, #0
>  
> +        cmp             r3,  #34
> +        ble             3f
> +        cmp             r3,  #135
> +        ble             4f
> +
>          @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vld1.16         {d\i}, [r2,:64]
> @@ -1001,7 +1232,25 @@ function idct32_1d_4x32_pass1_neon
>  .endr
>  
>          bl              idct16
> +        sub             r2,  r2,  r12, lsl #4
> +        b               5f
> +3:
> +.irp i, 16, 17, 18, 19
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +        bl              idct16_quarter
> +        sub             r2,  r2,  r12, lsl #2
> +        b               5f
> +4:
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +        bl              idct16_half
> +        sub             r2,  r2,  r12, lsl #3

The same could be done here, but I'm not sure if it's a good idea.

> +5:
>          @ Do four 4x4 transposes. Originally, d16-d31 contain the
>          @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
>          @ contain the transposed 4x4 blocks.
> @@ -1024,12 +1273,16 @@ function idct32_1d_4x32_pass1_neon
>          sub             r0,  r0,  #256
>  .purgem store_rev
>  
> -        @ Move r2 back to the start of the input, and move
> -        @ to the first odd row
> -        sub             r2,  r2,  r12, lsl #4
> +        @ Move r2 to the first odd row
>          add             r2,  r2,  #64
>  
>          vmov.s16        d4, #0
> +
> +        cmp             r3,  #34
> +        ble             3f
> +        cmp             r3,  #135
> +        ble             4f
> +
>          @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vld1.16         {d\i}, [r2,:64]
> @@ -1037,7 +1290,22 @@ function idct32_1d_4x32_pass1_neon
>  .endr
>  
>          bl              idct32_odd
> +        b               5f
> +3:
> +.irp i, 16, 17, 18, 19
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +        bl              idct32_odd_quarter
> +        b               5f
> +4:
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        vld1.16         {d\i}, [r2,:64]
> +        vst1.16         {d4},  [r2,:64], r12
> +.endr
> +        bl              idct32_odd_half
>  
> +5:
>          transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
>  
>          @ Store the registers a, b, c, d horizontally,
> @@ -1078,6 +1346,12 @@ function idct32_1d_4x32_pass2_neon
>          vld1.16         {q0-q1}, [r12,:128]
>  
>          mov             r12, #128
> +
> +        cmp             r3,  #34
> +        ble             3f
> +        cmp             r3,  #135
> +        ble             4f
> +
>          @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vld1.16         {d\i}, [r2,:64], r12
> @@ -1085,7 +1359,23 @@ function idct32_1d_4x32_pass2_neon
>          sub             r2,  r2,  r12, lsl #4
>  
>          bl              idct16
> +        b               5f
> +3:
> +.irp i, 16, 17, 18, 19
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #2
> +        bl              idct16_quarter
> +        b               5f
> +
> +4:
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #3
> +        bl              idct16_half
>  
> +5:
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vst1.16         {d\i}, [r2,:64], r12
>  .endr
> @@ -1093,15 +1383,36 @@ function idct32_1d_4x32_pass2_neon
>          sub             r2,  r2,  r12, lsl #4
>          add             r2,  r2,  #64
>  
> +        cmp             r3,  #34
> +        ble             3f
> +        cmp             r3,  #135
> +        ble             4f
> +
>          @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>          vld1.16         {d\i}, [r2,:64], r12
>  .endr
>          sub             r2,  r2,  r12, lsl #4
> -        sub             r2,  r2,  #64
>  
>          bl              idct32_odd
> +        b               5f
>  
> +3:
> +.irp i, 16, 17, 18, 19
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #2
> +        bl              idct32_odd_quarter
> +        b               5f
> +4:
> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> +        vld1.16         {d\i}, [r2,:64], r12
> +.endr
> +        sub             r2,  r2,  r12, lsl #3
> +        bl              idct32_odd_half
> +
> +5:
> +        sub             r2,  r2,  #64
>          mov             r12, #128
>  .macro load_acc_store a, b, c, d, neg=0
>          vld1.16         {d4},  [r2,:64], r12

otherwise ok

Janne
Martin Storsjö Feb. 3, 2017, 9:44 p.m. | #2
On Fri, 3 Feb 2017, Janne Grunau wrote:

> On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
>> This work is sponsored by, and copyright, Google.
>> 
>> This increases the code size of libavcodec/arm/vp9itxfm_neon.o
>> from 12388 to 15064 bytes.
>> 
>> Before:                              Cortex A7       A8       A9      A53
>> vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.7    211.9    235.8
>> vp9_inv_dct_dct_16x16_sub2_add_neon:    2056.7   1521.2   1734.8   1262.0
>> vp9_inv_dct_dct_16x16_sub4_add_neon:    2060.8   1608.5   1735.7   1262.0
>> vp9_inv_dct_dct_16x16_sub8_add_neon:    2444.9   1801.6   2007.8   1508.5
>> vp9_inv_dct_dct_16x16_sub12_add_neon:   2902.1   2116.7   2285.1   1751.7
>> vp9_inv_dct_dct_16x16_sub16_add_neon:   3211.2   2443.5   2546.1   1999.5
>> vp9_inv_dct_dct_32x32_sub1_add_neon:     752.0    456.7    866.0    553.9
>> vp9_inv_dct_dct_32x32_sub2_add_neon:   11042.7   8127.5   8582.7   6822.8
>> vp9_inv_dct_dct_32x32_sub4_add_neon:   10682.0   8043.8   8581.3   6810.1
>> vp9_inv_dct_dct_32x32_sub8_add_neon:   11908.0   9281.8   9381.9   7562.4
>> vp9_inv_dct_dct_32x32_sub12_add_neon:  13015.2  10791.1  10220.3   8318.9
>> vp9_inv_dct_dct_32x32_sub16_add_neon:  14150.3  11886.2  11032.6   9064.8
>> vp9_inv_dct_dct_32x32_sub20_add_neon:  15165.7  12993.8  11847.0   9816.7
>> vp9_inv_dct_dct_32x32_sub24_add_neon:  16280.8  15111.2  12658.6  10576.8
>> vp9_inv_dct_dct_32x32_sub28_add_neon:  17412.6  15549.4  13462.7  11325.6
>> vp9_inv_dct_dct_32x32_sub32_add_neon:  18522.4  17277.4  14286.7  12087.9
>> 
>> After:
>> vp9_inv_dct_dct_16x16_sub1_add_neon:     273.0    189.5    211.5    236.1
>> vp9_inv_dct_dct_16x16_sub2_add_neon:    1448.2    994.0   1191.3    836.0
>> vp9_inv_dct_dct_16x16_sub4_add_neon:    1437.0    991.0   1191.6    836.0
>> vp9_inv_dct_dct_16x16_sub8_add_neon:    2114.5   1757.9   1855.3   1335.3
>> vp9_inv_dct_dct_16x16_sub12_add_neon:   2862.7   2141.5   2293.3   1772.7
>> vp9_inv_dct_dct_16x16_sub16_add_neon:   3299.6   2419.1   2552.7   2033.0
>> vp9_inv_dct_dct_32x32_sub1_add_neon:     753.0    457.5    864.3    554.8
>> vp9_inv_dct_dct_32x32_sub2_add_neon:    7867.8   5978.6   6594.6   5109.9
>> vp9_inv_dct_dct_32x32_sub4_add_neon:    7871.0   5772.5   6582.2   5108.5
>> vp9_inv_dct_dct_32x32_sub8_add_neon:    8694.8   6925.7   7125.7   5671.4
>> vp9_inv_dct_dct_32x32_sub12_add_neon:  11250.3   9654.7   9557.6   7540.5
>> vp9_inv_dct_dct_32x32_sub16_add_neon:  12129.5  11061.1  10295.0   8220.7
>> vp9_inv_dct_dct_32x32_sub20_add_neon:  15218.4  13580.8  11841.3   9739.9
>> vp9_inv_dct_dct_32x32_sub24_add_neon:  16343.5  15097.0  12629.2  10496.6
>> vp9_inv_dct_dct_32x32_sub28_add_neon:  17482.2  15516.4  13476.0  11261.0
>> vp9_inv_dct_dct_32x32_sub32_add_neon:  18586.7  16817.5  14289.3  12019.0
>> 
>> ---
>> If we wouldn't have made the core transforms standalone functions
>> in the previous patch, the code size would increase to around 21 KB (which
>> isn't too bad), but the idct32 pass1/2 functions would bloat up so much
>> that they would require literal pools within the functions themselves.
>> ---
>>  libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++---
>>  1 file changed, 331 insertions(+), 20 deletions(-)
>> 
>> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
>> index 22e63e5..bd3f678 100644
>> --- a/libavcodec/arm/vp9itxfm_neon.S
>> +++ b/libavcodec/arm/vp9itxfm_neon.S
>> @@ -74,6 +74,14 @@ endconst
>>          vrshrn.s32      \out2, \tmpq4, #14
>>  .endm
>> 
>> +@ Same as mbutterfly0 above, but treating the input in in2 as zero,
>> +@ writing the same output into both out1 and out2.
>> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
>> +        vmull.s16       \tmpq3, \in1, d0[0]
>> +        vrshrn.s32      \out1,  \tmpq3, #14
>> +        vmov            \out2,  \out1
>
> if you haven't already tried doing the vrshrn twice could be faster 
> since it has less dependencies

Didn't think of that - it does indeed seem to help (both here and in the 
aarch64 version), so applied that.

>> @@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
>>
>>          mov             r12, #32
>>          vmov.s16        q2, #0
>> +
>> +.ifc \txfm,idct
>> +        cmp             r3,  #10
>> +        ble             3f
>> +        cmp             r3,  #38
>> +        ble             4f
>> +.endif
>
> I'd test only for less or equal 38 here
>
>> +
>>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>>          vld1.16         {d\i}, [r2,:64]
>>          vst1.16         {d4},  [r2,:64], r12
>>  .endr
>>
>>          bl              \txfm\()16
>> +.ifc \txfm,idct
>> +        b               5f
>
> cmp             r3,  #10
>
>> +
>> +3:
>> +.irp i, 16, 17, 18, 19
>> +        vld1.16         {d\i}, [r2,:64]
>> +        vst1.16         {d4},  [r2,:64], r12
>> +.endr
>> +        bl              idct16_quarter
>> +        b               5f
>
> remove this
>
>> +
>> +4:
>> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23
>> +        vld1.16         {d\i}, [r2,:64]
>> +        vst1.16         {d4},  [r2,:64], r12
>
> .if \i == 19
> blle idct16_half
> ble  5f
> .endif
>
> saves a little binary space not sure if it's worth it.

Thanks for the reviews!


Hmm, that looks pretty neat.

I folded this change into the aarch64 version (and the rshrn instead of
mov) as well, using a b.gt instead of a conditional bl, like this:

.if \i == 19
         b.gt            4f
         bl              idct16_quarter
         b               5f
4:
.endif

In principle I guess one could interleave the same checks into the full loop
as well, having only one loop, with special-case checks for i == 19 and
i == 23. Then we'd end up with two comparisons instead of one when doing the
full case - not sure if that's preferable or not.
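
In rough C terms (made-up names, not the actual code), that folded
single-loop variant would be something like:

    /* Hypothetical stand-ins for the real NEON routines, just to make the
     * control flow concrete. */
    static void load_row(int i)      { (void)i; }
    static void idct16_quarter(void) {}
    static void idct16_half(void)    {}
    static void idct16(void)         {}

    static void pass1_folded(int eob)
    {
        for (int i = 0; i < 16; i++) {
            load_row(i);
            /* i == 3 and i == 7 correspond to d19 and d23 in the .irp;
             * the full case pays for two extra comparisons here. */
            if (i == 3 && eob <= 10) { idct16_quarter(); goto done; }
            if (i == 7 && eob <= 38) { idct16_half();    goto done; }
        }
        idct16();
    done:
        ; /* the shared transpose/store would follow here */
    }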

The main question though is whether you prefer this or alternative 2.

// Martin
Janne Grunau Feb. 4, 2017, 4:44 p.m. | #3
On 2017-02-03 23:44:51 +0200, Martin Storsjö wrote:
> On Fri, 3 Feb 2017, Janne Grunau wrote:
> 
> >On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
> >>This work is sponsored by, and copyright, Google.
> >>
> 
> >>@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
> >>
> >>         mov             r12, #32
> >>         vmov.s16        q2, #0
> >>+
> >>+.ifc \txfm,idct
> >>+        cmp             r3,  #10
> >>+        ble             3f
> >>+        cmp             r3,  #38
> >>+        ble             4f
> >>+.endif
> >
> >I'd test only for less or equal 38 here
> >
> >>+
> >> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> >>         vld1.16         {d\i}, [r2,:64]
> >>         vst1.16         {d4},  [r2,:64], r12
> >> .endr
> >>
> >>         bl              \txfm\()16
> >>+.ifc \txfm,idct
> >>+        b               5f
> >
> >cmp             r3,  #10
> >
> >>+
> >>+3:
> >>+.irp i, 16, 17, 18, 19
> >>+        vld1.16         {d\i}, [r2,:64]
> >>+        vst1.16         {d4},  [r2,:64], r12
> >>+.endr
> >>+        bl              idct16_quarter
> >>+        b               5f
> >
> >remove this
> >
> >>+
> >>+4:
> >>+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> >>+        vld1.16         {d\i}, [r2,:64]
> >>+        vst1.16         {d4},  [r2,:64], r12
> >
> >.if \i == 19
> >blle idct16_half
> >ble  5f
> >.endif
> >
> >saves a little binary space not sure if it's worth it.
> 
> Hmm, that looks pretty neat.
> 
> I folded in this change into the aarch64 version (and the rshrn instead of
> mov) as well, using a b.gt instead of conditional bl, like this:
> 
> .if \i == 19
>         b.gt            4f
>         bl              idct16_quarter
>         b               5f
> 4:
> .endif
> 
> In principle I guess one could interleave the same in the full loop as well,
> having only one loop, with special case checks for i == 19 and i == 23. Then
> we'd end up with two comparisons instead of one when doing the full case -
> not sure if it's preferrable or not.

I doubt the comparisons are noticeable, so folding it into the main loop 
should be fine.

> The main question though is whether you prefer this or alternative 2.

see my other mail. I have no strong opinion.

Janne
Martin Storsjö Feb. 4, 2017, 10:19 p.m. | #4
On Sat, 4 Feb 2017, Janne Grunau wrote:

> On 2017-02-03 23:44:51 +0200, Martin Storsjö wrote:
>> On Fri, 3 Feb 2017, Janne Grunau wrote:
>> 
>> >On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
>> >>This work is sponsored by, and copyright, Google.
>> >>
>> 
>> >>@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
>> >>
>> >>         mov             r12, #32
>> >>         vmov.s16        q2, #0
>> >>+
>> >>+.ifc \txfm,idct
>> >>+        cmp             r3,  #10
>> >>+        ble             3f
>> >>+        cmp             r3,  #38
>> >>+        ble             4f
>> >>+.endif
>> >
>> >I'd test only for less or equal 38 here
>> >
>> >>+
>> >> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> >>         vld1.16         {d\i}, [r2,:64]
>> >>         vst1.16         {d4},  [r2,:64], r12
>> >> .endr
>> >>
>> >>         bl              \txfm\()16
>> >>+.ifc \txfm,idct
>> >>+        b               5f
>> >
>> >cmp             r3,  #10
>> >
>> >>+
>> >>+3:
>> >>+.irp i, 16, 17, 18, 19
>> >>+        vld1.16         {d\i}, [r2,:64]
>> >>+        vst1.16         {d4},  [r2,:64], r12
>> >>+.endr
>> >>+        bl              idct16_quarter
>> >>+        b               5f
>> >
>> >remove this
>> >
>> >>+
>> >>+4:
>> >>+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
>> >>+        vld1.16         {d\i}, [r2,:64]
>> >>+        vst1.16         {d4},  [r2,:64], r12
>> >
>> >.if \i == 19
>> >blle idct16_half
>> >ble  5f
>> >.endif
>> >
>> >saves a little binary space not sure if it's worth it.
>> 
>> Hmm, that looks pretty neat.
>> 
>> I folded in this change into the aarch64 version (and the rshrn instead of
>> mov) as well, using a b.gt instead of conditional bl, like this:
>> 
>> .if \i == 19
>>         b.gt            4f
>>         bl              idct16_quarter
>>         b               5f
>> 4:
>> .endif
>> 
>> In principle I guess one could interleave the same in the full loop as well,
>> having only one loop, with special case checks for i == 19 and i == 23. Then
>> we'd end up with two comparisons instead of one when doing the full case -
>> not sure if it's preferrable or not.
>
> I doubt the comparisons are noticeable. so folding it into the main loop 
> should be fine.

Hmm, indeed. And in this case, the diff of this alternative turns out 
pretty small and neat actually.

// Martin

Patch

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 22e63e5..bd3f678 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -74,6 +74,14 @@  endconst
         vrshrn.s32      \out2, \tmpq4, #14
 .endm
 
+@ Same as mbutterfly0 above, but treating the input in in2 as zero,
+@ writing the same output into both out1 and out2.
+.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4
+        vmull.s16       \tmpq3, \in1, d0[0]
+        vrshrn.s32      \out1,  \tmpq3, #14
+        vmov            \out2,  \out1
+.endm
+
 @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14
 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14
 @ Same as mbutterfly0, but with input being 2 q registers, output
@@ -137,6 +145,23 @@  endconst
         vrshrn.s32      \inout2, \tmp2,  #14
 .endm
 
+@ Same as mbutterfly above, but treating the input in inout2 as zero
+.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2
+        vmull.s16       \tmp1,   \inout1, \coef1
+        vmull.s16       \tmp2,   \inout1, \coef2
+        vrshrn.s32      \inout1, \tmp1,   #14
+        vrshrn.s32      \inout2, \tmp2,   #14
+.endm
+
+@ Same as mbutterfly above, but treating the input in inout1 as zero
+.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2
+        vmull.s16       \tmp1,   \inout2, \coef2
+        vmull.s16       \tmp2,   \inout2, \coef1
+        vneg.s32        \tmp1,   \tmp1
+        vrshrn.s32      \inout2, \tmp2,   #14
+        vrshrn.s32      \inout1, \tmp1,   #14
+.endm
+
 @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14
 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14
 @ inout are 4 d registers, tmp are 4 q registers
@@ -534,7 +559,7 @@  function idct16x16_dc_add_neon
 endfunc
 .ltorg
 
-function idct16
+.macro idct16_full
         mbutterfly0     d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
         mbutterfly      d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
         mbutterfly      d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
@@ -556,7 +581,10 @@  function idct16
         mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
         mbutterfly      d23, d25, d0[1], d0[2], q9,  q15        @ d23 = t9a,  d25 = t14a
         mbutterfly      d27, d21, d0[1], d0[2], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
+        idct16_end
+.endm
 
+.macro idct16_end
         butterfly       d18, d7,  d4,  d7                @ d18 = t0a,  d7  = t7a
         butterfly       d19, d22, d5,  d22               @ d19 = t1a,  d22 = t6
         butterfly       d4,  d26, d20, d26               @ d4  = t2a,  d26 = t5
@@ -581,6 +609,66 @@  function idct16
         butterfly       d20, d27, d6,  d27               @ d20 = out[4], d27 = out[11]
         butterfly       d21, d26, d26, d4                @ d21 = out[5], d26 = out[10]
         bx              lr
+.endm
+
+function idct16
+        idct16_full
+endfunc
+
+function idct16_half
+        mbutterfly0_h   d16, d24, d16, d24, d4, d6,  q2,  q3 @ d16 = t0a,  d24 = t1a
+        mbutterfly_h1   d20, d28, d0[1], d0[2], q2,  q3  @ d20 = t2a,  d28 = t3a
+        mbutterfly_h1   d18, d30, d0[3], d1[0], q2,  q3  @ d18 = t4a,  d30 = t7a
+        mbutterfly_h2   d26, d22, d1[1], d1[2], q2,  q3  @ d26 = t5a,  d22 = t6a
+        mbutterfly_h1   d17, d31, d1[3], d2[0], q2,  q3  @ d17 = t8a,  d31 = t15a
+        mbutterfly_h2   d25, d23, d2[1], d2[2], q2,  q3  @ d25 = t9a,  d23 = t14a
+        mbutterfly_h1   d21, d27, d2[3], d3[0], q2,  q3  @ d21 = t10a, d27 = t13a
+        mbutterfly_h2   d29, d19, d3[1], d3[2], q2,  q3  @ d29 = t11a, d19 = t12a
+
+        butterfly       d4,  d28, d16, d28               @ d4  = t0,   d28 = t3
+        butterfly       d5,  d20, d24, d20               @ d5  = t1,   d20 = t2
+        butterfly       d6,  d26, d18, d26               @ d6  = t4,   d26 = t5
+        butterfly       d7,  d22, d30, d22               @ d7  = t7,   d22 = t6
+        butterfly       d16, d25, d17, d25               @ d16 = t8,   d25 = t9
+        butterfly       d24, d21, d29, d21               @ d24 = t11,  d21 = t10
+        butterfly       d17, d27, d19, d27               @ d17 = t12,  d27 = t13
+        butterfly       d29, d23, d31, d23               @ d29 = t15,  d23 = t14
+
+        mbutterfly0     d22, d26, d22, d26, d18, d30, q9,  q15  @ d22 = t6a, d26 = t5a
+        mbutterfly      d23, d25, d0[1], d0[2], q9,  q15        @ d23 = t9a,  d25 = t14a
+        mbutterfly      d27, d21, d0[1], d0[2], q9,  q15, neg=1 @ d27 = t13a, d21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
+        vmull.s16       q12, d19, d3[2]
+        vmull.s16       q2,  d17, d1[3]
+        vmull.s16       q3,  d18, d1[0]
+        vmull.s16       q15, d18, d0[3]
+        vneg.s32        q12, q12
+        vmull.s16       q14, d17, d2[0]
+        vmull.s16       q13, d19, d3[1]
+        vmull.s16       q11, d16, d0[0]
+        vrshrn.s32      d24, q12, #14
+        vrshrn.s32      d16, q2,  #14
+        vrshrn.s32      d7,  q3,  #14
+        vrshrn.s32      d6,  q15, #14
+        vrshrn.s32      d29, q14, #14
+        vrshrn.s32      d17, q13, #14
+        vrshrn.s32      d28, q11, #14
+
+        mbutterfly_l    q10, q11, d17, d24, d0[1], d0[2]
+        mbutterfly_l    q9,  q15, d29, d16, d0[1], d0[2]
+        vneg.s32        q11, q11
+        vrshrn.s32      d27, q10, #14
+        vrshrn.s32      d21, q11, #14
+        vrshrn.s32      d23, q9,  #14
+        vrshrn.s32      d25, q15, #14
+        vmov            d4,  d28
+        vmov            d5,  d28
+        mbutterfly0     d22, d26, d7,  d6,  d18, d30, q9,  q15
+        vmov            d20, d28
+        idct16_end
 endfunc
 
 function iadst16
@@ -668,13 +756,40 @@  function \txfm\()16_1d_4x16_pass1_neon
 
         mov             r12, #32
         vmov.s16        q2, #0
+
+.ifc \txfm,idct
+        cmp             r3,  #10
+        ble             3f
+        cmp             r3,  #38
+        ble             4f
+.endif
+
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
 .endr
 
         bl              \txfm\()16
+.ifc \txfm,idct
+        b               5f
+
+3:
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+        bl              idct16_quarter
+        b               5f
+
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+        bl              idct16_half
+.endif
 
+5:
         @ Do four 4x4 transposes. Originally, d16-d31 contain the
         @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
         @ contain the transposed 4x4 blocks.
@@ -721,58 +836,80 @@  endfunc
 @ r0 = dst
 @ r1 = dst stride
 @ r2 = src (temp buffer)
-@ r3 = slice offset
+@ r3 = eob
+@ r8 = slice offset
 function \txfm\()16_1d_4x16_pass2_neon
         push            {lr}
         mov             r12, #32
+.ifc \txfm,idct
+        cmp             r3,  #10
+        ble             3f
+        cmp             r3,  #38
+        ble             4f
+.endif
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27
         vld1.16         {d\i}, [r2,:64], r12
 .endr
-        cmp             r3,  #0
+        cmp             r8,  #0
         beq             1f
 .irp i, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64], r12
 .endr
 1:
 
-        add             r3,  r0,  r1
-        lsl             r1,  r1,  #1
         bl              \txfm\()16
+.ifc \txfm,idct
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        bl              idct16_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        bl              idct16_half
+.endif
 
+5:
+        add             r8,  r0,  r1
+        lsl             r1,  r1,  #1
 .macro load_add_store coef0, coef1, coef2, coef3
         vrshr.s16       \coef0, \coef0, #6
         vrshr.s16       \coef1, \coef1, #6
 
         vld1.32         {d4[]},   [r0,:32], r1
-        vld1.32         {d4[1]},  [r3,:32], r1
+        vld1.32         {d4[1]},  [r8,:32], r1
         vrshr.s16       \coef2, \coef2, #6
         vrshr.s16       \coef3, \coef3, #6
         vld1.32         {d5[]},   [r0,:32], r1
-        vld1.32         {d5[1]},  [r3,:32], r1
+        vld1.32         {d5[1]},  [r8,:32], r1
         vaddw.u8        \coef0, \coef0, d4
         vld1.32         {d6[]},   [r0,:32], r1
-        vld1.32         {d6[1]},  [r3,:32], r1
+        vld1.32         {d6[1]},  [r8,:32], r1
         vaddw.u8        \coef1, \coef1, d5
         vld1.32         {d7[]},   [r0,:32], r1
-        vld1.32         {d7[1]},  [r3,:32], r1
+        vld1.32         {d7[1]},  [r8,:32], r1
 
         vqmovun.s16     d4,  \coef0
         vqmovun.s16     d5,  \coef1
         sub             r0,  r0,  r1, lsl #2
-        sub             r3,  r3,  r1, lsl #2
+        sub             r8,  r8,  r1, lsl #2
         vaddw.u8        \coef2, \coef2, d6
         vaddw.u8        \coef3, \coef3, d7
         vst1.32         {d4[0]},  [r0,:32], r1
-        vst1.32         {d4[1]},  [r3,:32], r1
+        vst1.32         {d4[1]},  [r8,:32], r1
         vqmovun.s16     d6,  \coef2
         vst1.32         {d5[0]},  [r0,:32], r1
-        vst1.32         {d5[1]},  [r3,:32], r1
+        vst1.32         {d5[1]},  [r8,:32], r1
         vqmovun.s16     d7,  \coef3
 
         vst1.32         {d6[0]},  [r0,:32], r1
-        vst1.32         {d6[1]},  [r3,:32], r1
+        vst1.32         {d6[1]},  [r8,:32], r1
         vst1.32         {d7[0]},  [r0,:32], r1
-        vst1.32         {d7[1]},  [r3,:32], r1
+        vst1.32         {d7[1]},  [r8,:32], r1
 .endm
         load_add_store  q8,  q9,  q10, q11
         load_add_store  q12, q13, q14, q15
@@ -799,6 +936,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         push            {r4-r8,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpush           {q4-q7}
+        mov             r3,  #256
 .else
         movrel          r8,  min_eob_idct_idct_16 + 2
 .endif
@@ -859,7 +997,7 @@  A       and             r7,  sp,  #15
         add             r0,  r4,  #(\i)
         mov             r1,  r5
         add             r2,  sp,  #(\i*2)
-        mov             r3,  #\i
+        mov             r8,  #\i
         bl              \txfm2\()16_1d_4x16_pass2_neon
 .endr
 
@@ -913,7 +1051,7 @@  function idct32x32_dc_add_neon
         bx              lr
 endfunc
 
-function idct32_odd
+.macro idct32_odd_full
         movrel          r12, idct_coeffs
         add             r12, r12, #32
         vld1.16         {q0-q1}, [r12,:128]
@@ -943,7 +1081,10 @@  function idct32_odd
         mbutterfly      d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
         mbutterfly      d21, d26, d1[1], d1[2], q8, q9        @ d21 = t21a, d26 = t26a
         mbutterfly      d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+        idct32_end
+.endm
 
+.macro idct32_end
         butterfly       d16, d5,  d4,  d5  @ d16 = t16a, d5  = t19a
         butterfly       d17, d20, d23, d20 @ d17 = t17,  d20 = t18
         butterfly       d18, d6,  d7,  d6  @ d18 = t23a, d6  = t20a
@@ -973,6 +1114,91 @@  function idct32_odd
         mbutterfly0     d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25,  d22 = t22
         mbutterfly0     d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a
         bx              lr
+.endm
+
+function idct32_odd
+        idct32_odd_full
+endfunc
+
+function idct32_odd_half
+        movrel          r12, idct_coeffs
+        add             r12, r12, #32
+        vld1.16         {q0-q1}, [r12,:128]
+
+        mbutterfly_h1   d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a
+        mbutterfly_h2   d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a
+        mbutterfly_h1   d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a
+        mbutterfly_h2   d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a
+        mbutterfly_h1   d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a
+        mbutterfly_h2   d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a
+        mbutterfly_h1   d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a
+        mbutterfly_h2   d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a
+
+        sub             r12, r12, #32
+        vld1.16         {q0}, [r12,:128]
+
+        butterfly       d4,  d24, d16, d24 @ d4  = t16, d24 = t17
+        butterfly       d5,  d20, d28, d20 @ d5  = t19, d20 = t18
+        butterfly       d6,  d26, d18, d26 @ d6  = t20, d26 = t21
+        butterfly       d7,  d22, d30, d22 @ d7  = t23, d22 = t22
+        butterfly       d28, d25, d17, d25 @ d28 = t24, d25 = t25
+        butterfly       d30, d21, d29, d21 @ d30 = t27, d21 = t26
+        butterfly       d29, d23, d31, d23 @ d29 = t31, d23 = t30
+        butterfly       d31, d27, d19, d27 @ d31 = t28, d27 = t29
+
+        mbutterfly      d23, d24, d0[3], d1[0], q8, q9        @ d23 = t17a, d24 = t30a
+        mbutterfly      d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a
+        mbutterfly      d21, d26, d1[1], d1[2], q8, q9        @ d21 = t21a, d26 = t26a
+        mbutterfly      d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a
+
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        movrel          r12, idct_coeffs
+        add             r12, r12, #32
+        vld1.16         {q0-q1}, [r12,:128]
+
+        vmull.s16       q2,  d16, d0[0]
+        vmull.s16       q14, d19, d1[3]
+        vmull.s16       q15, d16, d0[1]
+        vmull.s16       q11, d17, d3[2]
+        vmull.s16       q3,  d17, d3[3]
+        vmull.s16       q13, d19, d1[2]
+        vmull.s16       q10, d18, d2[0]
+        vmull.s16       q12, d18, d2[1]
+
+        sub             r12, r12, #32
+        vld1.16         {q0}, [r12,:128]
+
+        vneg.s32        q14, q14
+        vneg.s32        q3,  q3
+
+        vrshrn.s32      d4,  q2,  #14
+        vrshrn.s32      d5,  q14, #14
+        vrshrn.s32      d29, q15, #14
+        vrshrn.s32      d28, q11, #14
+        vrshrn.s32      d7,  q3,  #14
+        vrshrn.s32      d31, q13, #14
+        vrshrn.s32      d6,  q10, #14
+        vrshrn.s32      d30, q12, #14
+
+        mbutterfly_l    q8,  q9,  d29, d4,  d0[3], d1[0]
+        mbutterfly_l    q13, q10, d31, d5,  d0[3], d1[0]
+        vrshrn.s32      d23, q8,  #14
+        vrshrn.s32      d24, q9,  #14
+        vneg.s32        q10, q10
+        vrshrn.s32      d27, q13, #14
+        vrshrn.s32      d20, q10, #14
+        mbutterfly_l    q8,  q9,  d30, d6,  d1[1], d1[2]
+        vrshrn.s32      d21, q8,  #14
+        vrshrn.s32      d26, q9,  #14
+        mbutterfly_l    q8,  q9,  d28, d7,  d1[1], d1[2]
+        vrshrn.s32      d25, q8,  #14
+        vneg.s32        q9,  q9
+        vrshrn.s32      d22, q9,  #14
+
+        idct32_end
 endfunc
 
 @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix.
@@ -994,6 +1220,11 @@  function idct32_1d_4x32_pass1_neon
         mov             r12, #128
         vmov.s16        d4, #0
 
+        cmp             r3,  #34
+        ble             3f
+        cmp             r3,  #135
+        ble             4f
+
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
@@ -1001,7 +1232,25 @@  function idct32_1d_4x32_pass1_neon
 .endr
 
         bl              idct16
+        sub             r2,  r2,  r12, lsl #4
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+        bl              idct16_quarter
+        sub             r2,  r2,  r12, lsl #2
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+        bl              idct16_half
+        sub             r2,  r2,  r12, lsl #3
 
+5:
         @ Do four 4x4 transposes. Originally, d16-d31 contain the
         @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31
         @ contain the transposed 4x4 blocks.
@@ -1024,12 +1273,16 @@  function idct32_1d_4x32_pass1_neon
         sub             r0,  r0,  #256
 .purgem store_rev
 
-        @ Move r2 back to the start of the input, and move
-        @ to the first odd row
-        sub             r2,  r2,  r12, lsl #4
+        @ Move r2 to the first odd row
         add             r2,  r2,  #64
 
         vmov.s16        d4, #0
+
+        cmp             r3,  #34
+        ble             3f
+        cmp             r3,  #135
+        ble             4f
+
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
@@ -1037,7 +1290,22 @@  function idct32_1d_4x32_pass1_neon
 .endr
 
         bl              idct32_odd
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+        bl              idct32_odd_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64]
+        vst1.16         {d4},  [r2,:64], r12
+.endr
+        bl              idct32_odd_half
 
+5:
         transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9,  q8,  d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16
 
         @ Store the registers a, b, c, d horizontally,
@@ -1078,6 +1346,12 @@  function idct32_1d_4x32_pass2_neon
         vld1.16         {q0-q1}, [r12,:128]
 
         mov             r12, #128
+
+        cmp             r3,  #34
+        ble             3f
+        cmp             r3,  #135
+        ble             4f
+
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64], r12
@@ -1085,7 +1359,23 @@  function idct32_1d_4x32_pass2_neon
         sub             r2,  r2,  r12, lsl #4
 
         bl              idct16
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #2
+        bl              idct16_quarter
+        b               5f
+
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #3
+        bl              idct16_half
 
+5:
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vst1.16         {d\i}, [r2,:64], r12
 .endr
@@ -1093,15 +1383,36 @@  function idct32_1d_4x32_pass2_neon
         sub             r2,  r2,  r12, lsl #4
         add             r2,  r2,  #64
 
+        cmp             r3,  #34
+        ble             3f
+        cmp             r3,  #135
+        ble             4f
+
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64], r12
 .endr
         sub             r2,  r2,  r12, lsl #4
-        sub             r2,  r2,  #64
 
         bl              idct32_odd
+        b               5f
 
+3:
+.irp i, 16, 17, 18, 19
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #2
+        bl              idct32_odd_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        vld1.16         {d\i}, [r2,:64], r12
+.endr
+        sub             r2,  r2,  r12, lsl #3
+        bl              idct32_odd_half
+
+5:
+        sub             r2,  r2,  #64
         mov             r12, #128
 .macro load_acc_store a, b, c, d, neg=0
         vld1.16         {d4},  [r2,:64], r12