[10/11] aarch64: vp9itxfm: Do a simpler half/quarter idct16/idct32 when possible

Message ID 1479906058-22747-10-git-send-email-martin@martin.st
State Superseded
Headers show

Commit Message

Martin Storsjö Nov. 23, 2016, 1 p.m.
This work is sponsored by, and copyright, Google.

This increases the code size of libavcodec/aarch64/vp9itxfm_neon.o
from 14784 to 18548 bytes.

Before:
vp9_inv_dct_dct_16x16_sub1_add_neon:       236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:      1065.0
vp9_inv_dct_dct_16x16_sub8_add_neon:      1065.0
vp9_inv_dct_dct_16x16_sub12_add_neon:     1390.5
vp9_inv_dct_dct_16x16_sub16_add_neon:     1390.3
vp9_inv_dct_dct_32x32_sub1_add_neon:       556.5
vp9_inv_dct_dct_32x32_sub4_add_neon:      5203.8
vp9_inv_dct_dct_32x32_sub8_add_neon:      5199.8
vp9_inv_dct_dct_32x32_sub12_add_neon:     6172.3
vp9_inv_dct_dct_32x32_sub16_add_neon:     6176.1
vp9_inv_dct_dct_32x32_sub20_add_neon:     7144.5
vp9_inv_dct_dct_32x32_sub24_add_neon:     7143.7
vp9_inv_dct_dct_32x32_sub28_add_neon:     8114.2
vp9_inv_dct_dct_32x32_sub32_add_neon:     8112.0

After:
vp9_inv_dct_dct_16x16_sub1_add_neon:       236.7
vp9_inv_dct_dct_16x16_sub4_add_neon:       714.2
vp9_inv_dct_dct_16x16_sub8_add_neon:       926.8
vp9_inv_dct_dct_16x16_sub12_add_neon:     1402.3
vp9_inv_dct_dct_16x16_sub16_add_neon:     1405.9
vp9_inv_dct_dct_32x32_sub1_add_neon:       554.1
vp9_inv_dct_dct_32x32_sub4_add_neon:      3958.8
vp9_inv_dct_dct_32x32_sub8_add_neon:      3958.8
vp9_inv_dct_dct_32x32_sub12_add_neon:     5461.1
vp9_inv_dct_dct_32x32_sub16_add_neon:     5467.4
vp9_inv_dct_dct_32x32_sub20_add_neon:     7175.4
vp9_inv_dct_dct_32x32_sub24_add_neon:     7172.5
vp9_inv_dct_dct_32x32_sub28_add_neon:     8136.8
vp9_inv_dct_dct_32x32_sub32_add_neon:     8135.9

That is, in general this adds only a very minor overhead for the full
subpartition case (due to the additional cmp instructions), but gives a
significant speedup for the cases where we only need to process a small
part of the actual input data.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 369 +++++++++++++++++++++++++++++++++++--
 1 file changed, 349 insertions(+), 20 deletions(-)

Patch

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index ef40a5a..d74245f 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -75,6 +75,16 @@  endconst
 .endif
 .endm
 
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+        smull           \tmp1\().4s,  \in1\().4h,  v0.h[0]
+        smull2          \tmp2\().4s,  \in1\().8h,  v0.h[0]
+        rshrn           \out1\().4h,  \tmp1\().4s, #14
+        rshrn2          \out1\().8h,  \tmp2\().4s, #14
+        mov             \out2\().16b, \out1\().16b
+.endm
+
 // out1,out2 = in1 * coef1 - in2 * coef2
 // out3,out4 = in1 * coef2 + in2 * coef1
 // out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@  endconst
         rshrn2          \inout2\().8h, \tmp4\().4s,  #14
 .endm
 
+// Same as dmbutterfly above, but treating the input in inout2 as zero
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout1\().4h, \coef1
+        smull2          \tmp2\().4s, \inout1\().8h, \coef1
+        smull           \tmp3\().4s, \inout1\().4h, \coef2
+        smull2          \tmp4\().4s, \inout1\().8h, \coef2
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+        smull           \tmp1\().4s, \inout2\().4h, \coef2
+        smull2          \tmp2\().4s, \inout2\().8h, \coef2
+        smull           \tmp3\().4s, \inout2\().4h, \coef1
+        smull2          \tmp4\().4s, \inout2\().8h, \coef1
+        neg             \tmp1\().4s, \tmp1\().4s
+        neg             \tmp2\().4s, \tmp2\().4s
+        rshrn           \inout2\().4h, \tmp3\().4s, #14
+        rshrn2          \inout2\().8h, \tmp4\().4s, #14
+        rshrn           \inout1\().4h, \tmp1\().4s, #14
+        rshrn2          \inout1\().8h, \tmp2\().4s, #14
+.endm
+
+.macro dsmull_h out1, out2, in, coef
+        smull           \out1\().4s, \in\().4h, \coef
+        smull2          \out2\().4s, \in\().8h, \coef
+.endm
+
+.macro drshrn_h out, in1, in2, shift
+        rshrn           \out\().4h, \in1\().4s, \shift
+        rshrn2          \out\().8h, \in2\().4s, \shift
+.endm
+
+
 // out1 = in1 + in2
 // out2 = in1 - in2
 .macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@  function idct16x16_dc_add_neon
         ret
 endfunc
 
-function idct16
+.macro idct16_full
         dmbutterfly0    v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
         dmbutterfly     v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
         dmbutterfly     v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
@@ -485,7 +532,10 @@  function idct16
         dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
         dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
         dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+.endm
 
+.macro idct16_end
         butterfly_8h    v18, v7,  v4,  v7                // v18 = t0a,  v7  = t7a
         butterfly_8h    v19, v22, v5,  v22               // v19 = t1a,  v22 = t6
         butterfly_8h    v4,  v26, v20, v26               // v4  = t2a,  v26 = t5
@@ -507,6 +557,68 @@  function idct16
         butterfly_8h    v20, v27, v6,  v27               // v20 = out[4], v27 = out[11]
         butterfly_8h    v21, v26, v26, v3                // v21 = out[5], v26 = out[10]
         ret
+.endm
+
+function idct16
+        idct16_full
+endfunc
+
+function idct16_half
+        dmbutterfly0_h  v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a,  v24 = t1a
+        dmbutterfly_h1  v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a,  v28 = t3a
+        dmbutterfly_h1  v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a,  v30 = t7a
+        dmbutterfly_h2  v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a,  v22 = t6a
+        dmbutterfly_h1  v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a,  v31 = t15a
+        dmbutterfly_h2  v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a,  v23 = t14a
+        dmbutterfly_h1  v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+        dmbutterfly_h2  v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+        butterfly_8h    v4,  v28, v16, v28               // v4  = t0,   v28 = t3
+        butterfly_8h    v5,  v20, v24, v20               // v5  = t1,   v20 = t2
+        butterfly_8h    v6,  v26, v18, v26               // v6  = t4,   v26 = t5
+        butterfly_8h    v7,  v22, v30, v22               // v7  = t7,   v22 = t6
+        butterfly_8h    v16, v25, v17, v25               // v16 = t8,   v25 = t9
+        butterfly_8h    v24, v21, v29, v21               // v24 = t11,  v21 = t10
+        butterfly_8h    v17, v27, v19, v27               // v17 = t12,  v27 = t13
+        butterfly_8h    v29, v23, v31, v23               // v29 = t15,  v23 = t14
+
+        dmbutterfly0    v22, v26, v22, v26, v2, v3, v18, v19, v30, v31        // v22 = t6a,  v26 = t5a
+        dmbutterfly     v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31        // v23 = t9a,  v25 = t14a
+        dmbutterfly     v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+        idct16_end
+endfunc
+
+function idct16_quarter
+        dsmull_h        v24, v25, v19, v1.h[6]
+        dsmull_h        v4,  v5,  v17, v0.h[7]
+        dsmull_h        v7,  v6,  v18, v0.h[4]
+        dsmull_h        v30, v31, v18, v0.h[3]
+        neg             v24.4s,  v24.4s
+        neg             v25.4s,  v25.4s
+        dsmull_h        v29, v28, v17, v1.h[0]
+        dsmull_h        v26, v27, v19, v1.h[5]
+        dsmull_h        v22, v23, v16, v0.h[0]
+        drshrn_h        v24, v24, v25, #14
+        drshrn_h        v16, v4,  v5,  #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v6,  v30, v31, #14
+        drshrn_h        v29, v29, v28, #14
+        drshrn_h        v17, v26, v27, #14
+        drshrn_h        v28, v22, v23, #14
+
+        dmbutterfly_l   v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
+        dmbutterfly_l   v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
+        neg             v22.4s,  v22.4s
+        neg             v23.4s,  v23.4s
+        drshrn_h        v27, v20, v21, #14
+        drshrn_h        v21, v22, v23, #14
+        drshrn_h        v23, v18, v19, #14
+        drshrn_h        v25, v30, v31, #14
+        mov             v4.16b,  v28.16b
+        mov             v5.16b,  v28.16b
+        dmbutterfly0    v22, v26, v7,  v6,  v18, v19, v30, v31
+        mov             v20.16b, v28.16b
+        idct16_end
 endfunc
 
 function iadst16
@@ -616,12 +728,37 @@  function \txfm\()16_1d_8x16_pass1_neon
 
         mov             x9, #32
         movi            v2.8h, #0
+
+.ifc \txfm,idct
+        cmp             w3, #10
+        b.le            3f
+        cmp             w3, #38
+        b.le            4f
+.endif
+
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         load_clear      \i,  x2,  x9
 .endr
 
         bl              \txfm\()16
+.ifc \txfm,idct
+        b               5f
+
+3:
+.irp i, 16, 17, 18, 19
+        load_clear      \i,  x2,  x9
+.endr
+        bl              idct16_quarter
+        b               5f
+
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load_clear      \i,  x2,  x9
+.endr
+        bl              idct16_half
+.endif
 
+5:
         // Do two 8x8 transposes. Originally, v16-v31 contain the
         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
         // transposed 8x8 blocks.
@@ -675,38 +812,60 @@  endfunc
 // x0 = dst
 // x1 = dst stride
 // x2 = src (temp buffer)
-// x3 = slice offset
+// w3 = eob
+// x13 = slice offset
 function \txfm\()16_1d_8x16_pass2_neon
         mov             x14, x30
         mov             x9, #32
+.ifc \txfm,idct
+        cmp             w3, #10
+        b.le            3f
+        cmp             w3, #38
+        b.le            4f
+.endif
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23
         load            \i,  x2,  x9
 .endr
-        cbz             x3,  1f
+        cbz             x13, 1f
 .irp i, 24, 25, 26, 27, 28, 29, 30, 31
         load            \i,  x2,  x9
 .endr
 1:
 
-        add             x3,  x0,  x1
-        lsl             x1,  x1,  #1
         bl              \txfm\()16
+.ifc \txfm,idct
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        load            \i,  x2,  x9
+.endr
+        bl              idct16_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        load            \i,  x2,  x9
+.endr
+        bl              idct16_half
+.endif
 
+5:
+        add             x8,  x0,  x1
+        lsl             x1,  x1,  #1
 .macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
         srshr           \coef0, \coef0, #6
         ld1             {v2.8b},  [x0], x1
         srshr           \coef1, \coef1, #6
-        ld1             {v3.8b},  [x3], x1
+        ld1             {v3.8b},  [x8], x1
         srshr           \coef2, \coef2, #6
         ld1             {v4.8b},  [x0], x1
         srshr           \coef3, \coef3, #6
         uaddw           \coef0, \coef0, v2.8b
-        ld1             {v5.8b},  [x3], x1
+        ld1             {v5.8b},  [x8], x1
         uaddw           \coef1, \coef1, v3.8b
         srshr           \coef4, \coef4, #6
         ld1             {v6.8b},  [x0], x1
         srshr           \coef5, \coef5, #6
-        ld1             {v7.8b},  [x3], x1
+        ld1             {v7.8b},  [x8], x1
         sqxtun          v2.8b,  \coef0
         srshr           \coef6, \coef6, #6
         sqxtun          v3.8b,  \coef1
@@ -714,27 +873,27 @@  function \txfm\()16_1d_8x16_pass2_neon
         uaddw           \coef2, \coef2, v4.8b
         ld1             {\tmp1},  [x0], x1
         uaddw           \coef3, \coef3, v5.8b
-        ld1             {\tmp2},  [x3], x1
+        ld1             {\tmp2},  [x8], x1
         sqxtun          v4.8b,  \coef2
         sub             x0,  x0,  x1, lsl #2
-        sub             x3,  x3,  x1, lsl #2
+        sub             x8,  x8,  x1, lsl #2
         sqxtun          v5.8b,  \coef3
         uaddw           \coef4, \coef4, v6.8b
         st1             {v2.8b},  [x0], x1
         uaddw           \coef5, \coef5, v7.8b
-        st1             {v3.8b},  [x3], x1
+        st1             {v3.8b},  [x8], x1
         sqxtun          v6.8b,  \coef4
         st1             {v4.8b},  [x0], x1
         sqxtun          v7.8b,  \coef5
-        st1             {v5.8b},  [x3], x1
+        st1             {v5.8b},  [x8], x1
         uaddw           \coef6, \coef6, \tmp1
         st1             {v6.8b},  [x0], x1
         uaddw           \coef7, \coef7, \tmp2
-        st1             {v7.8b},  [x3], x1
+        st1             {v7.8b},  [x8], x1
         sqxtun          \tmp1,  \coef6
         sqxtun          \tmp2,  \coef7
         st1             {\tmp1},  [x0], x1
-        st1             {\tmp2},  [x3], x1
+        st1             {\tmp2},  [x8], x1
 .endm
         load_add_store  v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
         load_add_store  v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
@@ -777,6 +936,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifnc \txfm1\()_\txfm2,idct_idct
         movrel          x11, iadst16_coeffs
         mov             x7,  #0
+        mov             w3,  #256
 .else
         movrel          x12, min_eob_idct_idct_16
 .endif
@@ -800,7 +960,7 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
         add             x0,  x4,  #(\i)
         mov             x1,  x5
         add             x2,  sp,  #(\i*2)
-        mov             x3,  #\i
+        mov             x13, #\i
         bl              \txfm2\()16_1d_8x16_pass2_neon
 .endr
 
@@ -856,7 +1016,7 @@  function idct32x32_dc_add_neon
         ret
 endfunc
 
-function idct32_odd
+.macro idct32_odd_full
         ld1             {v0.8h,v1.8h}, [x11]
 
         dmbutterfly     v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
@@ -883,7 +1043,10 @@  function idct32_odd
         dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
         dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
         dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+.endm
 
+.macro idct32_end
         butterfly_8h    v16, v5,  v4,  v5  // v16 = t16a, v5  = t19a
         butterfly_8h    v17, v20, v23, v20 // v17 = t17,  v20 = t18
         butterfly_8h    v18, v6,  v7,  v6  // v18 = t23a, v6  = t20a
@@ -912,8 +1075,91 @@  function idct32_odd
         dmbutterfly0    v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25,  v22 = t22
         dmbutterfly0    v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
         ret
+.endm
+
+function idct32_odd
+        idct32_odd_full
+endfunc
+
+function idct32_odd_half
+        ld1             {v0.8h,v1.8h}, [x11]
+
+        dmbutterfly_h1  v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+        dmbutterfly_h2  v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+        dmbutterfly_h1  v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+        dmbutterfly_h2  v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+        dmbutterfly_h1  v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+        dmbutterfly_h2  v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+        dmbutterfly_h1  v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+        dmbutterfly_h2  v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+        ld1             {v0.8h}, [x10]
+
+        butterfly_8h    v4,  v24, v16, v24 // v4  = t16, v24 = t17
+        butterfly_8h    v5,  v20, v28, v20 // v5  = t19, v20 = t18
+        butterfly_8h    v6,  v26, v18, v26 // v6  = t20, v26 = t21
+        butterfly_8h    v7,  v22, v30, v22 // v7  = t23, v22 = t22
+        butterfly_8h    v28, v25, v17, v25 // v28 = t24, v25 = t25
+        butterfly_8h    v30, v21, v29, v21 // v30 = t27, v21 = t26
+        butterfly_8h    v29, v23, v31, v23 // v29 = t31, v23 = t30
+        butterfly_8h    v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+        dmbutterfly     v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19        // v23 = t17a, v24 = t30a
+        dmbutterfly     v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+        dmbutterfly     v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19        // v21 = t21a, v26 = t26a
+        dmbutterfly     v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+        idct32_end
+endfunc
+
+function idct32_odd_quarter
+        ld1             {v0.8h,v1.8h}, [x11]
+
+        dsmull_h        v4,  v5,  v16, v0.h[0]
+        dsmull_h        v28, v29, v19, v0.h[7]
+        dsmull_h        v30, v31, v16, v0.h[1]
+        dsmull_h        v22, v23, v17, v1.h[6]
+        dsmull_h        v7,  v6,  v17, v1.h[7]
+        dsmull_h        v26, v27, v19, v0.h[6]
+        dsmull_h        v20, v21, v18, v1.h[0]
+        dsmull_h        v24, v25, v18, v1.h[1]
+
+        ld1             {v0.8h}, [x10]
+
+        neg             v28.4s, v28.4s
+        neg             v29.4s, v29.4s
+        neg             v7.4s,  v7.4s
+        neg             v6.4s,  v6.4s
+
+        drshrn_h        v4,  v4,  v5,  #14
+        drshrn_h        v5,  v28, v29, #14
+        drshrn_h        v29, v30, v31, #14
+        drshrn_h        v28, v22, v23, #14
+        drshrn_h        v7,  v7,  v6,  #14
+        drshrn_h        v31, v26, v27, #14
+        drshrn_h        v6,  v20, v21, #14
+        drshrn_h        v30, v24, v25, #14
+
+        dmbutterfly_l   v16, v17, v18, v19, v29, v4,  v0.h[3], v0.h[4]
+        dmbutterfly_l   v27, v26, v20, v21, v31, v5,  v0.h[3], v0.h[4]
+        drshrn_h        v23, v16, v17, #14
+        drshrn_h        v24, v18, v19, #14
+        neg             v20.4s, v20.4s
+        neg             v21.4s, v21.4s
+        drshrn_h        v27, v27, v26, #14
+        drshrn_h        v20, v20, v21, #14
+        dmbutterfly_l   v16, v17, v18, v19, v30, v6,  v0.h[5], v0.h[6]
+        drshrn_h        v21, v16, v17, #14
+        drshrn_h        v26, v18, v19, #14
+        dmbutterfly_l   v16, v17, v18, v19, v28, v7,  v0.h[5], v0.h[6]
+        drshrn_h        v25, v16, v17, #14
+        neg             v18.4s, v18.4s
+        neg             v19.4s, v19.4s
+        drshrn_h        v22, v18, v19, #14
+
+        idct32_end
 endfunc
 
+
 // Do an 32-point IDCT of a 8x32 slice out of a 32x32 matrix.
 // The 32-point IDCT can be decomposed into two 16-point IDCTs;
 // a normal IDCT16 with every other input component (the even ones, with
@@ -937,6 +1183,11 @@  function idct32_1d_8x32_pass1_neon
         mov             x9,  #128
         movi            v4.8h, #0
 
+        cmp             w3,  #4
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2]
@@ -944,7 +1195,25 @@  function idct32_1d_8x32_pass1_neon
 .endr
 
         bl              idct16
+        sub             x2,  x2,  x9, lsl #4
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct16_quarter
+        sub             x2,  x2,  x9, lsl #2
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct16_half
+        sub             x2,  x2,  x9, lsl #3
 
+5:
         // Do two 8x8 transposes. Originally, v16-v31 contain the
         // 16 rows. Afterwards, v16-v23 and v24-v31 contain the
         // two transposed 8x8 blocks.
@@ -976,12 +1245,16 @@  function idct32_1d_8x32_pass1_neon
         sub             x0,  x0,  #512
 .purgem store_rev
 
-        // Move x2 back to the start of the input, and move
-        // to the first odd row
-        sub             x2,  x2,  x9, lsl #4
+        // Move x2 to the first odd row
         add             x2,  x2,  #64
 
         movi            v4.8h, #0
+
+        cmp             w3,  #34
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2]
@@ -989,7 +1262,22 @@  function idct32_1d_8x32_pass1_neon
 .endr
 
         bl              idct32_odd
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct32_odd_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2]
+        st1             {v4.8h},  [x2], x9
+.endr
+        bl              idct32_odd_half
 
+5:
         transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
         transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
 
@@ -1051,6 +1339,12 @@  function idct32_1d_8x32_pass2_neon
         ld1             {v0.8h,v1.8h}, [x10]
 
         mov             x9, #128
+
+        cmp             w3,  #34
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2], x9
@@ -1058,7 +1352,22 @@  function idct32_1d_8x32_pass2_neon
         sub             x2,  x2,  x9, lsl #4
 
         bl              idct16
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+        bl              idct16_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+        bl              idct16_half
 
+5:
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         st1             {v\i\().8h}, [x2], x9
 .endr
@@ -1066,15 +1375,35 @@  function idct32_1d_8x32_pass2_neon
         sub             x2,  x2,  x9, lsl #4
         add             x2,  x2,  #64
 
+        cmp             w3,  #34
+        b.le            3f
+        cmp             w3,  #135
+        b.le            4f
+
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         ld1             {v\i\().8h}, [x2], x9
 .endr
         sub             x2,  x2,  x9, lsl #4
-        sub             x2,  x2,  #64
 
         bl              idct32_odd
+        b               5f
+3:
+.irp i, 16, 17, 18, 19
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #2
+        bl              idct32_odd_quarter
+        b               5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+        ld1             {v\i\().8h}, [x2], x9
+.endr
+        sub             x2,  x2,  x9, lsl #3
+        bl              idct32_odd_half
 
+5:
+        sub             x2,  x2,  #64
 .macro load_acc_store a, b, c, d, neg=0
         ld1             {v4.8h},  [x2], x9
         ld1             {v5.8h},  [x2], x9