@@ -75,6 +75,16 @@ endconst
.endif
.endm
+// Same as dmbutterfly0 above, but treating the input in in2 as zero,
+// writing the same output into both out1 and out2.
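+// This reduces to out1 = out2 = in1 * v0.h[0] >> 14 (with rounding).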
+.macro dmbutterfly0_h out1, out2, in1, in2, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6
+ smull \tmp1\().4s, \in1\().4h, v0.h[0]
+ smull2 \tmp2\().4s, \in1\().8h, v0.h[0]
+ rshrn \out1\().4h, \tmp1\().4s, #14
+ rshrn2 \out1\().8h, \tmp2\().4s, #14
+ mov \out2\().16b, \out1\().16b
+.endm
+
// out1,out2 = in1 * coef1 - in2 * coef2
// out3,out4 = in1 * coef2 + in2 * coef1
// out are 4 x .4s registers, in are 2 x .8h registers
@@ -104,6 +114,43 @@ endconst
rshrn2 \inout2\().8h, \tmp4\().4s, #14
.endm
+// Same as dmbutterfly above, but treating the input in inout2 as zero
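+// inout1 = inout1 * coef1 >> 14, inout2 = inout1 * coef2 >> 14 (with rounding)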
+.macro dmbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().4s, \inout1\().4h, \coef1
+ smull2 \tmp2\().4s, \inout1\().8h, \coef1
+ smull \tmp3\().4s, \inout1\().4h, \coef2
+ smull2 \tmp4\().4s, \inout1\().8h, \coef2
+ rshrn \inout1\().4h, \tmp1\().4s, #14
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14
+ rshrn \inout2\().4h, \tmp3\().4s, #14
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14
+.endm
+
+// Same as dmbutterfly above, but treating the input in inout1 as zero
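+// inout1 = -(inout2 * coef2) >> 14, inout2 = inout2 * coef1 >> 14 (with rounding)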
+.macro dmbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2, tmp3, tmp4
+ smull \tmp1\().4s, \inout2\().4h, \coef2
+ smull2 \tmp2\().4s, \inout2\().8h, \coef2
+ smull \tmp3\().4s, \inout2\().4h, \coef1
+ smull2 \tmp4\().4s, \inout2\().8h, \coef1
+ neg \tmp1\().4s, \tmp1\().4s
+ neg \tmp2\().4s, \tmp2\().4s
+ rshrn \inout2\().4h, \tmp3\().4s, #14
+ rshrn2 \inout2\().8h, \tmp4\().4s, #14
+ rshrn \inout1\().4h, \tmp1\().4s, #14
+ rshrn2 \inout1\().8h, \tmp2\().4s, #14
+.endm
+
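+// out1,out2 = in * coef (the low and high .4s halves of a widening multiply)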
+.macro dsmull_h out1, out2, in, coef
+ smull \out1\().4s, \in\().4h, \coef
+ smull2 \out2\().4s, \in\().8h, \coef
+.endm
+
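+// out = in1,in2 narrowed from .4s to .8h with a rounding right shift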
+.macro drshrn_h out, in1, in2, shift
+ rshrn \out\().4h, \in1\().4s, \shift
+ rshrn2 \out\().8h, \in2\().4s, \shift
+.endm
+
+
// out1 = in1 + in2
// out2 = in1 - in2
.macro butterfly_8h out1, out2, in1, in2
@@ -463,7 +510,7 @@ function idct16x16_dc_add_neon
ret
endfunc
-function idct16
+.macro idct16_full
dmbutterfly0 v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
dmbutterfly v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
dmbutterfly v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
@@ -485,7 +532,10 @@ function idct16
dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+.endm
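+
+// The final part of the idct16, shared between the full, half and quarter
+// variants below; it ends with the ret of the containing function.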
+.macro idct16_end
butterfly_8h v18, v7, v4, v7 // v18 = t0a, v7 = t7a
butterfly_8h v19, v22, v5, v22 // v19 = t1a, v22 = t6
butterfly_8h v4, v26, v20, v26 // v4 = t2a, v26 = t5
@@ -507,6 +557,68 @@ function idct16
butterfly_8h v20, v27, v6, v27 // v20 = out[4], v27 = out[11]
butterfly_8h v21, v26, v26, v3 // v21 = out[5], v26 = out[10]
ret
+.endm
+
+function idct16
+ idct16_full
+endfunc
+
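+// Same as idct16 above, but assuming that the input in v24-v31 is zero.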
+function idct16_half
+ dmbutterfly0_h v16, v24, v16, v24, v2, v3, v4, v5, v6, v7 // v16 = t0a, v24 = t1a
+ dmbutterfly_h1 v20, v28, v0.h[1], v0.h[2], v2, v3, v4, v5 // v20 = t2a, v28 = t3a
+ dmbutterfly_h1 v18, v30, v0.h[3], v0.h[4], v2, v3, v4, v5 // v18 = t4a, v30 = t7a
+ dmbutterfly_h2 v26, v22, v0.h[5], v0.h[6], v2, v3, v4, v5 // v26 = t5a, v22 = t6a
+ dmbutterfly_h1 v17, v31, v0.h[7], v1.h[0], v2, v3, v4, v5 // v17 = t8a, v31 = t15a
+ dmbutterfly_h2 v25, v23, v1.h[1], v1.h[2], v2, v3, v4, v5 // v25 = t9a, v23 = t14a
+ dmbutterfly_h1 v21, v27, v1.h[3], v1.h[4], v2, v3, v4, v5 // v21 = t10a, v27 = t13a
+ dmbutterfly_h2 v29, v19, v1.h[5], v1.h[6], v2, v3, v4, v5 // v29 = t11a, v19 = t12a
+
+ butterfly_8h v4, v28, v16, v28 // v4 = t0, v28 = t3
+ butterfly_8h v5, v20, v24, v20 // v5 = t1, v20 = t2
+ butterfly_8h v6, v26, v18, v26 // v6 = t4, v26 = t5
+ butterfly_8h v7, v22, v30, v22 // v7 = t7, v22 = t6
+ butterfly_8h v16, v25, v17, v25 // v16 = t8, v25 = t9
+ butterfly_8h v24, v21, v29, v21 // v24 = t11, v21 = t10
+ butterfly_8h v17, v27, v19, v27 // v17 = t12, v27 = t13
+ butterfly_8h v29, v23, v31, v23 // v29 = t15, v23 = t14
+
+ dmbutterfly0 v22, v26, v22, v26, v2, v3, v18, v19, v30, v31 // v22 = t6a, v26 = t5a
+ dmbutterfly v23, v25, v0.h[1], v0.h[2], v18, v19, v30, v31 // v23 = t9a, v25 = t14a
+ dmbutterfly v27, v21, v0.h[1], v0.h[2], v18, v19, v30, v31, neg=1 // v27 = t13a, v21 = t10a
+ idct16_end
+endfunc
+
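+// Same as idct16 above, but assuming that only the input in v16-v19 is nonzero.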
+function idct16_quarter
+ dsmull_h v24, v25, v19, v1.h[6]
+ dsmull_h v4, v5, v17, v0.h[7]
+ dsmull_h v7, v6, v18, v0.h[4]
+ dsmull_h v30, v31, v18, v0.h[3]
+ neg v24.4s, v24.4s
+ neg v25.4s, v25.4s
+ dsmull_h v29, v28, v17, v1.h[0]
+ dsmull_h v26, v27, v19, v1.h[5]
+ dsmull_h v22, v23, v16, v0.h[0]
+ drshrn_h v24, v24, v25, #14
+ drshrn_h v16, v4, v5, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v6, v30, v31, #14
+ drshrn_h v29, v29, v28, #14
+ drshrn_h v17, v26, v27, #14
+ drshrn_h v28, v22, v23, #14
+
+ dmbutterfly_l v20, v21, v22, v23, v17, v24, v0.h[1], v0.h[2]
+ dmbutterfly_l v18, v19, v30, v31, v29, v16, v0.h[1], v0.h[2]
+ neg v22.4s, v22.4s
+ neg v23.4s, v23.4s
+ drshrn_h v27, v20, v21, #14
+ drshrn_h v21, v22, v23, #14
+ drshrn_h v23, v18, v19, #14
+ drshrn_h v25, v30, v31, #14
+ mov v4.16b, v28.16b
+ mov v5.16b, v28.16b
+ dmbutterfly0 v22, v26, v7, v6, v18, v19, v30, v31
+ mov v20.16b, v28.16b
+ idct16_end
endfunc
function iadst16
@@ -616,12 +728,37 @@ function \txfm\()16_1d_8x16_pass1_neon
mov x9, #32
movi v2.8h, #0
+
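+ // For the idct, a small enough eob means that only the first 4 or 8
+ // input rows can be nonzero; in that case, load fewer rows and use the
+ // simplified quarter or half idct16 instead of the full one.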
+.ifc \txfm,idct
+ cmp w3, #10
+ b.le 3f
+ cmp w3, #38
+ b.le 4f
+.endif
+
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
load_clear \i, x2, x9
.endr
bl \txfm\()16
+.ifc \txfm,idct
+ b 5f
+
+3:
+.irp i, 16, 17, 18, 19
+ load_clear \i, x2, x9
+.endr
+ bl idct16_quarter
+ b 5f
+
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load_clear \i, x2, x9
+.endr
+ bl idct16_half
+.endif
+5:
// Do two 8x8 transposes. Originally, v16-v31 contain the
// 16 rows. Afterwards, v16-v23 and v24-v31 contain the two
// transposed 8x8 blocks.
@@ -675,38 +812,60 @@ endfunc
// x0 = dst
// x1 = dst stride
// x2 = src (temp buffer)
-// x3 = slice offset
+// w3 = eob
+// x13 = slice offset
function \txfm\()16_1d_8x16_pass2_neon
mov x14, x30
mov x9, #32
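+ // As in the first pass, use the quarter or half idct16 if the eob is
+ // small enough.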
+.ifc \txfm,idct
+ cmp w3, #10
+ b.le 3f
+ cmp w3, #38
+ b.le 4f
+.endif
.irp i, 16, 17, 18, 19, 20, 21, 22, 23
load \i, x2, x9
.endr
- cbz x3, 1f
+ cbz x13, 1f
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
load \i, x2, x9
.endr
1:
- add x3, x0, x1
- lsl x1, x1, #1
bl \txfm\()16
+.ifc \txfm,idct
+ b 5f
+3:
+.irp i, 16, 17, 18, 19
+ load \i, x2, x9
+.endr
+ bl idct16_quarter
+ b 5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ load \i, x2, x9
+.endr
+ bl idct16_half
+.endif
+5:
+ add x8, x0, x1
+ lsl x1, x1, #1
.macro load_add_store coef0, coef1, coef2, coef3, coef4, coef5, coef6, coef7, tmp1, tmp2
srshr \coef0, \coef0, #6
ld1 {v2.8b}, [x0], x1
srshr \coef1, \coef1, #6
- ld1 {v3.8b}, [x3], x1
+ ld1 {v3.8b}, [x8], x1
srshr \coef2, \coef2, #6
ld1 {v4.8b}, [x0], x1
srshr \coef3, \coef3, #6
uaddw \coef0, \coef0, v2.8b
- ld1 {v5.8b}, [x3], x1
+ ld1 {v5.8b}, [x8], x1
uaddw \coef1, \coef1, v3.8b
srshr \coef4, \coef4, #6
ld1 {v6.8b}, [x0], x1
srshr \coef5, \coef5, #6
- ld1 {v7.8b}, [x3], x1
+ ld1 {v7.8b}, [x8], x1
sqxtun v2.8b, \coef0
srshr \coef6, \coef6, #6
sqxtun v3.8b, \coef1
@@ -714,27 +873,27 @@ function \txfm\()16_1d_8x16_pass2_neon
uaddw \coef2, \coef2, v4.8b
ld1 {\tmp1}, [x0], x1
uaddw \coef3, \coef3, v5.8b
- ld1 {\tmp2}, [x3], x1
+ ld1 {\tmp2}, [x8], x1
sqxtun v4.8b, \coef2
sub x0, x0, x1, lsl #2
- sub x3, x3, x1, lsl #2
+ sub x8, x8, x1, lsl #2
sqxtun v5.8b, \coef3
uaddw \coef4, \coef4, v6.8b
st1 {v2.8b}, [x0], x1
uaddw \coef5, \coef5, v7.8b
- st1 {v3.8b}, [x3], x1
+ st1 {v3.8b}, [x8], x1
sqxtun v6.8b, \coef4
st1 {v4.8b}, [x0], x1
sqxtun v7.8b, \coef5
- st1 {v5.8b}, [x3], x1
+ st1 {v5.8b}, [x8], x1
uaddw \coef6, \coef6, \tmp1
st1 {v6.8b}, [x0], x1
uaddw \coef7, \coef7, \tmp2
- st1 {v7.8b}, [x3], x1
+ st1 {v7.8b}, [x8], x1
sqxtun \tmp1, \coef6
sqxtun \tmp2, \coef7
st1 {\tmp1}, [x0], x1
- st1 {\tmp2}, [x3], x1
+ st1 {\tmp2}, [x8], x1
.endm
load_add_store v16.8h, v17.8h, v18.8h, v19.8h, v20.8h, v21.8h, v22.8h, v23.8h, v16.8b, v17.8b
load_add_store v24.8h, v25.8h, v26.8h, v27.8h, v28.8h, v29.8h, v30.8h, v31.8h, v16.8b, v17.8b
@@ -777,6 +936,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
.ifnc \txfm1\()_\txfm2,idct_idct
movrel x11, iadst16_coeffs
mov x7, #0
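+ // Set w3 (the eob) large enough that the idct passes below always
+ // take the full code path.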
+ mov w3, #256
.else
movrel x12, min_eob_idct_idct_16
.endif
@@ -800,7 +960,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
add x0, x4, #(\i)
mov x1, x5
add x2, sp, #(\i*2)
- mov x3, #\i
+ mov x13, #\i
bl \txfm2\()16_1d_8x16_pass2_neon
.endr
@@ -856,7 +1016,7 @@ function idct32x32_dc_add_neon
ret
endfunc
-function idct32_odd
+.macro idct32_odd_full
ld1 {v0.8h,v1.8h}, [x11]
dmbutterfly v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
@@ -883,7 +1043,10 @@ function idct32_odd
dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+.endm
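+
+// The final part of idct32_odd, shared between the full, half and quarter
+// variants below; it ends with the ret of the containing function.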
+.macro idct32_end
butterfly_8h v16, v5, v4, v5 // v16 = t16a, v5 = t19a
butterfly_8h v17, v20, v23, v20 // v17 = t17, v20 = t18
butterfly_8h v18, v6, v7, v6 // v18 = t23a, v6 = t20a
@@ -912,8 +1075,91 @@ function idct32_odd
dmbutterfly0 v25, v22, v25, v22, v2, v3, v4, v5, v6, v7 // v25 = t25, v22 = t22
dmbutterfly0 v24, v23, v24, v23, v2, v3, v4, v5, v6, v7 // v24 = t24a, v23 = t23a
ret
+.endm
+
+function idct32_odd
+ idct32_odd_full
+endfunc
+
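+// Same as idct32_odd above, but assuming that the input in v24-v31 is zero.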
+function idct32_odd_half
+ ld1 {v0.8h,v1.8h}, [x11]
+
+ dmbutterfly_h1 v16, v31, v0.h[0], v0.h[1], v4, v5, v6, v7 // v16 = t16a, v31 = t31a
+ dmbutterfly_h2 v24, v23, v0.h[2], v0.h[3], v4, v5, v6, v7 // v24 = t17a, v23 = t30a
+ dmbutterfly_h1 v20, v27, v0.h[4], v0.h[5], v4, v5, v6, v7 // v20 = t18a, v27 = t29a
+ dmbutterfly_h2 v28, v19, v0.h[6], v0.h[7], v4, v5, v6, v7 // v28 = t19a, v19 = t28a
+ dmbutterfly_h1 v18, v29, v1.h[0], v1.h[1], v4, v5, v6, v7 // v18 = t20a, v29 = t27a
+ dmbutterfly_h2 v26, v21, v1.h[2], v1.h[3], v4, v5, v6, v7 // v26 = t21a, v21 = t26a
+ dmbutterfly_h1 v22, v25, v1.h[4], v1.h[5], v4, v5, v6, v7 // v22 = t22a, v25 = t25a
+ dmbutterfly_h2 v30, v17, v1.h[6], v1.h[7], v4, v5, v6, v7 // v30 = t23a, v17 = t24a
+
+ ld1 {v0.8h}, [x10]
+
+ butterfly_8h v4, v24, v16, v24 // v4 = t16, v24 = t17
+ butterfly_8h v5, v20, v28, v20 // v5 = t19, v20 = t18
+ butterfly_8h v6, v26, v18, v26 // v6 = t20, v26 = t21
+ butterfly_8h v7, v22, v30, v22 // v7 = t23, v22 = t22
+ butterfly_8h v28, v25, v17, v25 // v28 = t24, v25 = t25
+ butterfly_8h v30, v21, v29, v21 // v30 = t27, v21 = t26
+ butterfly_8h v29, v23, v31, v23 // v29 = t31, v23 = t30
+ butterfly_8h v31, v27, v19, v27 // v31 = t28, v27 = t29
+
+ dmbutterfly v23, v24, v0.h[3], v0.h[4], v16, v17, v18, v19 // v23 = t17a, v24 = t30a
+ dmbutterfly v27, v20, v0.h[3], v0.h[4], v16, v17, v18, v19, neg=1 // v27 = t29a, v20 = t18a
+ dmbutterfly v21, v26, v0.h[5], v0.h[6], v16, v17, v18, v19 // v21 = t21a, v26 = t26a
+ dmbutterfly v25, v22, v0.h[5], v0.h[6], v16, v17, v18, v19, neg=1 // v25 = t25a, v22 = t22a
+ idct32_end
+endfunc
+
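+// Same as idct32_odd above, but assuming that only the input in v16-v19 is nonzero.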
+function idct32_odd_quarter
+ ld1 {v0.8h,v1.8h}, [x11]
+
+ dsmull_h v4, v5, v16, v0.h[0]
+ dsmull_h v28, v29, v19, v0.h[7]
+ dsmull_h v30, v31, v16, v0.h[1]
+ dsmull_h v22, v23, v17, v1.h[6]
+ dsmull_h v7, v6, v17, v1.h[7]
+ dsmull_h v26, v27, v19, v0.h[6]
+ dsmull_h v20, v21, v18, v1.h[0]
+ dsmull_h v24, v25, v18, v1.h[1]
+
+ ld1 {v0.8h}, [x10]
+
+ neg v28.4s, v28.4s
+ neg v29.4s, v29.4s
+ neg v7.4s, v7.4s
+ neg v6.4s, v6.4s
+
+ drshrn_h v4, v4, v5, #14
+ drshrn_h v5, v28, v29, #14
+ drshrn_h v29, v30, v31, #14
+ drshrn_h v28, v22, v23, #14
+ drshrn_h v7, v7, v6, #14
+ drshrn_h v31, v26, v27, #14
+ drshrn_h v6, v20, v21, #14
+ drshrn_h v30, v24, v25, #14
+
+ dmbutterfly_l v16, v17, v18, v19, v29, v4, v0.h[3], v0.h[4]
+ dmbutterfly_l v27, v26, v20, v21, v31, v5, v0.h[3], v0.h[4]
+ drshrn_h v23, v16, v17, #14
+ drshrn_h v24, v18, v19, #14
+ neg v20.4s, v20.4s
+ neg v21.4s, v21.4s
+ drshrn_h v27, v27, v26, #14
+ drshrn_h v20, v20, v21, #14
+ dmbutterfly_l v16, v17, v18, v19, v30, v6, v0.h[5], v0.h[6]
+ drshrn_h v21, v16, v17, #14
+ drshrn_h v26, v18, v19, #14
+ dmbutterfly_l v16, v17, v18, v19, v28, v7, v0.h[5], v0.h[6]
+ drshrn_h v25, v16, v17, #14
+ neg v18.4s, v18.4s
+ neg v19.4s, v19.4s
+ drshrn_h v22, v18, v19, #14
+
+ idct32_end
endfunc
+
// Do a 32-point IDCT of an 8x32 slice out of a 32x32 matrix.
// The 32-point IDCT can be decomposed into two 16-point IDCTs;
// a normal IDCT16 with every other input component (the even ones, with
@@ -937,6 +1183,11 @@ function idct32_1d_8x32_pass1_neon
mov x9, #128
movi v4.8h, #0
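+ // For a small enough eob, only the first 4 or 8 of the even input rows
+ // can be nonzero; use the quarter or half idct16 in those cases.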
+ cmp w3, #34
+ b.le 3f
+ cmp w3, #135
+ b.le 4f
+
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2]
@@ -944,7 +1195,25 @@ function idct32_1d_8x32_pass1_neon
.endr
bl idct16
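+ // Move x2 back to the start of the input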
+ sub x2, x2, x9, lsl #4
+ b 5f
+3:
+.irp i, 16, 17, 18, 19
+ ld1 {v\i\().8h}, [x2]
+ st1 {v4.8h}, [x2], x9
+.endr
+ bl idct16_quarter
+ sub x2, x2, x9, lsl #2
+ b 5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v4.8h}, [x2], x9
+.endr
+ bl idct16_half
+ sub x2, x2, x9, lsl #3
+5:
// Do two 8x8 transposes. Originally, v16-v31 contain the
// 16 rows. Afterwards, v16-v23 and v24-v31 contain the
// two transposed 8x8 blocks.
@@ -976,12 +1245,16 @@ function idct32_1d_8x32_pass1_neon
sub x0, x0, #512
.purgem store_rev
- // Move x2 back to the start of the input, and move
- // to the first odd row
- sub x2, x2, x9, lsl #4
+ // Move x2 to the first odd row
add x2, x2, #64
movi v4.8h, #0
+
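+ // The same eob checks as for the even half above select the full, half
+ // or quarter idct32_odd.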
+ cmp w3, #34
+ b.le 3f
+ cmp w3, #135
+ b.le 4f
+
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2]
@@ -989,7 +1262,22 @@ function idct32_1d_8x32_pass1_neon
.endr
bl idct32_odd
+ b 5f
+3:
+.irp i, 16, 17, 18, 19
+ ld1 {v\i\().8h}, [x2]
+ st1 {v4.8h}, [x2], x9
+.endr
+ bl idct32_odd_quarter
+ b 5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2]
+ st1 {v4.8h}, [x2], x9
+.endr
+ bl idct32_odd_half
+5:
transpose_8x8H v31, v30, v29, v28, v27, v26, v25, v24, v2, v3
transpose_8x8H v23, v22, v21, v20, v19, v18, v17, v16, v2, v3
@@ -1051,6 +1339,12 @@ function idct32_1d_8x32_pass2_neon
ld1 {v0.8h,v1.8h}, [x10]
mov x9, #128
+
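+ // As in the first pass, choose between the full, half and quarter
+ // variants based on the eob.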
+ cmp w3, #34
+ b.le 3f
+ cmp w3, #135
+ b.le 4f
+
// v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2], x9
@@ -1058,7 +1352,22 @@ function idct32_1d_8x32_pass2_neon
sub x2, x2, x9, lsl #4
bl idct16
+ b 5f
+3:
+.irp i, 16, 17, 18, 19
+ ld1 {v\i\().8h}, [x2], x9
+.endr
+ sub x2, x2, x9, lsl #2
+ bl idct16_quarter
+ b 5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2], x9
+.endr
+ sub x2, x2, x9, lsl #3
+ bl idct16_half
+5:
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
st1 {v\i\().8h}, [x2], x9
.endr
@@ -1066,15 +1375,35 @@ function idct32_1d_8x32_pass2_neon
sub x2, x2, x9, lsl #4
add x2, x2, #64
+ cmp w3, #34
+ b.le 3f
+ cmp w3, #135
+ b.le 4f
+
// v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
.irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
ld1 {v\i\().8h}, [x2], x9
.endr
sub x2, x2, x9, lsl #4
- sub x2, x2, #64
bl idct32_odd
+ b 5f
+3:
+.irp i, 16, 17, 18, 19
+ ld1 {v\i\().8h}, [x2], x9
+.endr
+ sub x2, x2, x9, lsl #2
+ bl idct32_odd_quarter
+ b 5f
+4:
+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
+ ld1 {v\i\().8h}, [x2], x9
+.endr
+ sub x2, x2, x9, lsl #3
+ bl idct32_odd_half
+5:
+ sub x2, x2, #64
.macro load_acc_store a, b, c, d, neg=0
ld1 {v4.8h}, [x2], x9
ld1 {v5.8h}, [x2], x9