Message ID: 1480584422-24237-2-git-send-email-martin@martin.st
State: Superseded
On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > This increases the code size of libavcodec/arm/vp9itxfm_neon.o > from 12388 to 15064 bytes. > > Before: Cortex A7 A8 A9 A53 > vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.7 211.9 235.8 > vp9_inv_dct_dct_16x16_sub2_add_neon: 2056.7 1521.2 1734.8 1262.0 > vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0 > vp9_inv_dct_dct_16x16_sub8_add_neon: 2444.9 1801.6 2007.8 1508.5 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 > vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 456.7 866.0 553.9 > vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 > vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4 > vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9 > vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 > > After: > vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.5 236.1 > vp9_inv_dct_dct_16x16_sub2_add_neon: 1448.2 994.0 1191.3 836.0 > vp9_inv_dct_dct_16x16_sub4_add_neon: 1437.0 991.0 1191.6 836.0 > vp9_inv_dct_dct_16x16_sub8_add_neon: 2114.5 1757.9 1855.3 1335.3 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2862.7 2141.5 2293.3 1772.7 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3299.6 2419.1 2552.7 2033.0 > vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0 457.5 864.3 554.8 > vp9_inv_dct_dct_32x32_sub2_add_neon: 7867.8 5978.6 6594.6 5109.9 > vp9_inv_dct_dct_32x32_sub4_add_neon: 7871.0 5772.5 6582.2 5108.5 > vp9_inv_dct_dct_32x32_sub8_add_neon: 8694.8 6925.7 7125.7 5671.4 > vp9_inv_dct_dct_32x32_sub12_add_neon: 11250.3 9654.7 9557.6 7540.5 > vp9_inv_dct_dct_32x32_sub16_add_neon: 12129.5 11061.1 10295.0 8220.7 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15218.4 13580.8 11841.3 9739.9 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16343.5 15097.0 12629.2 10496.6 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17482.2 15516.4 13476.0 11261.0 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18586.7 16817.5 14289.3 12019.0 > > --- > If we wouldn't have made the core transforms standalone functions > in the previous patch, the code size would increase to around 21 KB (which > isn't too bad), but the idct32 pass1/2 functions would bloat up so much > that they would require literal pools within the functions themselves. > --- > libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++--- > 1 file changed, 331 insertions(+), 20 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 22e63e5..bd3f678 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -74,6 +74,14 @@ endconst > vrshrn.s32 \out2, \tmpq4, #14 > .endm > > +@ Same as mbutterfly0 above, but treating the input in in2 as zero, > +@ writing the same output into both out1 and out2. 
> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 > + vmull.s16 \tmpq3, \in1, d0[0] > + vrshrn.s32 \out1, \tmpq3, #14 > + vmov \out2, \out1 if you haven't already tried doing the vrshrn twice could be faster since it has less dependencies > +.endm > + > @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 > @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 > @ Same as mbutterfly0, but with input being 2 q registers, output > @@ -137,6 +145,23 @@ endconst > vrshrn.s32 \inout2, \tmp2, #14 > .endm > > +@ Same as mbutterfly above, but treating the input in inout2 as zero > +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 > + vmull.s16 \tmp1, \inout1, \coef1 > + vmull.s16 \tmp2, \inout1, \coef2 > + vrshrn.s32 \inout1, \tmp1, #14 > + vrshrn.s32 \inout2, \tmp2, #14 > +.endm > + > +@ Same as mbutterfly above, but treating the input in inout1 as zero > +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 > + vmull.s16 \tmp1, \inout2, \coef2 > + vmull.s16 \tmp2, \inout2, \coef1 > + vneg.s32 \tmp1, \tmp1 > + vrshrn.s32 \inout2, \tmp2, #14 > + vrshrn.s32 \inout1, \tmp1, #14 > +.endm > + > @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 > @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14 > @ inout are 4 d registers, tmp are 4 q registers > @@ -534,7 +559,7 @@ function idct16x16_dc_add_neon > endfunc > .ltorg > > -function idct16 > +.macro idct16_full > mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a > mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a > mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a > @@ -556,7 +581,10 @@ function idct16 > mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a > mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a > mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a > + idct16_end > +.endm > > +.macro idct16_end > butterfly d18, d7, d4, d7 @ d18 = t0a, d7 = t7a > butterfly d19, d22, d5, d22 @ d19 = t1a, d22 = t6 > butterfly d4, d26, d20, d26 @ d4 = t2a, d26 = t5 > @@ -581,6 +609,66 @@ function idct16 > butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] > butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] > bx lr > +.endm > + > +function idct16 > + idct16_full > +endfunc > + > +function idct16_half > + mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a > + mbutterfly_h1 d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a > + mbutterfly_h1 d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a > + mbutterfly_h2 d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a > + mbutterfly_h1 d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a > + mbutterfly_h2 d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a > + mbutterfly_h1 d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a > + mbutterfly_h2 d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a > + > + butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 > + butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 > + butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5 > + butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6 > + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 > + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 > + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 > + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 > + > + mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a > + mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ 
d23 = t9a, d25 = t14a > + mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a > + idct16_end > +endfunc > + > +function idct16_quarter > + vmull.s16 q12, d19, d3[2] > + vmull.s16 q2, d17, d1[3] > + vmull.s16 q3, d18, d1[0] > + vmull.s16 q15, d18, d0[3] > + vneg.s32 q12, q12 > + vmull.s16 q14, d17, d2[0] > + vmull.s16 q13, d19, d3[1] > + vmull.s16 q11, d16, d0[0] > + vrshrn.s32 d24, q12, #14 > + vrshrn.s32 d16, q2, #14 > + vrshrn.s32 d7, q3, #14 > + vrshrn.s32 d6, q15, #14 > + vrshrn.s32 d29, q14, #14 > + vrshrn.s32 d17, q13, #14 > + vrshrn.s32 d28, q11, #14 > + > + mbutterfly_l q10, q11, d17, d24, d0[1], d0[2] > + mbutterfly_l q9, q15, d29, d16, d0[1], d0[2] > + vneg.s32 q11, q11 > + vrshrn.s32 d27, q10, #14 > + vrshrn.s32 d21, q11, #14 > + vrshrn.s32 d23, q9, #14 > + vrshrn.s32 d25, q15, #14 > + vmov d4, d28 > + vmov d5, d28 > + mbutterfly0 d22, d26, d7, d6, d18, d30, q9, q15 > + vmov d20, d28 > + idct16_end > endfunc > > function iadst16 > @@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon > > mov r12, #32 > vmov.s16 q2, #0 > + > +.ifc \txfm,idct > + cmp r3, #10 > + ble 3f > + cmp r3, #38 > + ble 4f > +.endif I'd test only for less or equal 38 here > + > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vld1.16 {d\i}, [r2,:64] > vst1.16 {d4}, [r2,:64], r12 > .endr > > bl \txfm\()16 > +.ifc \txfm,idct > + b 5f cmp r3, #10 > + > +3: > +.irp i, 16, 17, 18, 19 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + bl idct16_quarter > + b 5f remove this > + > +4: > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 .if \i == 19 blle idct16_half ble 5f .endif saves a little binary space not sure if it's worth it. > +.endr > + bl idct16_half > +.endif > > +5: > @ Do four 4x4 transposes. Originally, d16-d31 contain the > @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > @ contain the transposed 4x4 blocks. 
> @@ -721,58 +836,80 @@ endfunc > @ r0 = dst > @ r1 = dst stride > @ r2 = src (temp buffer) > -@ r3 = slice offset > +@ r3 = eob > +@ r8 = slice offset > function \txfm\()16_1d_4x16_pass2_neon > push {lr} > mov r12, #32 > +.ifc \txfm,idct > + cmp r3, #10 > + ble 3f > + cmp r3, #38 > + ble 4f same applies here > +.endif > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 > vld1.16 {d\i}, [r2,:64], r12 > .endr > - cmp r3, #0 > + cmp r8, #0 > beq 1f > .irp i, 28, 29, 30, 31 > vld1.16 {d\i}, [r2,:64], r12 > .endr > 1: > > - add r3, r0, r1 > - lsl r1, r1, #1 > bl \txfm\()16 > +.ifc \txfm,idct > + b 5f > +3: > +.irp i, 16, 17, 18, 19 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + bl idct16_quarter > + b 5f > +4: > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + bl idct16_half > +.endif > > +5: > + add r8, r0, r1 > + lsl r1, r1, #1 > .macro load_add_store coef0, coef1, coef2, coef3 > vrshr.s16 \coef0, \coef0, #6 > vrshr.s16 \coef1, \coef1, #6 > > vld1.32 {d4[]}, [r0,:32], r1 > - vld1.32 {d4[1]}, [r3,:32], r1 > + vld1.32 {d4[1]}, [r8,:32], r1 > vrshr.s16 \coef2, \coef2, #6 > vrshr.s16 \coef3, \coef3, #6 > vld1.32 {d5[]}, [r0,:32], r1 > - vld1.32 {d5[1]}, [r3,:32], r1 > + vld1.32 {d5[1]}, [r8,:32], r1 > vaddw.u8 \coef0, \coef0, d4 > vld1.32 {d6[]}, [r0,:32], r1 > - vld1.32 {d6[1]}, [r3,:32], r1 > + vld1.32 {d6[1]}, [r8,:32], r1 > vaddw.u8 \coef1, \coef1, d5 > vld1.32 {d7[]}, [r0,:32], r1 > - vld1.32 {d7[1]}, [r3,:32], r1 > + vld1.32 {d7[1]}, [r8,:32], r1 > > vqmovun.s16 d4, \coef0 > vqmovun.s16 d5, \coef1 > sub r0, r0, r1, lsl #2 > - sub r3, r3, r1, lsl #2 > + sub r8, r8, r1, lsl #2 > vaddw.u8 \coef2, \coef2, d6 > vaddw.u8 \coef3, \coef3, d7 > vst1.32 {d4[0]}, [r0,:32], r1 > - vst1.32 {d4[1]}, [r3,:32], r1 > + vst1.32 {d4[1]}, [r8,:32], r1 > vqmovun.s16 d6, \coef2 > vst1.32 {d5[0]}, [r0,:32], r1 > - vst1.32 {d5[1]}, [r3,:32], r1 > + vst1.32 {d5[1]}, [r8,:32], r1 > vqmovun.s16 d7, \coef3 > > vst1.32 {d6[0]}, [r0,:32], r1 > - vst1.32 {d6[1]}, [r3,:32], r1 > + vst1.32 {d6[1]}, [r8,:32], r1 > vst1.32 {d7[0]}, [r0,:32], r1 > - vst1.32 {d7[1]}, [r3,:32], r1 > + vst1.32 {d7[1]}, [r8,:32], r1 > .endm > load_add_store q8, q9, q10, q11 > load_add_store q12, q13, q14, q15 > @@ -799,6 +936,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 > push {r4-r8,lr} > .ifnc \txfm1\()_\txfm2,idct_idct > vpush {q4-q7} > + mov r3, #256 > .else > movrel r8, min_eob_idct_idct_16 + 2 > .endif > @@ -859,7 +997,7 @@ A and r7, sp, #15 > add r0, r4, #(\i) > mov r1, r5 > add r2, sp, #(\i*2) > - mov r3, #\i > + mov r8, #\i > bl \txfm2\()16_1d_4x16_pass2_neon > .endr > > @@ -913,7 +1051,7 @@ function idct32x32_dc_add_neon > bx lr > endfunc > > -function idct32_odd > +.macro idct32_odd_full > movrel r12, idct_coeffs > add r12, r12, #32 > vld1.16 {q0-q1}, [r12,:128] > @@ -943,7 +1081,10 @@ function idct32_odd > mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a > mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a > mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a > + idct32_end > +.endm > > +.macro idct32_end > butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a > butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 > butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a > @@ -973,6 +1114,91 @@ function idct32_odd > mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 > mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a > bx lr > +.endm > + > +function idct32_odd > + idct32_odd_full > 
+endfunc > + > +function idct32_odd_half > + movrel r12, idct_coeffs > + add r12, r12, #32 > + vld1.16 {q0-q1}, [r12,:128] > + > + mbutterfly_h1 d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a > + mbutterfly_h2 d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a > + mbutterfly_h1 d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a > + mbutterfly_h2 d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a > + mbutterfly_h1 d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a > + mbutterfly_h2 d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a > + mbutterfly_h1 d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a > + mbutterfly_h2 d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a > + > + sub r12, r12, #32 > + vld1.16 {q0}, [r12,:128] > + > + butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 > + butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 > + butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 > + butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 > + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 > + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 > + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 > + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 > + > + mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a > + mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a > + mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a > + mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a > + > + idct32_end > +endfunc > + > +function idct32_odd_quarter > + movrel r12, idct_coeffs > + add r12, r12, #32 > + vld1.16 {q0-q1}, [r12,:128] > + > + vmull.s16 q2, d16, d0[0] > + vmull.s16 q14, d19, d1[3] > + vmull.s16 q15, d16, d0[1] > + vmull.s16 q11, d17, d3[2] > + vmull.s16 q3, d17, d3[3] > + vmull.s16 q13, d19, d1[2] > + vmull.s16 q10, d18, d2[0] > + vmull.s16 q12, d18, d2[1] > + > + sub r12, r12, #32 > + vld1.16 {q0}, [r12,:128] > + > + vneg.s32 q14, q14 > + vneg.s32 q3, q3 > + > + vrshrn.s32 d4, q2, #14 > + vrshrn.s32 d5, q14, #14 > + vrshrn.s32 d29, q15, #14 > + vrshrn.s32 d28, q11, #14 > + vrshrn.s32 d7, q3, #14 > + vrshrn.s32 d31, q13, #14 > + vrshrn.s32 d6, q10, #14 > + vrshrn.s32 d30, q12, #14 > + > + mbutterfly_l q8, q9, d29, d4, d0[3], d1[0] > + mbutterfly_l q13, q10, d31, d5, d0[3], d1[0] > + vrshrn.s32 d23, q8, #14 > + vrshrn.s32 d24, q9, #14 > + vneg.s32 q10, q10 > + vrshrn.s32 d27, q13, #14 > + vrshrn.s32 d20, q10, #14 > + mbutterfly_l q8, q9, d30, d6, d1[1], d1[2] > + vrshrn.s32 d21, q8, #14 > + vrshrn.s32 d26, q9, #14 > + mbutterfly_l q8, q9, d28, d7, d1[1], d1[2] > + vrshrn.s32 d25, q8, #14 > + vneg.s32 q9, q9 > + vrshrn.s32 d22, q9, #14 > + > + idct32_end > endfunc > > @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. > @@ -994,6 +1220,11 @@ function idct32_1d_4x32_pass1_neon > mov r12, #128 > vmov.s16 d4, #0 > > + cmp r3, #34 > + ble 3f > + cmp r3, #135 > + ble 4f > + > @ d16 = IN(0), d17 = IN(2) ... 
d31 = IN(30) > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vld1.16 {d\i}, [r2,:64] > @@ -1001,7 +1232,25 @@ function idct32_1d_4x32_pass1_neon > .endr > > bl idct16 > + sub r2, r2, r12, lsl #4 > + b 5f > +3: > +.irp i, 16, 17, 18, 19 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + bl idct16_quarter > + sub r2, r2, r12, lsl #2 > + b 5f > +4: > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + bl idct16_half > + sub r2, r2, r12, lsl #3 same could be done here but I'm not sure if it's a godd idea > +5: > @ Do four 4x4 transposes. Originally, d16-d31 contain the > @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 > @ contain the transposed 4x4 blocks. > @@ -1024,12 +1273,16 @@ function idct32_1d_4x32_pass1_neon > sub r0, r0, #256 > .purgem store_rev > > - @ Move r2 back to the start of the input, and move > - @ to the first odd row > - sub r2, r2, r12, lsl #4 > + @ Move r2 to the first odd row > add r2, r2, #64 > > vmov.s16 d4, #0 > + > + cmp r3, #34 > + ble 3f > + cmp r3, #135 > + ble 4f > + > @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vld1.16 {d\i}, [r2,:64] > @@ -1037,7 +1290,22 @@ function idct32_1d_4x32_pass1_neon > .endr > > bl idct32_odd > + b 5f > +3: > +.irp i, 16, 17, 18, 19 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + bl idct32_odd_quarter > + b 5f > +4: > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > + vld1.16 {d\i}, [r2,:64] > + vst1.16 {d4}, [r2,:64], r12 > +.endr > + bl idct32_odd_half > > +5: > transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 > > @ Store the registers a, b, c, d horizontally, > @@ -1078,6 +1346,12 @@ function idct32_1d_4x32_pass2_neon > vld1.16 {q0-q1}, [r12,:128] > > mov r12, #128 > + > + cmp r3, #34 > + ble 3f > + cmp r3, #135 > + ble 4f > + > @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vld1.16 {d\i}, [r2,:64], r12 > @@ -1085,7 +1359,23 @@ function idct32_1d_4x32_pass2_neon > sub r2, r2, r12, lsl #4 > > bl idct16 > + b 5f > +3: > +.irp i, 16, 17, 18, 19 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + sub r2, r2, r12, lsl #2 > + bl idct16_quarter > + b 5f > + > +4: > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + sub r2, r2, r12, lsl #3 > + bl idct16_half > > +5: > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vst1.16 {d\i}, [r2,:64], r12 > .endr > @@ -1093,15 +1383,36 @@ function idct32_1d_4x32_pass2_neon > sub r2, r2, r12, lsl #4 > add r2, r2, #64 > > + cmp r3, #34 > + ble 3f > + cmp r3, #135 > + ble 4f > + > @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > vld1.16 {d\i}, [r2,:64], r12 > .endr > sub r2, r2, r12, lsl #4 > - sub r2, r2, #64 > > bl idct32_odd > + b 5f > > +3: > +.irp i, 16, 17, 18, 19 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + sub r2, r2, r12, lsl #2 > + bl idct32_odd_quarter > + b 5f > +4: > +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 > + vld1.16 {d\i}, [r2,:64], r12 > +.endr > + sub r2, r2, r12, lsl #3 > + bl idct32_odd_half > + > +5: > + sub r2, r2, #64 > mov r12, #128 > .macro load_acc_store a, b, c, d, neg=0 > vld1.16 {d4}, [r2,:64], r12 otherwise ok JAnne
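For readers following the macro changes reviewed above: the mbutterfly macros compute the usual fixed-point rotation out1 = (a*c1 - b*c2 + (1 << 13)) >> 14, out2 = (a*c2 + b*c1 + (1 << 13)) >> 14, and the new _h1/_h2 variants only drop the products whose operand is known to be zero. A rough scalar C model of that idea follows; the names and layout are illustrative, not taken from the patch.

    #include <stdint.h>

    /* Rounding shift used by all the butterfly macros: (x + (1 << 13)) >> 14. */
    static int16_t round14(int32_t x)
    {
        return (int16_t)((x + (1 << 13)) >> 14);
    }

    /* Full butterfly, as in the existing mbutterfly macro. */
    static void mbutterfly_c(int16_t *io1, int16_t *io2, int16_t c1, int16_t c2)
    {
        int32_t t1 = (int32_t)*io1 * c1 - (int32_t)*io2 * c2;
        int32_t t2 = (int32_t)*io1 * c2 + (int32_t)*io2 * c1;
        *io1 = round14(t1);
        *io2 = round14(t2);
    }

    /* "Half" variant corresponding to mbutterfly_h1: io2 is known to be zero,
     * so the two products against it disappear and only two multiplies remain.
     * mbutterfly_h2 is the mirror case (io1 known zero), where the surviving
     * first term is negated - hence the extra vneg in the NEON version. */
    static void mbutterfly_h1_c(int16_t *io1, int16_t *io2, int16_t c1, int16_t c2)
    {
        int32_t t1 = (int32_t)*io1 * c1;
        int32_t t2 = (int32_t)*io1 * c2;
        *io1 = round14(t1);
        *io2 = round14(t2);
    }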
On Fri, 3 Feb 2017, Janne Grunau wrote: > On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote: >> This work is sponsored by, and copyright, Google. >> >> This increases the code size of libavcodec/arm/vp9itxfm_neon.o >> from 12388 to 15064 bytes. >> >> Before: Cortex A7 A8 A9 A53 >> vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.7 211.9 235.8 >> vp9_inv_dct_dct_16x16_sub2_add_neon: 2056.7 1521.2 1734.8 1262.0 >> vp9_inv_dct_dct_16x16_sub4_add_neon: 2060.8 1608.5 1735.7 1262.0 >> vp9_inv_dct_dct_16x16_sub8_add_neon: 2444.9 1801.6 2007.8 1508.5 >> vp9_inv_dct_dct_16x16_sub12_add_neon: 2902.1 2116.7 2285.1 1751.7 >> vp9_inv_dct_dct_16x16_sub16_add_neon: 3211.2 2443.5 2546.1 1999.5 >> vp9_inv_dct_dct_32x32_sub1_add_neon: 752.0 456.7 866.0 553.9 >> vp9_inv_dct_dct_32x32_sub2_add_neon: 11042.7 8127.5 8582.7 6822.8 >> vp9_inv_dct_dct_32x32_sub4_add_neon: 10682.0 8043.8 8581.3 6810.1 >> vp9_inv_dct_dct_32x32_sub8_add_neon: 11908.0 9281.8 9381.9 7562.4 >> vp9_inv_dct_dct_32x32_sub12_add_neon: 13015.2 10791.1 10220.3 8318.9 >> vp9_inv_dct_dct_32x32_sub16_add_neon: 14150.3 11886.2 11032.6 9064.8 >> vp9_inv_dct_dct_32x32_sub20_add_neon: 15165.7 12993.8 11847.0 9816.7 >> vp9_inv_dct_dct_32x32_sub24_add_neon: 16280.8 15111.2 12658.6 10576.8 >> vp9_inv_dct_dct_32x32_sub28_add_neon: 17412.6 15549.4 13462.7 11325.6 >> vp9_inv_dct_dct_32x32_sub32_add_neon: 18522.4 17277.4 14286.7 12087.9 >> >> After: >> vp9_inv_dct_dct_16x16_sub1_add_neon: 273.0 189.5 211.5 236.1 >> vp9_inv_dct_dct_16x16_sub2_add_neon: 1448.2 994.0 1191.3 836.0 >> vp9_inv_dct_dct_16x16_sub4_add_neon: 1437.0 991.0 1191.6 836.0 >> vp9_inv_dct_dct_16x16_sub8_add_neon: 2114.5 1757.9 1855.3 1335.3 >> vp9_inv_dct_dct_16x16_sub12_add_neon: 2862.7 2141.5 2293.3 1772.7 >> vp9_inv_dct_dct_16x16_sub16_add_neon: 3299.6 2419.1 2552.7 2033.0 >> vp9_inv_dct_dct_32x32_sub1_add_neon: 753.0 457.5 864.3 554.8 >> vp9_inv_dct_dct_32x32_sub2_add_neon: 7867.8 5978.6 6594.6 5109.9 >> vp9_inv_dct_dct_32x32_sub4_add_neon: 7871.0 5772.5 6582.2 5108.5 >> vp9_inv_dct_dct_32x32_sub8_add_neon: 8694.8 6925.7 7125.7 5671.4 >> vp9_inv_dct_dct_32x32_sub12_add_neon: 11250.3 9654.7 9557.6 7540.5 >> vp9_inv_dct_dct_32x32_sub16_add_neon: 12129.5 11061.1 10295.0 8220.7 >> vp9_inv_dct_dct_32x32_sub20_add_neon: 15218.4 13580.8 11841.3 9739.9 >> vp9_inv_dct_dct_32x32_sub24_add_neon: 16343.5 15097.0 12629.2 10496.6 >> vp9_inv_dct_dct_32x32_sub28_add_neon: 17482.2 15516.4 13476.0 11261.0 >> vp9_inv_dct_dct_32x32_sub32_add_neon: 18586.7 16817.5 14289.3 12019.0 >> >> --- >> If we wouldn't have made the core transforms standalone functions >> in the previous patch, the code size would increase to around 21 KB (which >> isn't too bad), but the idct32 pass1/2 functions would bloat up so much >> that they would require literal pools within the functions themselves. >> --- >> libavcodec/arm/vp9itxfm_neon.S | 351 ++++++++++++++++++++++++++++++++++++++--- >> 1 file changed, 331 insertions(+), 20 deletions(-) >> >> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S >> index 22e63e5..bd3f678 100644 >> --- a/libavcodec/arm/vp9itxfm_neon.S >> +++ b/libavcodec/arm/vp9itxfm_neon.S >> @@ -74,6 +74,14 @@ endconst >> vrshrn.s32 \out2, \tmpq4, #14 >> .endm >> >> +@ Same as mbutterfly0 above, but treating the input in in2 as zero, >> +@ writing the same output into both out1 and out2. 
>> +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 >> + vmull.s16 \tmpq3, \in1, d0[0] >> + vrshrn.s32 \out1, \tmpq3, #14 >> + vmov \out2, \out1 > > if you haven't already tried doing the vrshrn twice could be faster > since it has less dependencies Didn't think of that - it does indeed seem to help (both here and in the aarch64 version), so applied that. >> @@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon >> >> mov r12, #32 >> vmov.s16 q2, #0 >> + >> +.ifc \txfm,idct >> + cmp r3, #10 >> + ble 3f >> + cmp r3, #38 >> + ble 4f >> +.endif > > I'd test only for less or equal 38 here > >> + >> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 >> vld1.16 {d\i}, [r2,:64] >> vst1.16 {d4}, [r2,:64], r12 >> .endr >> >> bl \txfm\()16 >> +.ifc \txfm,idct >> + b 5f > > cmp r3, #10 > >> + >> +3: >> +.irp i, 16, 17, 18, 19 >> + vld1.16 {d\i}, [r2,:64] >> + vst1.16 {d4}, [r2,:64], r12 >> +.endr >> + bl idct16_quarter >> + b 5f > > remove this > >> + >> +4: >> +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 >> + vld1.16 {d\i}, [r2,:64] >> + vst1.16 {d4}, [r2,:64], r12 > > .if \i == 19 > blle idct16_half > ble 5f > .endif > > saves a little binary space not sure if it's worth it. Thanks for the reviews! Hmm, that looks pretty neat. I folded in this change into the aarch64 version (and the rshrn instead of mov) as well, using a b.gt instead of conditional bl, like this: .if \i == 19 b.gt 4f bl idct16_quarter b 5f 4: .endif In principle I guess one could interleave the same in the full loop as well, having only one loop, with special case checks for i == 19 and i == 23. Then we'd end up with two comparisons instead of one when doing the full case - not sure if it's preferrable or not. The main question though is whether you prefer this or alternative 2. // Martin
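The scheduling point raised in the review and confirmed above (narrowing the same wide product twice instead of narrow + vmov) is easier to see written out with intrinsics. This is purely an illustration; the real code stays hand-written assembly, and the coefficient is passed here as a full vector rather than the d0[0] lane used in the macros.

    #include <arm_neon.h>

    /* Variant as originally posted: the vmov depends on the first vrshrn. */
    static inline void mbutterfly0_h_copy(int16x4_t *out1, int16x4_t *out2,
                                          int16x4_t in1, int16x4_t coef)
    {
        int32x4_t t = vmull_s16(in1, coef);
        *out1 = vrshrn_n_s32(t, 14);
        *out2 = *out1;                 /* copy of the narrowed result */
    }

    /* Variant suggested in the review: both narrowings depend only on t,
     * so the two output-producing instructions can schedule independently. */
    static inline void mbutterfly0_h_twice(int16x4_t *out1, int16x4_t *out2,
                                           int16x4_t in1, int16x4_t coef)
    {
        int32x4_t t = vmull_s16(in1, coef);
        *out1 = vrshrn_n_s32(t, 14);
        *out2 = vrshrn_n_s32(t, 14);
    }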
On 2017-02-03 23:44:51 +0200, Martin Storsjö wrote:
> On Fri, 3 Feb 2017, Janne Grunau wrote:
>
> >On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
> >>This work is sponsored by, and copyright, Google.
> >>
>
> >>@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
> >>
> >> mov r12, #32
> >> vmov.s16 q2, #0
> >>+
> >>+.ifc \txfm,idct
> >>+ cmp r3, #10
> >>+ ble 3f
> >>+ cmp r3, #38
> >>+ ble 4f
> >>+.endif
> >
> >I'd test only for less or equal 38 here
> >
> >>+
> >> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> >> vld1.16 {d\i}, [r2,:64]
> >> vst1.16 {d4}, [r2,:64], r12
> >> .endr
> >>
> >> bl \txfm\()16
> >>+.ifc \txfm,idct
> >>+ b 5f
> >
> >cmp r3, #10
> >
> >>+
> >>+3:
> >>+.irp i, 16, 17, 18, 19
> >>+ vld1.16 {d\i}, [r2,:64]
> >>+ vst1.16 {d4}, [r2,:64], r12
> >>+.endr
> >>+ bl idct16_quarter
> >>+ b 5f
> >
> >remove this
> >
> >>+
> >>+4:
> >>+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
> >>+ vld1.16 {d\i}, [r2,:64]
> >>+ vst1.16 {d4}, [r2,:64], r12
> >
> >.if \i == 19
> >blle idct16_half
> >ble 5f
> >.endif
> >
> >saves a little binary space not sure if it's worth it.
>
> Hmm, that looks pretty neat.
>
> I folded in this change into the aarch64 version (and the rshrn instead of
> mov) as well, using a b.gt instead of conditional bl, like this:
>
> .if \i == 19
> b.gt 4f
> bl idct16_quarter
> b 5f
> 4:
> .endif
>
> In principle I guess one could interleave the same in the full loop as well,
> having only one loop, with special case checks for i == 19 and i == 23. Then
> we'd end up with two comparisons instead of one when doing the full case -
> not sure if it's preferrable or not.

I doubt the comparisons are noticeable. so folding it into the main loop
should be fine.

> The main question though is whether you prefer this or alternative 2.

see my other mail. I have no strong opinion.

Janne
On Sat, 4 Feb 2017, Janne Grunau wrote:

> On 2017-02-03 23:44:51 +0200, Martin Storsjö wrote:
>> On Fri, 3 Feb 2017, Janne Grunau wrote:
>>
>> >On 2016-12-01 11:26:57 +0200, Martin Storsjö wrote:
>> >>This work is sponsored by, and copyright, Google.
>> >>
>>
>> >>@@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon
>> >>
>> >> mov r12, #32
>> >> vmov.s16 q2, #0
>> >>+
>> >>+.ifc \txfm,idct
>> >>+ cmp r3, #10
>> >>+ ble 3f
>> >>+ cmp r3, #38
>> >>+ ble 4f
>> >>+.endif
>> >
>> >I'd test only for less or equal 38 here
>> >
>> >>+
>> >> .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
>> >> vld1.16 {d\i}, [r2,:64]
>> >> vst1.16 {d4}, [r2,:64], r12
>> >> .endr
>> >>
>> >> bl \txfm\()16
>> >>+.ifc \txfm,idct
>> >>+ b 5f
>> >
>> >cmp r3, #10
>> >
>> >>+
>> >>+3:
>> >>+.irp i, 16, 17, 18, 19
>> >>+ vld1.16 {d\i}, [r2,:64]
>> >>+ vst1.16 {d4}, [r2,:64], r12
>> >>+.endr
>> >>+ bl idct16_quarter
>> >>+ b 5f
>> >
>> >remove this
>> >
>> >>+
>> >>+4:
>> >>+.irp i, 16, 17, 18, 19, 20, 21, 22, 23
>> >>+ vld1.16 {d\i}, [r2,:64]
>> >>+ vst1.16 {d4}, [r2,:64], r12
>> >
>> >.if \i == 19
>> >blle idct16_half
>> >ble 5f
>> >.endif
>> >
>> >saves a little binary space not sure if it's worth it.
>>
>> Hmm, that looks pretty neat.
>>
>> I folded in this change into the aarch64 version (and the rshrn instead of
>> mov) as well, using a b.gt instead of conditional bl, like this:
>>
>> .if \i == 19
>> b.gt 4f
>> bl idct16_quarter
>> b 5f
>> 4:
>> .endif
>>
>> In principle I guess one could interleave the same in the full loop as well,
>> having only one loop, with special case checks for i == 19 and i == 23. Then
>> we'd end up with two comparisons instead of one when doing the full case -
>> not sure if it's preferrable or not.
>
> I doubt the comparisons are noticeable. so folding it into the main loop
> should be fine.

Hmm, indeed. And in this case, the diff of this alternative turns out pretty
small and neat actually.

// Martin
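In C terms, the single-loop variant agreed on here, with the early exits folded into the load loop at the points where 4 and 8 rows have been read, looks roughly like the sketch below. The helper names and the memcpy stand in for the vld1/vst1 sequences and the real transform bodies; only the eob thresholds 10 and 38 are taken from the patch.

    #include <stdint.h>
    #include <string.h>

    /* Stubs standing in for the assembly routines; bodies omitted. */
    static void idct16_quarter_c(int16_t (*rows)[4]) { (void)rows; }
    static void idct16_half_c(int16_t (*rows)[4])    { (void)rows; }
    static void idct16_full_c(int16_t (*rows)[4])    { (void)rows; }

    /* eob <= 10: only 4 input rows are loaded, eob <= 38: only 8;
     * everything not loaded is treated as zero by the cheaper routines. */
    static void idct16_pass_sketch(const int16_t *src, int eob)
    {
        int16_t rows[16][4];

        for (int i = 0; i < 16; i++) {
            memcpy(rows[i], src + 4 * i, sizeof(rows[i]));
            if (i == 3 && eob <= 10) {   /* row 3 = d19, i.e. the .if \i == 19 check */
                idct16_quarter_c(rows);
                return;
            }
            if (i == 7 && eob <= 38) {   /* row 7 = d23, i.e. an i == 23 check */
                idct16_half_c(rows);
                return;
            }
        }
        idct16_full_c(rows);
    }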
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 22e63e5..bd3f678 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -74,6 +74,14 @@ endconst vrshrn.s32 \out2, \tmpq4, #14 .endm +@ Same as mbutterfly0 above, but treating the input in in2 as zero, +@ writing the same output into both out1 and out2. +.macro mbutterfly0_h out1, out2, in1, in2, tmpd1, tmpd2, tmpq3, tmpq4 + vmull.s16 \tmpq3, \in1, d0[0] + vrshrn.s32 \out1, \tmpq3, #14 + vmov \out2, \out1 +.endm + @ out1,out2 = ((in1 + in2) * d0[0] + (1 << 13)) >> 14 @ out3,out4 = ((in1 - in2) * d0[0] + (1 << 13)) >> 14 @ Same as mbutterfly0, but with input being 2 q registers, output @@ -137,6 +145,23 @@ endconst vrshrn.s32 \inout2, \tmp2, #14 .endm +@ Same as mbutterfly above, but treating the input in inout2 as zero +.macro mbutterfly_h1 inout1, inout2, coef1, coef2, tmp1, tmp2 + vmull.s16 \tmp1, \inout1, \coef1 + vmull.s16 \tmp2, \inout1, \coef2 + vrshrn.s32 \inout1, \tmp1, #14 + vrshrn.s32 \inout2, \tmp2, #14 +.endm + +@ Same as mbutterfly above, but treating the input in inout1 as zero +.macro mbutterfly_h2 inout1, inout2, coef1, coef2, tmp1, tmp2 + vmull.s16 \tmp1, \inout2, \coef2 + vmull.s16 \tmp2, \inout2, \coef1 + vneg.s32 \tmp1, \tmp1 + vrshrn.s32 \inout2, \tmp2, #14 + vrshrn.s32 \inout1, \tmp1, #14 +.endm + @ inout1,inout2 = (inout1,inout2 * coef1 - inout3,inout4 * coef2 + (1 << 13)) >> 14 @ inout3,inout4 = (inout1,inout2 * coef2 + inout3,inout4 * coef1 + (1 << 13)) >> 14 @ inout are 4 d registers, tmp are 4 q registers @@ -534,7 +559,7 @@ function idct16x16_dc_add_neon endfunc .ltorg -function idct16 +.macro idct16_full mbutterfly0 d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a mbutterfly d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a mbutterfly d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a @@ -556,7 +581,10 @@ function idct16 mbutterfly0 d22, d26, d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + idct16_end +.endm +.macro idct16_end butterfly d18, d7, d4, d7 @ d18 = t0a, d7 = t7a butterfly d19, d22, d5, d22 @ d19 = t1a, d22 = t6 butterfly d4, d26, d20, d26 @ d4 = t2a, d26 = t5 @@ -581,6 +609,66 @@ function idct16 butterfly d20, d27, d6, d27 @ d20 = out[4], d27 = out[11] butterfly d21, d26, d26, d4 @ d21 = out[5], d26 = out[10] bx lr +.endm + +function idct16 + idct16_full +endfunc + +function idct16_half + mbutterfly0_h d16, d24, d16, d24, d4, d6, q2, q3 @ d16 = t0a, d24 = t1a + mbutterfly_h1 d20, d28, d0[1], d0[2], q2, q3 @ d20 = t2a, d28 = t3a + mbutterfly_h1 d18, d30, d0[3], d1[0], q2, q3 @ d18 = t4a, d30 = t7a + mbutterfly_h2 d26, d22, d1[1], d1[2], q2, q3 @ d26 = t5a, d22 = t6a + mbutterfly_h1 d17, d31, d1[3], d2[0], q2, q3 @ d17 = t8a, d31 = t15a + mbutterfly_h2 d25, d23, d2[1], d2[2], q2, q3 @ d25 = t9a, d23 = t14a + mbutterfly_h1 d21, d27, d2[3], d3[0], q2, q3 @ d21 = t10a, d27 = t13a + mbutterfly_h2 d29, d19, d3[1], d3[2], q2, q3 @ d29 = t11a, d19 = t12a + + butterfly d4, d28, d16, d28 @ d4 = t0, d28 = t3 + butterfly d5, d20, d24, d20 @ d5 = t1, d20 = t2 + butterfly d6, d26, d18, d26 @ d6 = t4, d26 = t5 + butterfly d7, d22, d30, d22 @ d7 = t7, d22 = t6 + butterfly d16, d25, d17, d25 @ d16 = t8, d25 = t9 + butterfly d24, d21, d29, d21 @ d24 = t11, d21 = t10 + butterfly d17, d27, d19, d27 @ d17 = t12, d27 = t13 + butterfly d29, d23, d31, d23 @ d29 = t15, d23 = t14 + + mbutterfly0 d22, d26, 
d22, d26, d18, d30, q9, q15 @ d22 = t6a, d26 = t5a + mbutterfly d23, d25, d0[1], d0[2], q9, q15 @ d23 = t9a, d25 = t14a + mbutterfly d27, d21, d0[1], d0[2], q9, q15, neg=1 @ d27 = t13a, d21 = t10a + idct16_end +endfunc + +function idct16_quarter + vmull.s16 q12, d19, d3[2] + vmull.s16 q2, d17, d1[3] + vmull.s16 q3, d18, d1[0] + vmull.s16 q15, d18, d0[3] + vneg.s32 q12, q12 + vmull.s16 q14, d17, d2[0] + vmull.s16 q13, d19, d3[1] + vmull.s16 q11, d16, d0[0] + vrshrn.s32 d24, q12, #14 + vrshrn.s32 d16, q2, #14 + vrshrn.s32 d7, q3, #14 + vrshrn.s32 d6, q15, #14 + vrshrn.s32 d29, q14, #14 + vrshrn.s32 d17, q13, #14 + vrshrn.s32 d28, q11, #14 + + mbutterfly_l q10, q11, d17, d24, d0[1], d0[2] + mbutterfly_l q9, q15, d29, d16, d0[1], d0[2] + vneg.s32 q11, q11 + vrshrn.s32 d27, q10, #14 + vrshrn.s32 d21, q11, #14 + vrshrn.s32 d23, q9, #14 + vrshrn.s32 d25, q15, #14 + vmov d4, d28 + vmov d5, d28 + mbutterfly0 d22, d26, d7, d6, d18, d30, q9, q15 + vmov d20, d28 + idct16_end endfunc function iadst16 @@ -668,13 +756,40 @@ function \txfm\()16_1d_4x16_pass1_neon mov r12, #32 vmov.s16 q2, #0 + +.ifc \txfm,idct + cmp r3, #10 + ble 3f + cmp r3, #38 + ble 4f +.endif + .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] vst1.16 {d4}, [r2,:64], r12 .endr bl \txfm\()16 +.ifc \txfm,idct + b 5f + +3: +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + bl idct16_quarter + b 5f + +4: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + bl idct16_half +.endif +5: @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @ contain the transposed 4x4 blocks. @@ -721,58 +836,80 @@ endfunc @ r0 = dst @ r1 = dst stride @ r2 = src (temp buffer) -@ r3 = slice offset +@ r3 = eob +@ r8 = slice offset function \txfm\()16_1d_4x16_pass2_neon push {lr} mov r12, #32 +.ifc \txfm,idct + cmp r3, #10 + ble 3f + cmp r3, #38 + ble 4f +.endif .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27 vld1.16 {d\i}, [r2,:64], r12 .endr - cmp r3, #0 + cmp r8, #0 beq 1f .irp i, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64], r12 .endr 1: - add r3, r0, r1 - lsl r1, r1, #1 bl \txfm\()16 +.ifc \txfm,idct + b 5f +3: +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr + bl idct16_quarter + b 5f +4: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr + bl idct16_half +.endif +5: + add r8, r0, r1 + lsl r1, r1, #1 .macro load_add_store coef0, coef1, coef2, coef3 vrshr.s16 \coef0, \coef0, #6 vrshr.s16 \coef1, \coef1, #6 vld1.32 {d4[]}, [r0,:32], r1 - vld1.32 {d4[1]}, [r3,:32], r1 + vld1.32 {d4[1]}, [r8,:32], r1 vrshr.s16 \coef2, \coef2, #6 vrshr.s16 \coef3, \coef3, #6 vld1.32 {d5[]}, [r0,:32], r1 - vld1.32 {d5[1]}, [r3,:32], r1 + vld1.32 {d5[1]}, [r8,:32], r1 vaddw.u8 \coef0, \coef0, d4 vld1.32 {d6[]}, [r0,:32], r1 - vld1.32 {d6[1]}, [r3,:32], r1 + vld1.32 {d6[1]}, [r8,:32], r1 vaddw.u8 \coef1, \coef1, d5 vld1.32 {d7[]}, [r0,:32], r1 - vld1.32 {d7[1]}, [r3,:32], r1 + vld1.32 {d7[1]}, [r8,:32], r1 vqmovun.s16 d4, \coef0 vqmovun.s16 d5, \coef1 sub r0, r0, r1, lsl #2 - sub r3, r3, r1, lsl #2 + sub r8, r8, r1, lsl #2 vaddw.u8 \coef2, \coef2, d6 vaddw.u8 \coef3, \coef3, d7 vst1.32 {d4[0]}, [r0,:32], r1 - vst1.32 {d4[1]}, [r3,:32], r1 + vst1.32 {d4[1]}, [r8,:32], r1 vqmovun.s16 d6, \coef2 vst1.32 {d5[0]}, [r0,:32], r1 - vst1.32 {d5[1]}, [r3,:32], r1 + vst1.32 {d5[1]}, [r8,:32], r1 vqmovun.s16 d7, \coef3 vst1.32 {d6[0]}, [r0,:32], r1 - vst1.32 
{d6[1]}, [r3,:32], r1 + vst1.32 {d6[1]}, [r8,:32], r1 vst1.32 {d7[0]}, [r0,:32], r1 - vst1.32 {d7[1]}, [r3,:32], r1 + vst1.32 {d7[1]}, [r8,:32], r1 .endm load_add_store q8, q9, q10, q11 load_add_store q12, q13, q14, q15 @@ -799,6 +936,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 push {r4-r8,lr} .ifnc \txfm1\()_\txfm2,idct_idct vpush {q4-q7} + mov r3, #256 .else movrel r8, min_eob_idct_idct_16 + 2 .endif @@ -859,7 +997,7 @@ A and r7, sp, #15 add r0, r4, #(\i) mov r1, r5 add r2, sp, #(\i*2) - mov r3, #\i + mov r8, #\i bl \txfm2\()16_1d_4x16_pass2_neon .endr @@ -913,7 +1051,7 @@ function idct32x32_dc_add_neon bx lr endfunc -function idct32_odd +.macro idct32_odd_full movrel r12, idct_coeffs add r12, r12, #32 vld1.16 {q0-q1}, [r12,:128] @@ -943,7 +1081,10 @@ function idct32_odd mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + idct32_end +.endm +.macro idct32_end butterfly d16, d5, d4, d5 @ d16 = t16a, d5 = t19a butterfly d17, d20, d23, d20 @ d17 = t17, d20 = t18 butterfly d18, d6, d7, d6 @ d18 = t23a, d6 = t20a @@ -973,6 +1114,91 @@ function idct32_odd mbutterfly0 d25, d22, d25, d22, d4, d6, q2, q3 @ d25 = t25, d22 = t22 mbutterfly0 d24, d23, d24, d23, d4, d6, q2, q3 @ d24 = t24a, d23 = t23a bx lr +.endm + +function idct32_odd + idct32_odd_full +endfunc + +function idct32_odd_half + movrel r12, idct_coeffs + add r12, r12, #32 + vld1.16 {q0-q1}, [r12,:128] + + mbutterfly_h1 d16, d31, d0[0], d0[1], q2, q3 @ d16 = t16a, d31 = t31a + mbutterfly_h2 d24, d23, d0[2], d0[3], q2, q3 @ d24 = t17a, d23 = t30a + mbutterfly_h1 d20, d27, d1[0], d1[1], q2, q3 @ d20 = t18a, d27 = t29a + mbutterfly_h2 d28, d19, d1[2], d1[3], q2, q3 @ d28 = t19a, d19 = t28a + mbutterfly_h1 d18, d29, d2[0], d2[1], q2, q3 @ d18 = t20a, d29 = t27a + mbutterfly_h2 d26, d21, d2[2], d2[3], q2, q3 @ d26 = t21a, d21 = t26a + mbutterfly_h1 d22, d25, d3[0], d3[1], q2, q3 @ d22 = t22a, d25 = t25a + mbutterfly_h2 d30, d17, d3[2], d3[3], q2, q3 @ d30 = t23a, d17 = t24a + + sub r12, r12, #32 + vld1.16 {q0}, [r12,:128] + + butterfly d4, d24, d16, d24 @ d4 = t16, d24 = t17 + butterfly d5, d20, d28, d20 @ d5 = t19, d20 = t18 + butterfly d6, d26, d18, d26 @ d6 = t20, d26 = t21 + butterfly d7, d22, d30, d22 @ d7 = t23, d22 = t22 + butterfly d28, d25, d17, d25 @ d28 = t24, d25 = t25 + butterfly d30, d21, d29, d21 @ d30 = t27, d21 = t26 + butterfly d29, d23, d31, d23 @ d29 = t31, d23 = t30 + butterfly d31, d27, d19, d27 @ d31 = t28, d27 = t29 + + mbutterfly d23, d24, d0[3], d1[0], q8, q9 @ d23 = t17a, d24 = t30a + mbutterfly d27, d20, d0[3], d1[0], q8, q9, neg=1 @ d27 = t29a, d20 = t18a + mbutterfly d21, d26, d1[1], d1[2], q8, q9 @ d21 = t21a, d26 = t26a + mbutterfly d25, d22, d1[1], d1[2], q8, q9, neg=1 @ d25 = t25a, d22 = t22a + + idct32_end +endfunc + +function idct32_odd_quarter + movrel r12, idct_coeffs + add r12, r12, #32 + vld1.16 {q0-q1}, [r12,:128] + + vmull.s16 q2, d16, d0[0] + vmull.s16 q14, d19, d1[3] + vmull.s16 q15, d16, d0[1] + vmull.s16 q11, d17, d3[2] + vmull.s16 q3, d17, d3[3] + vmull.s16 q13, d19, d1[2] + vmull.s16 q10, d18, d2[0] + vmull.s16 q12, d18, d2[1] + + sub r12, r12, #32 + vld1.16 {q0}, [r12,:128] + + vneg.s32 q14, q14 + vneg.s32 q3, q3 + + vrshrn.s32 d4, q2, #14 + vrshrn.s32 d5, q14, #14 + vrshrn.s32 d29, q15, #14 + vrshrn.s32 d28, q11, #14 + vrshrn.s32 d7, q3, #14 + vrshrn.s32 d31, q13, #14 + vrshrn.s32 d6, q10, #14 + vrshrn.s32 d30, 
q12, #14 + + mbutterfly_l q8, q9, d29, d4, d0[3], d1[0] + mbutterfly_l q13, q10, d31, d5, d0[3], d1[0] + vrshrn.s32 d23, q8, #14 + vrshrn.s32 d24, q9, #14 + vneg.s32 q10, q10 + vrshrn.s32 d27, q13, #14 + vrshrn.s32 d20, q10, #14 + mbutterfly_l q8, q9, d30, d6, d1[1], d1[2] + vrshrn.s32 d21, q8, #14 + vrshrn.s32 d26, q9, #14 + mbutterfly_l q8, q9, d28, d7, d1[1], d1[2] + vrshrn.s32 d25, q8, #14 + vneg.s32 q9, q9 + vrshrn.s32 d22, q9, #14 + + idct32_end endfunc @ Do an 32-point IDCT of a 4x32 slice out of a 32x32 matrix. @@ -994,6 +1220,11 @@ function idct32_1d_4x32_pass1_neon mov r12, #128 vmov.s16 d4, #0 + cmp r3, #34 + ble 3f + cmp r3, #135 + ble 4f + @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] @@ -1001,7 +1232,25 @@ function idct32_1d_4x32_pass1_neon .endr bl idct16 + sub r2, r2, r12, lsl #4 + b 5f +3: +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + bl idct16_quarter + sub r2, r2, r12, lsl #2 + b 5f +4: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + bl idct16_half + sub r2, r2, r12, lsl #3 +5: @ Do four 4x4 transposes. Originally, d16-d31 contain the @ 16 rows. Afterwards, d16-d19, d20-d23, d24-d27, d28-d31 @ contain the transposed 4x4 blocks. @@ -1024,12 +1273,16 @@ function idct32_1d_4x32_pass1_neon sub r0, r0, #256 .purgem store_rev - @ Move r2 back to the start of the input, and move - @ to the first odd row - sub r2, r2, r12, lsl #4 + @ Move r2 to the first odd row add r2, r2, #64 vmov.s16 d4, #0 + + cmp r3, #34 + ble 3f + cmp r3, #135 + ble 4f + @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64] @@ -1037,7 +1290,22 @@ function idct32_1d_4x32_pass1_neon .endr bl idct32_odd + b 5f +3: +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + bl idct32_odd_quarter + b 5f +4: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64] + vst1.16 {d4}, [r2,:64], r12 +.endr + bl idct32_odd_half +5: transpose16_q_4x_4x4 q15, q14, q13, q12, q11, q10, q9, q8, d31, d30, d29, d28, d27, d26, d25, d24, d23, d22, d21, d20, d19, d18, d17, d16 @ Store the registers a, b, c, d horizontally, @@ -1078,6 +1346,12 @@ function idct32_1d_4x32_pass2_neon vld1.16 {q0-q1}, [r12,:128] mov r12, #128 + + cmp r3, #34 + ble 3f + cmp r3, #135 + ble 4f + @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64], r12 @@ -1085,7 +1359,23 @@ function idct32_1d_4x32_pass2_neon sub r2, r2, r12, lsl #4 bl idct16 + b 5f +3: +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #2 + bl idct16_quarter + b 5f + +4: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #3 + bl idct16_half +5: .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vst1.16 {d\i}, [r2,:64], r12 .endr @@ -1093,15 +1383,36 @@ function idct32_1d_4x32_pass2_neon sub r2, r2, r12, lsl #4 add r2, r2, #64 + cmp r3, #34 + ble 3f + cmp r3, #135 + ble 4f + @ d16 = IN(1), d17 = IN(3) ... 
d31 = IN(31) .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 vld1.16 {d\i}, [r2,:64], r12 .endr sub r2, r2, r12, lsl #4 - sub r2, r2, #64 bl idct32_odd + b 5f +3: +.irp i, 16, 17, 18, 19 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #2 + bl idct32_odd_quarter + b 5f +4: +.irp i, 16, 17, 18, 19, 20, 21, 22, 23 + vld1.16 {d\i}, [r2,:64], r12 +.endr + sub r2, r2, r12, lsl #3 + bl idct32_odd_half + +5: + sub r2, r2, #64 mov r12, #128 .macro load_acc_store a, b, c, d, neg=0 vld1.16 {d4}, [r2,:64], r12