Message ID | 1479906058-22747-4-git-send-email-martin@martin.st |
---|---|
State | Superseded |
Headers | show |
On 2016-11-23 15:00:51 +0200, Martin Storsjö wrote: > This work is sponsored by, and copyright, Google. > > Previously all subpartitions except the eob=1 (DC) case ran with > the same runtime: > > vp9_inv_dct_dct_16x16_sub16_add_neon: 3189.0 2486.8 2509.9 1964.1 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18448.1 16682.0 14235.4 11993.4 > > By skipping individual 4x16 or 4x32 pixel slices in the first pass, > we reduce the runtime of these functions like this: > > vp9_inv_dct_dct_16x16_sub1_add_neon: 271.5 188.7 211.6 235.1 > vp9_inv_dct_dct_16x16_sub4_add_neon: 2079.7 1606.3 1772.1 1264.8 > vp9_inv_dct_dct_16x16_sub8_add_neon: 2449.2 1834.3 2046.5 1499.7 > vp9_inv_dct_dct_16x16_sub12_add_neon: 2826.2 2109.2 2295.9 1758.2 > vp9_inv_dct_dct_16x16_sub16_add_neon: 3224.1 2476.5 2533.1 1985.7 > vp9_inv_dct_dct_32x32_sub1_add_neon: 752.5 457.5 863.7 554.7 > vp9_inv_dct_dct_32x32_sub4_add_neon: 10689.2 8013.4 8592.9 6785.9 > vp9_inv_dct_dct_32x32_sub8_add_neon: 12217.8 9068.1 9420.4 7518.3 > vp9_inv_dct_dct_32x32_sub12_add_neon: 12967.3 10455.5 10223.9 8275.7 > vp9_inv_dct_dct_32x32_sub16_add_neon: 14084.1 11933.7 10998.9 9012.5 > vp9_inv_dct_dct_32x32_sub20_add_neon: 15171.4 13335.0 11820.6 9757.2 > vp9_inv_dct_dct_32x32_sub24_add_neon: 16229.6 15185.7 12614.4 10504.9 > vp9_inv_dct_dct_32x32_sub28_add_neon: 17338.1 15955.3 13445.0 11248.4 > vp9_inv_dct_dct_32x32_sub32_add_neon: 18465.7 16974.6 14239.2 11999.1 > > I.e. in general a very minor overhead for the full subpartition case due > to the additional cmps, but a significant speedup for the cases when we > only need to process a small part of the actual input data. > > In common VP9 content in a few inspected clips, 70-90% of the non-dc-only > 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left > 8x8 or 16x16 subpartitions respectively. > --- > This goes on top of the checkasm vp9dsp patch that adds benchmarking > of generic subpartitions in the itxfm. 
> --- > libavcodec/arm/vp9itxfm_neon.S | 70 ++++++++++++++++++++++++++++++++++++------ > tests/checkasm/vp9dsp.c | 6 ++-- > 2 files changed, 64 insertions(+), 12 deletions(-) > > diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S > index 01944bd..769579a 100644 > --- a/libavcodec/arm/vp9itxfm_neon.S > +++ b/libavcodec/arm/vp9itxfm_neon.S > @@ -659,10 +659,17 @@ endfunc > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @ transpose into a horizontal 16x4 slice and store. > @ r0 = dst (temp buffer) > -@ r1 = unused > +@ r1 = slice offset > @ r2 = src > -@ r3 = slice offset > +@ r3 = eob > +@ r9 = min eob > function \txfm\()16_1d_4x16_pass1_neon > +.ifc \txfm,idct > + @ Check if this whole input slice is zero > + cmp r3, r9 > + ble 2f Once this check is true, it is true for all remaining slices, so we should move it out to the main function. > +.endif > + > mov r12, #32 > vmov.s16 q2, #0 > .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 > @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon > transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 > > @ Store the transposed 4x4 blocks horizontally. > - cmp r3, #12 > + cmp r1, #12 > beq 1f > .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 > vst1.16 {d\i}, [r0,:64]! 
> .endr > bx lr > 1: > - @ Special case: For the last input column (r3 == 12), > + @ Special case: For the last input column (r1 == 12), > @ which would be stored as the last row in the temp buffer, > @ don't store the first 4x4 block, but keep it in registers > @ for the first slice of the second pass (where it is the > @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon > vmov d30, d18 > vmov d31, d19 > bx lr > + > +.ifc \txfm,idct > +2: > + @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2 > + vmov.i16 q14, #0 > + vmov.i16 q15, #0 > + @ Write zeros to the temp buffer for pass 2 > +.rept 4 > + vst1.16 {q14-q15}, [r0,:128]! > +.endr > + bx lr > +.endif > endfunc > > @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, > @@ -781,15 +800,23 @@ endfunc > itxfm16_1d_funcs idct > itxfm16_1d_funcs iadst > > +@ This is the minimum eob value for each subpartition, in increments of 4 > +const min_eob_idct_idct_16, align=4 > + .short 0, 10, 38, 89 > +endconst > + > .macro itxfm_func16x16 txfm1, txfm2 > function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 > .ifc \txfm1\()_\txfm2,idct_idct > cmp r3, #1 > beq idct16x16_dc_add_neon > .endif > - push {r4-r7,lr} > + push {r4-r9,lr} > .ifnc \txfm1\()_\txfm2,idct_idct > vpush {q4-q7} > + mov r9, #0 > +.else > + movrel r8, min_eob_idct_idct_16 > .endif > > @ Align the stack, allocate a temp buffer > @@ -810,8 +837,11 @@ A and r7, sp, #15 > > .irp i, 0, 4, 8, 12 > add r0, sp, #(\i*32) > + mov r1, #\i > add r2, r6, #(\i*2) > - mov r3, #\i > +.ifc \txfm1\()_\txfm2,idct_idct > + ldrh r9, [r8, #(\i/2)] Using the writeback variant would look clearer IMO, although it increases the code size for Thumb (if we care about that). > +.endif Move this to the beginning and load to r1, cmp with eob, conditionally store how much stack space needs to be cleared, and jump out of the '.irp'. This saves r9, as well as several jumps and comparisons if eob is small. 
> bl \txfm1\()16_1d_4x16_pass1_neon > .endr > .ifc \txfm2,idct > @@ -830,7 +860,7 @@ A and r7, sp, #15 > .ifnc \txfm1\()_\txfm2,idct_idct > vpop {q4-q7} > .endif > - pop {r4-r7,pc} > + pop {r4-r9,pc} > endfunc > .endm > > @@ -944,9 +974,14 @@ endfunc > @ each output written twice), followed by a separate 16-point IDCT > @ of the odd inputs, added/subtracted onto the outputs of the first idct16. > @ r0 = dst (temp buffer) > -@ r1 = unused > +@ r1 = min eob > @ r2 = src > +@ r3 = eob > function idct32_1d_4x32_pass1_neon > + @ Check if this whole input slice is zero > + cmp r3, r1 > + ble 1f The same applies here as for the 16x16 idct. Janne
diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S index 01944bd..769579a 100644 --- a/libavcodec/arm/vp9itxfm_neon.S +++ b/libavcodec/arm/vp9itxfm_neon.S @@ -659,10 +659,17 @@ endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @ transpose into a horizontal 16x4 slice and store. @ r0 = dst (temp buffer) -@ r1 = unused +@ r1 = slice offset @ r2 = src -@ r3 = slice offset +@ r3 = eob +@ r9 = min eob function \txfm\()16_1d_4x16_pass1_neon +.ifc \txfm,idct + @ Check if this whole input slice is zero + cmp r3, r9 + ble 2f +.endif + mov r12, #32 vmov.s16 q2, #0 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31 @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon transpose16_q_4x_4x4 q8, q9, q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31 @ Store the transposed 4x4 blocks horizontally. - cmp r3, #12 + cmp r1, #12 beq 1f .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31 vst1.16 {d\i}, [r0,:64]! .endr bx lr 1: - @ Special case: For the last input column (r3 == 12), + @ Special case: For the last input column (r1 == 12), @ which would be stored as the last row in the temp buffer, @ don't store the first 4x4 block, but keep it in registers @ for the first slice of the second pass (where it is the @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon vmov d30, d18 vmov d31, d19 bx lr + +.ifc \txfm,idct +2: + @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 + @ Write zeros to the temp buffer for pass 2 +.rept 4 + vst1.16 {q14-q15}, [r0,:128]! 
+.endr + bx lr +.endif endfunc @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it, @@ -781,15 +800,23 @@ endfunc itxfm16_1d_funcs idct itxfm16_1d_funcs iadst +@ This is the minimum eob value for each subpartition, in increments of 4 +const min_eob_idct_idct_16, align=4 + .short 0, 10, 38, 89 +endconst + .macro itxfm_func16x16 txfm1, txfm2 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1 .ifc \txfm1\()_\txfm2,idct_idct cmp r3, #1 beq idct16x16_dc_add_neon .endif - push {r4-r7,lr} + push {r4-r9,lr} .ifnc \txfm1\()_\txfm2,idct_idct vpush {q4-q7} + mov r9, #0 +.else + movrel r8, min_eob_idct_idct_16 .endif @ Align the stack, allocate a temp buffer @@ -810,8 +837,11 @@ A and r7, sp, #15 .irp i, 0, 4, 8, 12 add r0, sp, #(\i*32) + mov r1, #\i add r2, r6, #(\i*2) - mov r3, #\i +.ifc \txfm1\()_\txfm2,idct_idct + ldrh r9, [r8, #(\i/2)] +.endif bl \txfm1\()16_1d_4x16_pass1_neon .endr .ifc \txfm2,idct @@ -830,7 +860,7 @@ A and r7, sp, #15 .ifnc \txfm1\()_\txfm2,idct_idct vpop {q4-q7} .endif - pop {r4-r7,pc} + pop {r4-r9,pc} endfunc .endm @@ -944,9 +974,14 @@ endfunc @ each output written twice), followed by a separate 16-point IDCT @ of the odd inputs, added/subtracted onto the outputs of the first idct16. @ r0 = dst (temp buffer) -@ r1 = unused +@ r1 = min eob @ r2 = src +@ r3 = eob function idct32_1d_4x32_pass1_neon + @ Check if this whole input slice is zero + cmp r3, r1 + ble 1f + movrel r12, idct_coeffs vld1.16 {q0-q1}, [r12,:128] @@ -1023,6 +1058,15 @@ function idct32_1d_4x32_pass1_neon store_rev 28, 24, 20, 16 .purgem store_rev bx lr + +1: + @ Write zeros to the temp buffer for pass 2 + vmov.i16 q14, #0 + vmov.i16 q15, #0 +.rept 8 + vst1.16 {q14-q15}, [r0,:128]! 
+.endr + bx lr endfunc .ltorg @@ -1110,11 +1154,16 @@ function idct32_1d_4x32_pass2_neon bx lr endfunc +const min_eob_idct_idct_32, align=4 + .short 0, 9, 34, 70, 135, 240, 336, 448 +endconst + function ff_vp9_idct_idct_32x32_add_neon, export=1 cmp r3, #1 beq idct32x32_dc_add_neon - push {r4-r7,lr} + push {r4-r8,lr} vpush {q4-q7} + movrel r8, min_eob_idct_idct_32 @ Align the stack, allocate a temp buffer T mov r7, sp @@ -1129,6 +1178,7 @@ A and r7, sp, #15 .irp i, 0, 4, 8, 12, 16, 20, 24, 28 add r0, sp, #(\i*64) + ldrh r1, [r8, #(\i/2)] add r2, r6, #(\i*2) bl idct32_1d_4x32_pass1_neon .endr @@ -1141,5 +1191,5 @@ A and r7, sp, #15 add sp, sp, r7 vpop {q4-q7} - pop {r4-r7,pc} + pop {r4-r8,pc} endfunc diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c index 25f9dd1..76ce61f 100644 --- a/tests/checkasm/vp9dsp.c +++ b/tests/checkasm/vp9dsp.c @@ -272,8 +272,10 @@ static void check_itxfm(void) // skip testing sub-IDCTs for WHT or ADST since they don't // implement it in any of the SIMD functions. If they do, // consider changing this to ensure we have complete test - // coverage - for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) { + // coverage. Test sub=1 for dc-only, then 4, 8, etc, since + // the arm version can distinguish them at that level. + for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; + sub == 1 ? (sub = 4) : (sub += 4)) { if (check_func(dsp.itxfm_add[tx][txtp], "vp9_inv_%s_%dx%d_sub%d_add", tx == 4 ? "wht_wht" : txtp_types[txtp],