[04/11] arm: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

Message ID 1479906058-22747-4-git-send-email-martin@martin.st
State Superseded
Headers show

Commit Message

Martin Storsjö Nov. 23, 2016, 1 p.m.
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   3189.0   2486.8   2509.9   1964.1
vp9_inv_dct_dct_32x32_sub32_add_neon:  18448.1  16682.0  14235.4  11993.4

By skipping individual 4x16 or 4x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     271.5    188.7    211.6    235.1
vp9_inv_dct_dct_16x16_sub4_add_neon:    2079.7   1606.3   1772.1   1264.8
vp9_inv_dct_dct_16x16_sub8_add_neon:    2449.2   1834.3   2046.5   1499.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   2826.2   2109.2   2295.9   1758.2
vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
vp9_inv_dct_dct_32x32_sub1_add_neon:     752.5    457.5    863.7    554.7
vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
vp9_inv_dct_dct_32x32_sub8_add_neon:   12217.8   9068.1   9420.4   7518.3
vp9_inv_dct_dct_32x32_sub12_add_neon:  12967.3  10455.5  10223.9   8275.7
vp9_inv_dct_dct_32x32_sub16_add_neon:  14084.1  11933.7  10998.9   9012.5
vp9_inv_dct_dct_32x32_sub20_add_neon:  15171.4  13335.0  11820.6   9757.2
vp9_inv_dct_dct_32x32_sub24_add_neon:  16229.6  15185.7  12614.4  10504.9
vp9_inv_dct_dct_32x32_sub28_add_neon:  17338.1  15955.3  13445.0  11248.4
vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1

I.e. in general a very minor overhead for the full subpartition case due
to the additional cmps, but a significant speedup for the cases when we
only need to process a small part of the actual input data.

In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
8x8 or 16x16 subpartitions respectively.
---
This goes on top of the checkasm vp9dsp patch that adds benchmarking
of generic subpartitions in the itxfm.
---
 libavcodec/arm/vp9itxfm_neon.S | 70 ++++++++++++++++++++++++++++++++++++------
 tests/checkasm/vp9dsp.c        |  6 ++--
 2 files changed, 64 insertions(+), 12 deletions(-)

Comments

Janne Grunau Nov. 23, 2016, 9:12 p.m. | #1
On 2016-11-23 15:00:51 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> Previously all subpartitions except the eob=1 (DC) case ran with
> the same runtime:
> 
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3189.0   2486.8   2509.9   1964.1
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18448.1  16682.0  14235.4  11993.4
> 
> By skipping individual 4x16 or 4x32 pixel slices in the first pass,
> we reduce the runtime of these functions like this:
> 
> vp9_inv_dct_dct_16x16_sub1_add_neon:     271.5    188.7    211.6    235.1
> vp9_inv_dct_dct_16x16_sub4_add_neon:    2079.7   1606.3   1772.1   1264.8
> vp9_inv_dct_dct_16x16_sub8_add_neon:    2449.2   1834.3   2046.5   1499.7
> vp9_inv_dct_dct_16x16_sub12_add_neon:   2826.2   2109.2   2295.9   1758.2
> vp9_inv_dct_dct_16x16_sub16_add_neon:   3224.1   2476.5   2533.1   1985.7
> vp9_inv_dct_dct_32x32_sub1_add_neon:     752.5    457.5    863.7    554.7
> vp9_inv_dct_dct_32x32_sub4_add_neon:   10689.2   8013.4   8592.9   6785.9
> vp9_inv_dct_dct_32x32_sub8_add_neon:   12217.8   9068.1   9420.4   7518.3
> vp9_inv_dct_dct_32x32_sub12_add_neon:  12967.3  10455.5  10223.9   8275.7
> vp9_inv_dct_dct_32x32_sub16_add_neon:  14084.1  11933.7  10998.9   9012.5
> vp9_inv_dct_dct_32x32_sub20_add_neon:  15171.4  13335.0  11820.6   9757.2
> vp9_inv_dct_dct_32x32_sub24_add_neon:  16229.6  15185.7  12614.4  10504.9
> vp9_inv_dct_dct_32x32_sub28_add_neon:  17338.1  15955.3  13445.0  11248.4
> vp9_inv_dct_dct_32x32_sub32_add_neon:  18465.7  16974.6  14239.2  11999.1
> 
> I.e. in general a very minor overhead for the full subpartition case due
> to the additional cmps, but a significant speedup for the cases when we
> only need to process a small part of the actual input data.
> 
> In common VP9 content in a few inspected clips, 70-90% of the non-dc-only
> 16x16 and 32x32 IDCTs only have nonzero coefficients in the upper left
> 8x8 or 16x16 subpartitions respectively.
> ---
> This goes on top of the checkasm vp9dsp patch that adds benchmarking
> of generic subpartitions in the itxfm.
> ---
>  libavcodec/arm/vp9itxfm_neon.S | 70 ++++++++++++++++++++++++++++++++++++------
>  tests/checkasm/vp9dsp.c        |  6 ++--
>  2 files changed, 64 insertions(+), 12 deletions(-)
> 
> diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
> index 01944bd..769579a 100644
> --- a/libavcodec/arm/vp9itxfm_neon.S
> +++ b/libavcodec/arm/vp9itxfm_neon.S
> @@ -659,10 +659,17 @@ endfunc
>  @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
>  @ transpose into a horizontal 16x4 slice and store.
>  @ r0 = dst (temp buffer)
> -@ r1 = unused
> +@ r1 = slice offset
>  @ r2 = src
> -@ r3 = slice offset
> +@ r3 = eob
> +@ r9 = min eob
>  function \txfm\()16_1d_4x16_pass1_neon
> +.ifc \txfm,idct
> +        @ Check if this whole input slice is zero
> +        cmp             r3,  r9
> +        ble             2f

Once this check is true, it is true for all remaining slices, so we should
move it out to the main function.

> +.endif
> +
>          mov             r12, #32
>          vmov.s16        q2, #0
>  .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
> @@ -678,14 +685,14 @@ function \txfm\()16_1d_4x16_pass1_neon
>          transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
>  
>          @ Store the transposed 4x4 blocks horizontally.
> -        cmp             r3,  #12
> +        cmp             r1,  #12
>          beq             1f
>  .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
>          vst1.16         {d\i}, [r0,:64]!
>  .endr
>          bx              lr
>  1:
> -        @ Special case: For the last input column (r3 == 12),
> +        @ Special case: For the last input column (r1 == 12),
>          @ which would be stored as the last row in the temp buffer,
>          @ don't store the first 4x4 block, but keep it in registers
>          @ for the first slice of the second pass (where it is the
> @@ -711,6 +718,18 @@ function \txfm\()16_1d_4x16_pass1_neon
>          vmov            d30, d18
>          vmov            d31, d19
>          bx              lr
> +
> +.ifc \txfm,idct
> +2:
> +        @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2
> +        vmov.i16        q14, #0
> +        vmov.i16        q15, #0
> +        @ Write zeros to the temp buffer for pass 2
> +.rept 4
> +        vst1.16         {q14-q15}, [r0,:128]!
> +.endr
> +        bx              lr
> +.endif
>  endfunc
>  
>  @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
> @@ -781,15 +800,23 @@ endfunc
>  itxfm16_1d_funcs idct
>  itxfm16_1d_funcs iadst
>  
> +@ This is the minimum eob value for each subpartition, in increments of 4
> +const min_eob_idct_idct_16, align=4
> +        .short  0, 10, 38, 89
> +endconst
> +
>  .macro itxfm_func16x16 txfm1, txfm2
>  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>  .ifc \txfm1\()_\txfm2,idct_idct
>          cmp             r3,  #1
>          beq             idct16x16_dc_add_neon
>  .endif
> -        push            {r4-r7,lr}
> +        push            {r4-r9,lr}
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpush           {q4-q7}
> +        mov             r9,  #0
> +.else
> +        movrel          r8,  min_eob_idct_idct_16
>  .endif
>  
>          @ Align the stack, allocate a temp buffer
> @@ -810,8 +837,11 @@ A       and             r7,  sp,  #15
>  
>  .irp i, 0, 4, 8, 12
>          add             r0,  sp,  #(\i*32)
> +        mov             r1,  #\i
>          add             r2,  r6,  #(\i*2)
> -        mov             r3,  #\i
> +.ifc \txfm1\()_\txfm2,idct_idct
> +        ldrh            r9,  [r8, #(\i/2)]

Using the writeback variant would, IMO, look clearer, although it increases
the code size for Thumb (if we care about that).

> +.endif

Move this to the beginning and load into r1, cmp with eob, conditionally
store how much stack space needs to be cleared, and jump out of the '.irp'.
That saves r9, and several jumps and comparisons if eob is small.

>          bl              \txfm1\()16_1d_4x16_pass1_neon
>  .endr
>  .ifc \txfm2,idct
> @@ -830,7 +860,7 @@ A       and             r7,  sp,  #15
>  .ifnc \txfm1\()_\txfm2,idct_idct
>          vpop            {q4-q7}
>  .endif
> -        pop             {r4-r7,pc}
> +        pop             {r4-r9,pc}
>  endfunc
>  .endm
>  
> @@ -944,9 +974,14 @@ endfunc
>  @ each output written twice), followed by a separate 16-point IDCT
>  @ of the odd inputs, added/subtracted onto the outputs of the first idct16.
>  @ r0 = dst (temp buffer)
> -@ r1 = unused
> +@ r1 = min eob
>  @ r2 = src
> +@ r3 = eob
>  function idct32_1d_4x32_pass1_neon
> +        @ Check if this whole input slice is zero
> +        cmp             r3,  r1
> +        ble             1f

The same applies here as for the 16x16 idct.

Janne

Patch

diff --git a/libavcodec/arm/vp9itxfm_neon.S b/libavcodec/arm/vp9itxfm_neon.S
index 01944bd..769579a 100644
--- a/libavcodec/arm/vp9itxfm_neon.S
+++ b/libavcodec/arm/vp9itxfm_neon.S
@@ -659,10 +659,17 @@  endfunc
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
 @ transpose into a horizontal 16x4 slice and store.
 @ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = slice offset
 @ r2 = src
-@ r3 = slice offset
+@ r3 = eob
+@ r9 = min eob
 function \txfm\()16_1d_4x16_pass1_neon
+.ifc \txfm,idct
+        @ Check if this whole input slice is zero
+        cmp             r3,  r9
+        ble             2f
+.endif
+
         mov             r12, #32
         vmov.s16        q2, #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -678,14 +685,14 @@  function \txfm\()16_1d_4x16_pass1_neon
         transpose16_q_4x_4x4 q8,  q9,  q10, q11, q12, q13, q14, q15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31
 
         @ Store the transposed 4x4 blocks horizontally.
-        cmp             r3,  #12
+        cmp             r1,  #12
         beq             1f
 .irp i, 16, 20, 24, 28, 17, 21, 25, 29, 18, 22, 26, 30, 19, 23, 27, 31
         vst1.16         {d\i}, [r0,:64]!
 .endr
         bx              lr
 1:
-        @ Special case: For the last input column (r3 == 12),
+        @ Special case: For the last input column (r1 == 12),
         @ which would be stored as the last row in the temp buffer,
         @ don't store the first 4x4 block, but keep it in registers
         @ for the first slice of the second pass (where it is the
@@ -711,6 +718,18 @@  function \txfm\()16_1d_4x16_pass1_neon
         vmov            d30, d18
         vmov            d31, d19
         bx              lr
+
+.ifc \txfm,idct
+2:
+        @ Set d28-d31 to zero, for the in-register passthrough of coefficients to pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+        @ Write zeros to the temp buffer for pass 2
+.rept 4
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bx              lr
+.endif
 endfunc
 
 @ Read a vertical 4x16 slice out of a 16x16 matrix, do a transform on it,
@@ -781,15 +800,23 @@  endfunc
 itxfm16_1d_funcs idct
 itxfm16_1d_funcs iadst
 
+@ This is the minimum eob value for each subpartition, in increments of 4
+const min_eob_idct_idct_16, align=4
+        .short  0, 10, 38, 89
+endconst
+
 .macro itxfm_func16x16 txfm1, txfm2
 function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             r3,  #1
         beq             idct16x16_dc_add_neon
 .endif
-        push            {r4-r7,lr}
+        push            {r4-r9,lr}
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpush           {q4-q7}
+        mov             r9,  #0
+.else
+        movrel          r8,  min_eob_idct_idct_16
 .endif
 
         @ Align the stack, allocate a temp buffer
@@ -810,8 +837,11 @@  A       and             r7,  sp,  #15
 
 .irp i, 0, 4, 8, 12
         add             r0,  sp,  #(\i*32)
+        mov             r1,  #\i
         add             r2,  r6,  #(\i*2)
-        mov             r3,  #\i
+.ifc \txfm1\()_\txfm2,idct_idct
+        ldrh            r9,  [r8, #(\i/2)]
+.endif
         bl              \txfm1\()16_1d_4x16_pass1_neon
 .endr
 .ifc \txfm2,idct
@@ -830,7 +860,7 @@  A       and             r7,  sp,  #15
 .ifnc \txfm1\()_\txfm2,idct_idct
         vpop            {q4-q7}
 .endif
-        pop             {r4-r7,pc}
+        pop             {r4-r9,pc}
 endfunc
 .endm
 
@@ -944,9 +974,14 @@  endfunc
 @ each output written twice), followed by a separate 16-point IDCT
 @ of the odd inputs, added/subtracted onto the outputs of the first idct16.
 @ r0 = dst (temp buffer)
-@ r1 = unused
+@ r1 = min eob
 @ r2 = src
+@ r3 = eob
 function idct32_1d_4x32_pass1_neon
+        @ Check if this whole input slice is zero
+        cmp             r3,  r1
+        ble             1f
+
         movrel          r12, idct_coeffs
         vld1.16         {q0-q1}, [r12,:128]
 
@@ -1023,6 +1058,15 @@  function idct32_1d_4x32_pass1_neon
         store_rev       28, 24, 20, 16
 .purgem store_rev
         bx              lr
+
+1:
+        @ Write zeros to the temp buffer for pass 2
+        vmov.i16        q14, #0
+        vmov.i16        q15, #0
+.rept 8
+        vst1.16         {q14-q15}, [r0,:128]!
+.endr
+        bx              lr
 endfunc
 .ltorg
 
@@ -1110,11 +1154,16 @@  function idct32_1d_4x32_pass2_neon
         bx              lr
 endfunc
 
+const min_eob_idct_idct_32, align=4
+        .short  0, 9, 34, 70, 135, 240, 336, 448
+endconst
+
 function ff_vp9_idct_idct_32x32_add_neon, export=1
         cmp             r3,  #1
         beq             idct32x32_dc_add_neon
-        push            {r4-r7,lr}
+        push            {r4-r8,lr}
         vpush           {q4-q7}
+        movrel          r8,  min_eob_idct_idct_32
 
         @ Align the stack, allocate a temp buffer
 T       mov             r7,  sp
@@ -1129,6 +1178,7 @@  A       and             r7,  sp,  #15
 
 .irp i, 0, 4, 8, 12, 16, 20, 24, 28
         add             r0,  sp,  #(\i*64)
+        ldrh            r1,  [r8, #(\i/2)]
         add             r2,  r6,  #(\i*2)
         bl              idct32_1d_4x32_pass1_neon
 .endr
@@ -1141,5 +1191,5 @@  A       and             r7,  sp,  #15
 
         add             sp,  sp,  r7
         vpop            {q4-q7}
-        pop             {r4-r7,pc}
+        pop             {r4-r8,pc}
 endfunc
diff --git a/tests/checkasm/vp9dsp.c b/tests/checkasm/vp9dsp.c
index 25f9dd1..76ce61f 100644
--- a/tests/checkasm/vp9dsp.c
+++ b/tests/checkasm/vp9dsp.c
@@ -272,8 +272,10 @@  static void check_itxfm(void)
             // skip testing sub-IDCTs for WHT or ADST since they don't
             // implement it in any of the SIMD functions. If they do,
             // consider changing this to ensure we have complete test
-            // coverage
-            for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz; sub <<= 1) {
+            // coverage. Test sub=1 for dc-only, then 4, 8, etc, since
+            // the arm version can distinguish them at that level.
+            for (sub = (txtp == 0 && tx < 4) ? 1 : sz; sub <= sz;
+                 sub == 1 ? (sub = 4) : (sub += 4)) {
                 if (check_func(dsp.itxfm_add[tx][txtp],
                                "vp9_inv_%s_%dx%d_sub%d_add",
                                tx == 4 ? "wht_wht" : txtp_types[txtp],