[3/3] aarch64: vp9itxfm: Skip empty slices in the first pass of idct_idct 16x16 and 32x32

Message ID 1480325162-7688-3-git-send-email-martin@martin.st
State Committed

Commit Message

Martin Storsjö Nov. 28, 2016, 9:26 a.m.
This work is sponsored by, and copyright, Google.

Previously all subpartitions except the eob=1 (DC) case ran with
the same runtime:

vp9_inv_dct_dct_16x16_sub16_add_neon:   1373.2
vp9_inv_dct_dct_32x32_sub32_add_neon:   8089.0

By skipping individual 8x16 or 8x32 pixel slices in the first pass,
we reduce the runtime of these functions like this:

vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
vp9_inv_dct_dct_16x16_sub2_add_neon:    1043.7
vp9_inv_dct_dct_16x16_sub4_add_neon:    1045.3
vp9_inv_dct_dct_16x16_sub8_add_neon:    1043.7
vp9_inv_dct_dct_16x16_sub12_add_neon:   1374.0
vp9_inv_dct_dct_16x16_sub16_add_neon:   1368.7
vp9_inv_dct_dct_32x32_sub1_add_neon:     555.6
vp9_inv_dct_dct_32x32_sub2_add_neon:    5180.0
vp9_inv_dct_dct_32x32_sub4_add_neon:    5175.1
vp9_inv_dct_dct_32x32_sub8_add_neon:    5186.6
vp9_inv_dct_dct_32x32_sub12_add_neon:   6159.5
vp9_inv_dct_dct_32x32_sub16_add_neon:   6162.7
vp9_inv_dct_dct_32x32_sub20_add_neon:   7129.0
vp9_inv_dct_dct_32x32_sub24_add_neon:   7133.1
vp9_inv_dct_dct_32x32_sub28_add_neon:   8107.1
vp9_inv_dct_dct_32x32_sub32_add_neon:   8105.6

I.e. in general there is a very minor overhead for the full subpartition case
due to the additional cmps, but a significant speedup for the cases where we
only need to process a small part of the actual input data.
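
Concretely, for the 16x16 idct_idct case the first pass dispatch now has
roughly the following shape (a lightly annotated sketch of the change below;
reading the #38 threshold as the smallest eob for which the scan order can
place a nonzero coefficient in columns 8-15 is an assumption, not something
stated in the patch):

.irp i, 0, 8
        add             x0,  sp,  #(\i*32)   // pass 1 output rows for this slice
.if \i == 8
        cmp             w3,  #38             // w3 = eob; at or below this threshold
        b.le            1f                   // the second slice is all zeros, skip it
.endif
        mov             x1,  #\i             // slice offset
        add             x2,  x6,  #(\i*2)    // input columns for this slice
        bl              idct16_1d_8x16_pass1_neon
.endr
        b               3f
1:
        // Skipped slice: zero v24-v31 (the in-register passthrough to pass 2)
        // and the second half of each temp buffer row instead of transforming.
3: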
---
Updated based on Janne's review of the arm version.
---
 libavcodec/aarch64/vp9itxfm_neon.S | 60 ++++++++++++++++++++++++++++++++++----
 1 file changed, 55 insertions(+), 5 deletions(-)

Comments

Janne Grunau Nov. 30, 2016, 8:24 p.m. | #1
On 2016-11-28 11:26:02 +0200, Martin Storsjö wrote:
> This work is sponsored by, and copyright, Google.
> 
> Previously all subpartitions except the eob=1 (DC) case ran with
> the same runtime:
> 
> vp9_inv_dct_dct_16x16_sub16_add_neon:   1373.2
> vp9_inv_dct_dct_32x32_sub32_add_neon:   8089.0
> 
> By skipping individual 8x16 or 8x32 pixel slices in the first pass,
> we reduce the runtime of these functions like this:
> 
> vp9_inv_dct_dct_16x16_sub1_add_neon:     235.3
> vp9_inv_dct_dct_16x16_sub2_add_neon:    1043.7
> vp9_inv_dct_dct_16x16_sub4_add_neon:    1045.3
> vp9_inv_dct_dct_16x16_sub8_add_neon:    1043.7
> vp9_inv_dct_dct_16x16_sub12_add_neon:   1374.0
> vp9_inv_dct_dct_16x16_sub16_add_neon:   1368.7
> vp9_inv_dct_dct_32x32_sub1_add_neon:     555.6
> vp9_inv_dct_dct_32x32_sub2_add_neon:    5180.0
> vp9_inv_dct_dct_32x32_sub4_add_neon:    5175.1
> vp9_inv_dct_dct_32x32_sub8_add_neon:    5186.6
> vp9_inv_dct_dct_32x32_sub12_add_neon:   6159.5
> vp9_inv_dct_dct_32x32_sub16_add_neon:   6162.7
> vp9_inv_dct_dct_32x32_sub20_add_neon:   7129.0
> vp9_inv_dct_dct_32x32_sub24_add_neon:   7133.1
> vp9_inv_dct_dct_32x32_sub28_add_neon:   8107.1
> vp9_inv_dct_dct_32x32_sub32_add_neon:   8105.6
> 
> I.e. in general there is a very minor overhead for the full subpartition case
> due to the additional cmps, but a significant speedup for the cases where we
> only need to process a small part of the actual input data.
> ---
> Updated based on Janne's review of the arm version.
> ---
>  libavcodec/aarch64/vp9itxfm_neon.S | 60 ++++++++++++++++++++++++++++++++++----
>  1 file changed, 55 insertions(+), 5 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
> index f4194a6..9d2ba11 100644
> --- a/libavcodec/aarch64/vp9itxfm_neon.S
> +++ b/libavcodec/aarch64/vp9itxfm_neon.S
> @@ -588,6 +588,9 @@ endfunc
>  .macro store i, dst, inc
>          st1             {v\i\().8h},  [\dst], \inc
>  .endm
> +.macro movi_v i, size, imm
> +        movi            v\i\()\size,  \imm
> +.endm
>  .macro load_clear i, src, inc
>          ld1             {v\i\().8h}, [\src]
>          st1             {v2.8h},  [\src], \inc
> @@ -596,9 +599,8 @@ endfunc
>  // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
>  // transpose into a horizontal 16x8 slice and store.
>  // x0 = dst (temp buffer)
> -// x1 = unused
> +// x1 = slice offset
>  // x2 = src
> -// x3 = slice offset
>  // x9 = input stride
>  .macro itxfm16_1d_funcs txfm
>  function \txfm\()16_1d_8x16_pass1_neon
> @@ -616,14 +618,14 @@ function \txfm\()16_1d_8x16_pass1_neon
>          transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
>  
>          // Store the transposed 8x8 blocks horizontally.
> -        cmp             x3,  #8
> +        cmp             x1,  #8
>          b.eq            1f
>  .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
>          store           \i,  x0,  #16
>  .endr
>          ret
>  1:
> -        // Special case: For the last input column (x3 == 8),
> +        // Special case: For the last input column (x1 == 8),
>          // which would be stored as the last row in the temp buffer,
>          // don't store the first 8x8 block, but keep it in registers
>          // for the first slice of the second pass (where it is the
> @@ -751,13 +753,35 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
>  
>  .irp i, 0, 8
>          add             x0,  sp,  #(\i*32)
> +.ifc \txfm1\()_\txfm2,idct_idct
> +.if \i == 8
> +        cmp             w3,  #38
> +        b.le            1f
> +.endif
> +.endif
> +        mov             x1,  #\i
>          add             x2,  x6,  #(\i*2)
> -        mov             x3,  #\i
>          bl              \txfm1\()16_1d_8x16_pass1_neon
>  .endr
>  .ifc \txfm1\()_\txfm2,iadst_idct
>          ld1             {v0.8h,v1.8h}, [x10]
>  .endif
> +
> +.ifc \txfm1\()_\txfm2,idct_idct
> +        b               3f
> +1:
> +        // Set v24-v31 to zero, for the in-register passthrough of
> +        // coefficients to pass 2. Since we only do two slices, this can
> +        // only ever happen for the second slice. So we only need to store
> +        // zeros to the temp buffer for the second half of the buffer.
> +.irp i, 24, 25, 26, 27, 28, 29, 30, 31
> +        add             x0,  x0,  #16
> +        movi_v          \i,  .16b, #0
> +        store           24,  x0,  #16
> +.endr

Not really pretty; unfortunately I don't see much room for improvement.
IIRC we should have a GPR which holds #32. Move the add out of the .irp
and use that register as the writeback increment on the store.
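
For reference, a rough sketch of that variant (untested; it assumes x9 still
holds the 32 byte input stride at this point, which would be the GPR holding
#32 referred to above):

        // Hoist the 16 byte offset into the second half of the row out of
        // the loop, and let the post-indexed store advance x0 by 32 bytes
        // per iteration instead of a separate add.
        add             x0,  x0,  #16
.irp i, 24, 25, 26, 27, 28, 29, 30, 31
        movi_v          \i,  .16b, #0
        st1             {v24.8h},  [x0], x9
.endr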

> +3:
> +.endif
> +
>  .irp i, 0, 8
>          add             x0,  x4,  #(\i)
>          mov             x1,  x5
> @@ -1073,12 +1097,17 @@ function idct32_1d_8x32_pass2_neon
>          ret
>  endfunc
>  
> +const min_eob_idct_idct_32, align=4
> +        .short  0, 34, 135, 336
> +endconst
> +
>  function ff_vp9_idct_idct_32x32_add_neon, export=1
>          cmp             w3,  #1
>          b.eq            idct32x32_dc_add_neon
>  
>          movrel          x10, idct_coeffs
>          add             x11, x10, #32
> +        movrel          x12, min_eob_idct_idct_32 + 2
>  
>          mov             x15, x30
>  
> @@ -1099,9 +1128,30 @@ function ff_vp9_idct_idct_32x32_add_neon, export=1
>  
>  .irp i, 0, 8, 16, 24
>          add             x0,  sp,  #(\i*64)
> +.if \i > 0
> +        ldrh            w1,  [x12], #2
> +        cmp             w3,  w1
> +        mov             x1,  #(32 - \i)/4
> +        b.le            1f
> +.endif
>          add             x2,  x6,  #(\i*2)
>          bl              idct32_1d_8x32_pass1_neon
>  .endr
> +        b               3f
> +
> +1:
> +        // Write zeros to the temp buffer for pass 2
> +        movi            v16.8h,  #0
> +        movi            v17.8h,  #0
> +        movi            v18.8h,  #0
> +        movi            v19.8h,  #0
> +2:
> +        subs            x1,  x1,  #1
> +.rept 4
> +        st1             {v16.8h-v19.8h},  [x0], #64
> +.endr
> +        b.ne            2b
> +3:
>  .irp i, 0, 8, 16, 24
>          add             x0,  x4,  #(\i)
>          mov             x1,  x5

otherwise ok

Janne

Patch

diff --git a/libavcodec/aarch64/vp9itxfm_neon.S b/libavcodec/aarch64/vp9itxfm_neon.S
index f4194a6..9d2ba11 100644
--- a/libavcodec/aarch64/vp9itxfm_neon.S
+++ b/libavcodec/aarch64/vp9itxfm_neon.S
@@ -588,6 +588,9 @@  endfunc
 .macro store i, dst, inc
         st1             {v\i\().8h},  [\dst], \inc
 .endm
+.macro movi_v i, size, imm
+        movi            v\i\()\size,  \imm
+.endm
 .macro load_clear i, src, inc
         ld1             {v\i\().8h}, [\src]
         st1             {v2.8h},  [\src], \inc
@@ -596,9 +599,8 @@  endfunc
 // Read a vertical 8x16 slice out of a 16x16 matrix, do a transform on it,
 // transpose into a horizontal 16x8 slice and store.
 // x0 = dst (temp buffer)
-// x1 = unused
+// x1 = slice offset
 // x2 = src
-// x3 = slice offset
 // x9 = input stride
 .macro itxfm16_1d_funcs txfm
 function \txfm\()16_1d_8x16_pass1_neon
@@ -616,14 +618,14 @@  function \txfm\()16_1d_8x16_pass1_neon
         transpose_8x8H  v24, v25, v26, v27, v28, v29, v30, v31, v2, v3
 
         // Store the transposed 8x8 blocks horizontally.
-        cmp             x3,  #8
+        cmp             x1,  #8
         b.eq            1f
 .irp i, 16, 24, 17, 25, 18, 26, 19, 27, 20, 28, 21, 29, 22, 30, 23, 31
         store           \i,  x0,  #16
 .endr
         ret
 1:
-        // Special case: For the last input column (x3 == 8),
+        // Special case: For the last input column (x1 == 8),
         // which would be stored as the last row in the temp buffer,
         // don't store the first 8x8 block, but keep it in registers
         // for the first slice of the second pass (where it is the
@@ -751,13 +753,35 @@  function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 
 .irp i, 0, 8
         add             x0,  sp,  #(\i*32)
+.ifc \txfm1\()_\txfm2,idct_idct
+.if \i == 8
+        cmp             w3,  #38
+        b.le            1f
+.endif
+.endif
+        mov             x1,  #\i
         add             x2,  x6,  #(\i*2)
-        mov             x3,  #\i
         bl              \txfm1\()16_1d_8x16_pass1_neon
 .endr
 .ifc \txfm1\()_\txfm2,iadst_idct
         ld1             {v0.8h,v1.8h}, [x10]
 .endif
+
+.ifc \txfm1\()_\txfm2,idct_idct
+        b               3f
+1:
+        // Set v24-v31 to zero, for the in-register passthrough of
+        // coefficients to pass 2. Since we only do two slices, this can
+        // only ever happen for the second slice. So we only need to store
+        // zeros to the temp buffer for the second half of the buffer.
+.irp i, 24, 25, 26, 27, 28, 29, 30, 31
+        add             x0,  x0,  #16
+        movi_v          \i,  .16b, #0
+        store           24,  x0,  #16
+.endr
+3:
+.endif
+
 .irp i, 0, 8
         add             x0,  x4,  #(\i)
         mov             x1,  x5
@@ -1073,12 +1097,17 @@  function idct32_1d_8x32_pass2_neon
         ret
 endfunc
 
+const min_eob_idct_idct_32, align=4
+        .short  0, 34, 135, 336
+endconst
+
 function ff_vp9_idct_idct_32x32_add_neon, export=1
         cmp             w3,  #1
         b.eq            idct32x32_dc_add_neon
 
         movrel          x10, idct_coeffs
         add             x11, x10, #32
+        movrel          x12, min_eob_idct_idct_32 + 2
 
         mov             x15, x30
 
@@ -1099,9 +1128,30 @@  function ff_vp9_idct_idct_32x32_add_neon, export=1
 
 .irp i, 0, 8, 16, 24
         add             x0,  sp,  #(\i*64)
+.if \i > 0
+        ldrh            w1,  [x12], #2
+        cmp             w3,  w1
+        mov             x1,  #(32 - \i)/4
+        b.le            1f
+.endif
         add             x2,  x6,  #(\i*2)
         bl              idct32_1d_8x32_pass1_neon
 .endr
+        b               3f
+
+1:
+        // Write zeros to the temp buffer for pass 2
+        movi            v16.8h,  #0
+        movi            v17.8h,  #0
+        movi            v18.8h,  #0
+        movi            v19.8h,  #0
+2:
+        subs            x1,  x1,  #1
+.rept 4
+        st1             {v16.8h-v19.8h},  [x0], #64
+.endr
+        b.ne            2b
+3:
 .irp i, 0, 8, 16, 24
         add             x0,  x4,  #(\i)
         mov             x1,  x5