vp9dsp: add DC-only versions for idct/idct.

Message ID 20161129125541.56517-1-martin@martin.st
State Committed
Headers show

Commit Message

Martin Storsjö Nov. 29, 2016, 12:55 p.m.
From: Clément Bœsch <u@pkh.me>

before:

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real    0m11.125s
user    0m11.059s
sys     0m0.050s

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real    0m10.944s
user    0m10.819s
sys     0m0.064s

after:

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real    0m8.153s
user    0m8.034s
sys     0m0.050s

time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
real    0m8.038s
user    0m7.980s
sys     0m0.039s
---
 libavcodec/vp9dsp.c | 32 ++++++++++++++++++++++++--------
 1 file changed, 24 insertions(+), 8 deletions(-)

Comments

Diego Biurrun Nov. 29, 2016, 1:10 p.m. | #1
idct/idct?

> --- a/libavcodec/vp9dsp.c
> +++ b/libavcodec/vp9dsp.c
> @@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
> +    if (has_dconly && eob == 1) {                                           \
> +        const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)              \
> +                                   * 11585 + (1 << 13)) >> 14;              \
> +        block[0] = 0;                                                       \
> +        for (i = 0; i < sz; i++) {                                          \
> +            for (j = 0; j < sz; j++)                                        \
> +                dst[j * stride] = av_clip_uint8(dst[j * stride] +           \
> +                                                (bits ?                     \
> +                                                 (t + (1 << (bits - 1))) >> bits : \
> +                                                 t));                       \

FTLIW:

                   dst[j * stride] =                                           \
                        av_clip_uint8(dst[j * stride] +                        \
                                      (bits ? (t + (1 << (bits - 1))) >> bits  \
                                            : t));                             \

Diego
Martin Storsjö Nov. 29, 2016, 1:25 p.m. | #2
On Tue, 29 Nov 2016, Diego Biurrun wrote:

> idct/idct?

The VP9 inverse transforms can be different combinations for the vertical and
horizontal passes; they can be iwht_iwht, idct_idct, idct_iadst, iadst_idct or
iadst_iadst. This optimization is only valid for the idct_idct case.

>
>> --- a/libavcodec/vp9dsp.c
>> +++ b/libavcodec/vp9dsp.c
>> @@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
>> +    if (has_dconly && eob == 1) {                                           \
>> +        const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)              \
>> +                                   * 11585 + (1 << 13)) >> 14;              \
>> +        block[0] = 0;                                                       \
>> +        for (i = 0; i < sz; i++) {                                          \
>> +            for (j = 0; j < sz; j++)                                        \
>> +                dst[j * stride] = av_clip_uint8(dst[j * stride] +           \
>> +                                                (bits ?                     \
>> +                                                 (t + (1 << (bits - 1))) >> bits : \
>> +                                                 t));                       \
>
> FTLIW:
>
>                   dst[j * stride] =                                           \
>                        av_clip_uint8(dst[j * stride] +                        \
>                                      (bits ? (t + (1 << (bits - 1))) >> bits  \
>                                            : t));                             \

FTLIW?

This looks more readable, sure.

// Martin
Diego Biurrun Nov. 29, 2016, 1:49 p.m. | #3
On Tue, Nov 29, 2016 at 03:25:38PM +0200, Martin Storsjö wrote:
> On Tue, 29 Nov 2016, Diego Biurrun wrote:
> >>--- a/libavcodec/vp9dsp.c
> >>+++ b/libavcodec/vp9dsp.c
> >>@@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
> >>+    if (has_dconly && eob == 1) {                                           \
> >>+        const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)              \
> >>+                                   * 11585 + (1 << 13)) >> 14;              \
> >>+        block[0] = 0;                                                       \
> >>+        for (i = 0; i < sz; i++) {                                          \
> >>+            for (j = 0; j < sz; j++)                                        \
> >>+                dst[j * stride] = av_clip_uint8(dst[j * stride] +           \
> >>+                                                (bits ?                     \
> >>+                                                 (t + (1 << (bits - 1))) >> bits : \
> >>+                                                 t));                       \
> >
> >FTLIW:
> >
> >                  dst[j * stride] =                                           \
> >                       av_clip_uint8(dst[j * stride] +                        \
> >                                     (bits ? (t + (1 << (bits - 1))) >> bits  \
> >                                           : t));                             \
> 
> FTLIW?

For The Little It's Worth; i.e. take it if you like it or just ignore it.

Diego
Janne Grunau Nov. 30, 2016, 7:34 p.m. | #4
On 2016-11-29 14:55:41 +0200, Martin Storsjö wrote:
> From: Clément Bœsch <u@pkh.me>
> 
> before:
> 
> time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
> real    0m11.125s
> user    0m11.059s
> sys     0m0.050s
> 
> time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
> real    0m10.944s
> user    0m10.819s
> sys     0m0.064s
> 
> after:
> 
> time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
> real    0m8.153s
> user    0m8.034s
> sys     0m0.050s
> 
> time ./avconv -v 0 -nostats -threads 1 -i sintel_vp9_500kbps.webm -f null -
> real    0m8.038s
> user    0m7.980s
> sys     0m0.039s
> ---
>  libavcodec/vp9dsp.c | 32 ++++++++++++++++++++++++--------
>  1 file changed, 24 insertions(+), 8 deletions(-)
> 
> diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
> index 73006fa..ead2f88 100644
> --- a/libavcodec/vp9dsp.c
> +++ b/libavcodec/vp9dsp.c
> @@ -944,7 +944,7 @@ static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
>  #undef init_intra_pred
>  }
>  
> -#define itxfm_wrapper(type_a, type_b, sz, bits)                             \
> +#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly)                 \
>  static void                                                                 \
>  type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
>                                                        ptrdiff_t stride,     \
> @@ -953,6 +953,22 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
>  {                                                                           \
>      int i, j;                                                               \
>      int16_t tmp[sz * sz], out[sz];                                          \
> +                                                                            \
> +    if (has_dconly && eob == 1) {                                           \
> +        const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)              \
> +                                   * 11585 + (1 << 13)) >> 14;              \
> +        block[0] = 0;                                                       \
> +        for (i = 0; i < sz; i++) {                                          \
> +            for (j = 0; j < sz; j++)                                        \
> +                dst[j * stride] = av_clip_uint8(dst[j * stride] +           \
> +                                                (bits ?                     \
> +                                                 (t + (1 << (bits - 1))) >> bits : \
> +                                                 t));                       \
> +            dst++;                                                          \
> +        }                                                                   \
> +        return;                                                             \
> +    }                                                                       \
> +                                                                            \
>      for (i = 0; i < sz; i++)                                                \
>          type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);                \
>      memset(block, 0, sz * sz * sizeof(*block));                             \
> @@ -967,11 +983,11 @@ type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
>      }                                                                       \
>  }
>  
> -#define itxfm_wrap(sz, bits)             \
> -    itxfm_wrapper(idct, idct, sz, bits)  \
> -    itxfm_wrapper(iadst, idct, sz, bits) \
> -    itxfm_wrapper(idct, iadst, sz, bits) \
> -    itxfm_wrapper(iadst, iadst, sz, bits)
> +#define itxfm_wrap(sz, bits)                 \
> +    itxfm_wrapper(idct,  idct,  sz, bits, 1) \
> +    itxfm_wrapper(iadst, idct,  sz, bits, 0) \
> +    itxfm_wrapper(idct,  iadst, sz, bits, 0) \
> +    itxfm_wrapper(iadst, iadst, sz, bits, 0)
>  
>  #define IN(x) in[x * stride]
>  
> @@ -1490,7 +1506,7 @@ static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
>      out[31] = t0   - t31;
>  }
>  
> -itxfm_wrapper(idct, idct, 32, 6)
> +itxfm_wrapper(idct, idct, 32, 6, 1)
>  
>  static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
>                                        ptrdiff_t stride, int pass)
> @@ -1523,7 +1539,7 @@ static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
>      out[3] = t3;
>  }
>  
> -itxfm_wrapper(iwht, iwht, 4, 0)
> +itxfm_wrapper(iwht, iwht, 4, 0, 0)
>  
>  #undef IN
>  #undef itxfm_wrapper

ok

Janne

Patch

diff --git a/libavcodec/vp9dsp.c b/libavcodec/vp9dsp.c
index 73006fa..ead2f88 100644
--- a/libavcodec/vp9dsp.c
+++ b/libavcodec/vp9dsp.c
@@ -944,7 +944,7 @@  static av_cold void vp9dsp_intrapred_init(VP9DSPContext *dsp)
 #undef init_intra_pred
 }
 
-#define itxfm_wrapper(type_a, type_b, sz, bits)                             \
+#define itxfm_wrapper(type_a, type_b, sz, bits, has_dconly)                 \
 static void                                                                 \
 type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
                                                       ptrdiff_t stride,     \
@@ -953,6 +953,22 @@  type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
 {                                                                           \
     int i, j;                                                               \
     int16_t tmp[sz * sz], out[sz];                                          \
+                                                                            \
+    if (has_dconly && eob == 1) {                                           \
+        const int t  = (((block[0] * 11585 + (1 << 13)) >> 14)              \
+                                   * 11585 + (1 << 13)) >> 14;              \
+        block[0] = 0;                                                       \
+        for (i = 0; i < sz; i++) {                                          \
+            for (j = 0; j < sz; j++)                                        \
+                dst[j * stride] = av_clip_uint8(dst[j * stride] +           \
+                                                (bits ?                     \
+                                                 (t + (1 << (bits - 1))) >> bits : \
+                                                 t));                       \
+            dst++;                                                          \
+        }                                                                   \
+        return;                                                             \
+    }                                                                       \
+                                                                            \
     for (i = 0; i < sz; i++)                                                \
         type_a ## sz ## _1d(tmp + i * sz, block + i, sz, 0);                \
     memset(block, 0, sz * sz * sizeof(*block));                             \
@@ -967,11 +983,11 @@  type_a ## _ ## type_b ## _ ## sz ## x ## sz ## _add_c(uint8_t *dst,         \
     }                                                                       \
 }
 
-#define itxfm_wrap(sz, bits)             \
-    itxfm_wrapper(idct, idct, sz, bits)  \
-    itxfm_wrapper(iadst, idct, sz, bits) \
-    itxfm_wrapper(idct, iadst, sz, bits) \
-    itxfm_wrapper(iadst, iadst, sz, bits)
+#define itxfm_wrap(sz, bits)                 \
+    itxfm_wrapper(idct,  idct,  sz, bits, 1) \
+    itxfm_wrapper(iadst, idct,  sz, bits, 0) \
+    itxfm_wrapper(idct,  iadst, sz, bits, 0) \
+    itxfm_wrapper(iadst, iadst, sz, bits, 0)
 
 #define IN(x) in[x * stride]
 
@@ -1490,7 +1506,7 @@  static av_always_inline void idct32_1d(int16_t *out, const int16_t *in,
     out[31] = t0   - t31;
 }
 
-itxfm_wrapper(idct, idct, 32, 6)
+itxfm_wrapper(idct, idct, 32, 6, 1)
 
 static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
                                       ptrdiff_t stride, int pass)
@@ -1523,7 +1539,7 @@  static av_always_inline void iwht4_1d(int16_t *out, const int16_t *in,
     out[3] = t3;
 }
 
-itxfm_wrapper(iwht, iwht, 4, 0)
+itxfm_wrapper(iwht, iwht, 4, 0, 0)
 
 #undef IN
 #undef itxfm_wrapper