[1/6] arm/aarch64: vp9lpf: Calculate !hev directly

Message ID 1484513752-7472-1-git-send-email-martin@martin.st
State Committed
Commit e1f9de86f454861b69b199ad801adc2ec6c3b220
Headers show

Commit Message

Martin Storsjö Jan. 15, 2017, 8:55 p.m.
Previously we first calculated hev, and then negated it.

Since we were already able to schedule the negation in the middle
of another calculation, we don't see a gain in every case.

Before:                     Cortex A7      A8      A9     A53  A53/AArch64
vp9_loop_filter_v_4_8_neon:     147.0   129.0   115.8    89.0         88.7
vp9_loop_filter_v_8_8_neon:     242.0   198.5   174.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:    500.0   419.5   382.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:   971.2   825.5   731.5   579.0        453.0
After:
vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
---
 libavcodec/aarch64/vp9lpf_neon.S | 5 ++---
 libavcodec/arm/vp9lpf_neon.S     | 5 ++---
 2 files changed, 4 insertions(+), 6 deletions(-)

Comments

Janne Grunau Feb. 10, 2017, 8:09 p.m. | #1
On 2017-01-15 22:55:47 +0200, Martin Storsjö wrote:
> Previously we first calculated hev, and then negated it.
> 
> Since we were already able to schedule the negation in the middle
> of another calculation, we don't see a gain in every case.
> 
> Before:                     Cortex A7      A8      A9     A53  A53/AArch64
> vp9_loop_filter_v_4_8_neon:     147.0   129.0   115.8    89.0         88.7
> vp9_loop_filter_v_8_8_neon:     242.0   198.5   174.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:    500.0   419.5   382.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:   971.2   825.5   731.5   579.0        453.0
> After:
> vp9_loop_filter_v_4_8_neon:     143.0   127.7   114.8    88.0         87.7
> vp9_loop_filter_v_8_8_neon:     241.0   197.2   173.7   140.0        136.7
> vp9_loop_filter_v_16_8_neon:    497.0   419.5   379.7   293.0        275.7
> vp9_loop_filter_v_16_16_neon:   965.2   818.7   731.4   579.0        452.0
> ---
>  libavcodec/aarch64/vp9lpf_neon.S | 5 ++---
>  libavcodec/arm/vp9lpf_neon.S     | 5 ++---
>  2 files changed, 4 insertions(+), 6 deletions(-)
> 
> diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
> index e9c7d9e..3b8e6eb 100644
> --- a/libavcodec/aarch64/vp9lpf_neon.S
> +++ b/libavcodec/aarch64/vp9lpf_neon.S
> @@ -292,7 +292,7 @@
>  .if \mix != 0
>          sxtl            v1.8h,  v1.8b
>  .endif
> -        cmhi            v5\sz,  v5\sz,  v3\sz  // hev
> +        cmhs            v5\sz,  v3\sz,  v5\sz  // !hev
>  .if \wd == 8
>          // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
>  .if \mix != 0
> @@ -306,11 +306,10 @@
>  .elseif \wd == 8
>          bic             v4\sz,  v4\sz,  v6\sz  // fm && !flat8in
>  .endif
> -        mvn             v5\sz,  v5\sz          // !hev
> +        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
>  .if \wd == 16
>          and             v7\sz,  v7\sz,  v6\sz  // flat8out && flat8in && fm
>  .endif
> -        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
>  
>          mul_sz          \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
>          bic             \tmp1\sz,  \tmp1\sz,  v5\sz    // if (!hev) av_clip_int8 = 0
> diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
> index fbf2901..c57c0e9 100644
> --- a/libavcodec/arm/vp9lpf_neon.S
> +++ b/libavcodec/arm/vp9lpf_neon.S
> @@ -141,7 +141,7 @@
>  .if \wd == 8
>          vcle.u8         d6,  d6,  d0            @ flat8in
>  .endif
> -        vcgt.u8         d5,  d5,  d3            @ hev
> +        vcle.u8         d5,  d5,  d3            @ !hev
>  .if \wd == 8
>          vand            d6,  d6,  d4            @ flat8in && fm
>  .endif
> @@ -151,11 +151,10 @@
>  .elseif \wd == 8
>          vbic            d4,  d4,  d6            @ fm && !flat8in
>  .endif
> -        vmvn            d5,  d5                 @ !hev
> +        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
>  .if \wd == 16
>          vand            d7,  d7,  d6            @ flat8out && flat8in && fm
>  .endif
> -        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
>  
>          vmul.s16        \tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
>          vbic            \tmp1,   \tmp1,   d5    @ if (!hev) av_clip_int8 = 0

ok

Janne

Patch

diff --git a/libavcodec/aarch64/vp9lpf_neon.S b/libavcodec/aarch64/vp9lpf_neon.S
index e9c7d9e..3b8e6eb 100644
--- a/libavcodec/aarch64/vp9lpf_neon.S
+++ b/libavcodec/aarch64/vp9lpf_neon.S
@@ -292,7 +292,7 @@ 
 .if \mix != 0
         sxtl            v1.8h,  v1.8b
 .endif
-        cmhi            v5\sz,  v5\sz,  v3\sz  // hev
+        cmhs            v5\sz,  v3\sz,  v5\sz  // !hev
 .if \wd == 8
         // If a 4/8 or 8/4 mix is used, clear the relevant half of v6
 .if \mix != 0
@@ -306,11 +306,10 @@ 
 .elseif \wd == 8
         bic             v4\sz,  v4\sz,  v6\sz  // fm && !flat8in
 .endif
-        mvn             v5\sz,  v5\sz          // !hev
+        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
 .if \wd == 16
         and             v7\sz,  v7\sz,  v6\sz  // flat8out && flat8in && fm
 .endif
-        and             v5\sz,  v5\sz,  v4\sz  // !hev && fm && !flat8in
 
         mul_sz          \tmp3\().8h,  \tmp4\().8h,  \tmp3\().8h, \tmp4\().8h,  \tmp5\().8h,  \tmp5\().8h, \sz // 3 * (q0 - p0)
         bic             \tmp1\sz,  \tmp1\sz,  v5\sz    // if (!hev) av_clip_int8 = 0
diff --git a/libavcodec/arm/vp9lpf_neon.S b/libavcodec/arm/vp9lpf_neon.S
index fbf2901..c57c0e9 100644
--- a/libavcodec/arm/vp9lpf_neon.S
+++ b/libavcodec/arm/vp9lpf_neon.S
@@ -141,7 +141,7 @@ 
 .if \wd == 8
         vcle.u8         d6,  d6,  d0            @ flat8in
 .endif
-        vcgt.u8         d5,  d5,  d3            @ hev
+        vcle.u8         d5,  d5,  d3            @ !hev
 .if \wd == 8
         vand            d6,  d6,  d4            @ flat8in && fm
 .endif
@@ -151,11 +151,10 @@ 
 .elseif \wd == 8
         vbic            d4,  d4,  d6            @ fm && !flat8in
 .endif
-        vmvn            d5,  d5                 @ !hev
+        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
 .if \wd == 16
         vand            d7,  d7,  d6            @ flat8out && flat8in && fm
 .endif
-        vand            d5,  d5,  d4            @ !hev && fm && !flat8in
 
         vmul.s16        \tmpq2,  \tmpq2, \tmpq3 @ 3 * (q0 - p0)
         vbic            \tmp1,   \tmp1,   d5    @ if (!hev) av_clip_int8 = 0