arm: Implement a NEON version of 422 h264_h_loop_filter_chroma

Message ID 1552384248-23652-1-git-send-email-martin@martin.st
State Committed
Commit 0676de935b1e81bc5b5698fef3e7d48ff2ea77ff
Headers show
Series
  • arm: Implement a NEON version of 422 h264_h_loop_filter_chroma
Related show

Commit Message

Martin Storsjö March 12, 2019, 9:50 a.m.
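Previously, the 4:2:0 version of the NEON horizontal chroma loop filter
(ff_h264_h_loop_filter_chroma_neon) was used even for 4:2:2 content. The new
ff_h264_h_loop_filter_chroma422_neon reuses the 4:2:0 filter body by calling
it twice with a doubled stride: once starting at the first row and once
starting at the second row.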

This fixes occasional checkasm failures.
---
 libavcodec/arm/h264dsp_init_arm.c |  8 +++++++-
 libavcodec/arm/h264dsp_neon.S     | 19 +++++++++++++++++++
 2 files changed, 26 insertions(+), 1 deletion(-)

Comments

Luca Barbato March 12, 2019, 10:45 a.m. | #1
On 12/03/2019 10:50, Martin Storsjö wrote:
> Previously, the 420 version was used even for 422.
> 
> This fixes occasional checkasm failures.
> ---
>   libavcodec/arm/h264dsp_init_arm.c |  8 +++++++-
>   libavcodec/arm/h264dsp_neon.S     | 19 +++++++++++++++++++
>   2 files changed, 26 insertions(+), 1 deletion(-)
> 
> diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
> index 7afd350..617632c 100644
> --- a/libavcodec/arm/h264dsp_init_arm.c
> +++ b/libavcodec/arm/h264dsp_init_arm.c
> @@ -33,6 +33,8 @@ void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
>                                          int beta, int8_t *tc0);
>   void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
>                                          int beta, int8_t *tc0);
> +void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, int stride, int alpha,
> +                                          int beta, int8_t *tc0);
>   
>   void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
>                                      int log2_den, int weight, int offset);
> @@ -76,7 +78,11 @@ static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
>           c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
>           c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
>           c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
> -        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
> +
> +        if (chroma_format_idc <= 1)
> +            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
> +        else
> +            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon;
>   
>           c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
>           c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
> diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
> index 5e75565..783e0f6 100644
> --- a/libavcodec/arm/h264dsp_neon.S
> +++ b/libavcodec/arm/h264dsp_neon.S
> @@ -237,6 +237,7 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
>           h264_loop_filter_start
>   
>           sub             r0,  r0,  #2
> +h_loop_filter_chroma420:
>           vld1.32         {d18[0]}, [r0], r1
>           vld1.32         {d16[0]}, [r0], r1
>           vld1.32         {d0[0]},  [r0], r1
> @@ -271,6 +272,24 @@ function ff_h264_h_loop_filter_chroma_neon, export=1
>           bx              lr
>   endfunc
>   
> +function ff_h264_h_loop_filter_chroma422_neon, export=1
> +        h264_loop_filter_start
> +        push            {r4, lr}
> +        add             r4,  r0,  r1
> +        add             r1,  r1,  r1
> +        sub             r0,  r0,  #2
> +
> +        bl              h_loop_filter_chroma420
> +
> +        ldr             r12, [sp, #8]
> +        ldr             r12, [r12]
> +        vmov.32         d24[0], r12
> +        sub             r0,  r4,  #2
> +
> +        bl              h_loop_filter_chroma420
> +        pop             {r4, pc}
> +endfunc
> +
>   @ Biweighted prediction
>   
>   .macro  biweight_16     macs, macd
> 

Sure

Patch

diff --git a/libavcodec/arm/h264dsp_init_arm.c b/libavcodec/arm/h264dsp_init_arm.c
index 7afd350..617632c 100644
--- a/libavcodec/arm/h264dsp_init_arm.c
+++ b/libavcodec/arm/h264dsp_init_arm.c
@@ -33,6 +33,8 @@  void ff_h264_v_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
 void ff_h264_h_loop_filter_chroma_neon(uint8_t *pix, int stride, int alpha,
                                        int beta, int8_t *tc0);
+void ff_h264_h_loop_filter_chroma422_neon(uint8_t *pix, int stride, int alpha,
+                                          int beta, int8_t *tc0);
 
 void ff_weight_h264_pixels_16_neon(uint8_t *dst, int stride, int height,
                                    int log2_den, int weight, int offset);
@@ -76,7 +78,11 @@  static av_cold void h264dsp_init_neon(H264DSPContext *c, const int bit_depth,
         c->h264_v_loop_filter_luma   = ff_h264_v_loop_filter_luma_neon;
         c->h264_h_loop_filter_luma   = ff_h264_h_loop_filter_luma_neon;
         c->h264_v_loop_filter_chroma = ff_h264_v_loop_filter_chroma_neon;
-        c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+
+        if (chroma_format_idc <= 1)
+            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma_neon;
+        else
+            c->h264_h_loop_filter_chroma = ff_h264_h_loop_filter_chroma422_neon;
 
         c->weight_h264_pixels_tab[0] = ff_weight_h264_pixels_16_neon;
         c->weight_h264_pixels_tab[1] = ff_weight_h264_pixels_8_neon;
diff --git a/libavcodec/arm/h264dsp_neon.S b/libavcodec/arm/h264dsp_neon.S
index 5e75565..783e0f6 100644
--- a/libavcodec/arm/h264dsp_neon.S
+++ b/libavcodec/arm/h264dsp_neon.S
@@ -237,6 +237,7 @@  function ff_h264_h_loop_filter_chroma_neon, export=1
         h264_loop_filter_start
 
         sub             r0,  r0,  #2
+h_loop_filter_chroma420:
         vld1.32         {d18[0]}, [r0], r1
         vld1.32         {d16[0]}, [r0], r1
         vld1.32         {d0[0]},  [r0], r1
@@ -271,6 +272,24 @@  function ff_h264_h_loop_filter_chroma_neon, export=1
         bx              lr
 endfunc
 
+function ff_h264_h_loop_filter_chroma422_neon, export=1
+        h264_loop_filter_start
+        push            {r4, lr}
+        add             r4,  r0,  r1
+        add             r1,  r1,  r1
+        sub             r0,  r0,  #2
+
+        bl              h_loop_filter_chroma420
+
+        ldr             r12, [sp, #8]
+        ldr             r12, [r12]
+        vmov.32         d24[0], r12
+        sub             r0,  r4,  #2
+
+        bl              h_loop_filter_chroma420
+        pop             {r4, pc}
+endfunc
+
 @ Biweighted prediction
 
 .macro  biweight_16     macs, macd