[12/19] aarch64: vp8: Port vp8_luma_dc_wht and vp8_idct_dc_add4uv from arm version

Message ID 1549012378-32118-12-git-send-email-martin@martin.st
State Committed
Headers show
Series
  • [01/19] libavcodec: vp8 neon optimizations for aarch64
Related show

Commit Message

Martin Storsjö Feb. 1, 2019, 9:12 a.m.
                     Cortex A53    A72    A73
vp8_luma_dc_wht_c:        115.7   75.7   90.7
vp8_luma_dc_wht_neon:      60.7   41.2   45.7
vp8_idct_dc_add4uv_c:     376.1  262.9  282.5
vp8_idct_dc_add4uv_neon:   52.0   29.0   37.0
---
 libavcodec/aarch64/vp8dsp_init_aarch64.c |   3 +
 libavcodec/aarch64/vp8dsp_neon.S         | 109 +++++++++++++++++++++++++++++++
 2 files changed, 112 insertions(+)

Patch

diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c
index da54efd..8f060dc 100644
--- a/libavcodec/aarch64/vp8dsp_init_aarch64.c
+++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c
@@ -28,6 +28,7 @@  void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]);
 void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
+void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride);
 
 VP8_LF(neon);
 
@@ -57,10 +58,12 @@  av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp)
     if (!have_neon(av_get_cpu_flags())) {
         return;
     }
+    dsp->vp8_luma_dc_wht    = ff_vp8_luma_dc_wht_neon;
 
     dsp->vp8_idct_add       = ff_vp8_idct_add_neon;
     dsp->vp8_idct_dc_add    = ff_vp8_idct_dc_add_neon;
     dsp->vp8_idct_dc_add4y  = ff_vp8_idct_dc_add4y_neon;
+    dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon;
 
     dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon;
     dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon;
diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S
index 2b5b049..4ea62c0 100644
--- a/libavcodec/aarch64/vp8dsp_neon.S
+++ b/libavcodec/aarch64/vp8dsp_neon.S
@@ -4,6 +4,7 @@ 
  * Copyright (c) 2010 Rob Clark <rob@ti.com>
  * Copyright (c) 2011 Mans Rullgard <mans@mansr.com>
  * Copyright (c) 2018 Magnus Röös <mla2.roos@gmail.com>
+ * Copyright (c) 2019 Martin Storsjo <martin@martin.st>
  *
  * This file is part of Libav.
  *
@@ -25,6 +26,62 @@ 
 #include "libavutil/aarch64/asm.S"
 #include "neon.S"
 
+function ff_vp8_luma_dc_wht_neon, export=1                // x0 = block[4][4][16], x1 = dc[16]: two-pass 4x4 add/sub butterfly of dc[], results scattered to block
+        ld1             {v0.4h - v3.4h}, [x1]             // v0..v3 = dc[0..3], dc[4..7], dc[8..11], dc[12..15]
+        movi            v30.8h, #0                        // zero vector used to clear dc[] below
+
+        add             v4.4h,  v0.4h,  v3.4h             // pass 1: v4 = v0 + v3
+        add             v6.4h,  v1.4h,  v2.4h             //         v6 = v1 + v2
+        st1             {v30.8h}, [x1], #16               // dc[0..7] = 0, x1 += 16 bytes (interleaved with the math)
+        sub             v7.4h,  v1.4h,  v2.4h             //         v7 = v1 - v2
+        sub             v5.4h,  v0.4h,  v3.4h             //         v5 = v0 - v3
+        st1             {v30.8h}, [x1]                    // dc[8..15] = 0
+        add             v0.4h,  v4.4h,  v6.4h             //         v0 = v4 + v6
+        add             v1.4h,  v5.4h,  v7.4h             //         v1 = v5 + v7
+        sub             v2.4h,  v4.4h,  v6.4h             //         v2 = v4 - v6
+        sub             v3.4h,  v5.4h,  v7.4h             //         v3 = v5 - v7
+
+        movi            v16.4h, #3                        // bias added before the final >> 3
+
+        transpose_4x4H  v0, v1, v2, v3, v4, v5, v6, v7    // macro from neon.S (not shown); presumably transposes v0-v3 with v4-v7 as scratch
+
+        add             v0.4h,  v0.4h,  v16.4h            // v0 += 3; v0 enters every output below with weight +1, so each result gets +3
+
+        add             v4.4h,  v0.4h,  v3.4h             // pass 2: same butterfly as pass 1
+        add             v6.4h,  v1.4h,  v2.4h
+        sub             v7.4h,  v1.4h,  v2.4h
+        sub             v5.4h,  v0.4h,  v3.4h
+        add             v0.4h,  v4.4h,  v6.4h
+        add             v1.4h,  v5.4h,  v7.4h
+        sub             v2.4h,  v4.4h,  v6.4h
+        sub             v3.4h,  v5.4h,  v7.4h
+
+        sshr            v0.4h,  v0.4h,  #3                // arithmetic >> 3 on all four result vectors
+        sshr            v1.4h,  v1.4h,  #3
+        sshr            v2.4h,  v2.4h,  #3
+        sshr            v3.4h,  v3.4h,  #3
+
+        mov             x3,  #32                          // 32 bytes = 16 int16_t = one innermost [16] coefficient array
+        st1             {v0.h}[0],  [x0], x3              // 16 single-halfword stores, one to element [0] of each of 16
+        st1             {v1.h}[0],  [x0], x3              // consecutive coefficient arrays; x0 += 32 after each store
+        st1             {v2.h}[0],  [x0], x3
+        st1             {v3.h}[0],  [x0], x3
+        st1             {v0.h}[1],  [x0], x3              // store order: lane 0 of v0..v3, then lane 1, lane 2, lane 3
+        st1             {v1.h}[1],  [x0], x3
+        st1             {v2.h}[1],  [x0], x3
+        st1             {v3.h}[1],  [x0], x3
+        st1             {v0.h}[2],  [x0], x3
+        st1             {v1.h}[2],  [x0], x3
+        st1             {v2.h}[2],  [x0], x3
+        st1             {v3.h}[2],  [x0], x3
+        st1             {v0.h}[3],  [x0], x3
+        st1             {v1.h}[3],  [x0], x3
+        st1             {v2.h}[3],  [x0], x3
+        st1             {v3.h}[3],  [x0], x3
+
+        ret
+endfunc
+
 function ff_vp8_idct_add_neon, export=1
         ld1             {v0.8b - v3.8b},  [x1]
         mov             w4,  #20091
@@ -102,6 +159,58 @@  function ff_vp8_idct_add_neon, export=1
         ret
 endfunc
 
+function ff_vp8_idct_dc_add4uv_neon, export=1             // x0 = dst, x1 = block[4][16], x2 = stride: add four rounded DC values to an 8x8 pixel area
+        movi            v0.4h,  #0                        // zero, used to clear each DC coefficient after reading it
+        mov             x3,     #32                       // 32 bytes = 16 int16_t = distance between block[i][0] and block[i+1][0]
+        ld1r            {v16.4h},  [x1]                   // v16 = block[0][0] replicated to 4 lanes
+        st1             {v0.h}[0], [x1], x3               // block[0][0] = 0, x1 -> block[1]
+        ld1r            {v17.4h},  [x1]                   // v17 = block[1][0] x4
+        st1             {v0.h}[0], [x1], x3               // block[1][0] = 0
+        ld1r            {v18.4h},  [x1]                   // v18 = block[2][0] x4
+        st1             {v0.h}[0], [x1], x3               // block[2][0] = 0
+        ld1r            {v19.4h},  [x1]                   // v19 = block[3][0] x4
+        st1             {v0.h}[0], [x1], x3               // block[3][0] = 0
+        ins             v16.d[1],  v17.d[0]               // v16 = { dc0 x4, dc1 x4 }: left/right halves of rows 0-3
+        ins             v18.d[1],  v19.d[0]               // v18 = { dc2 x4, dc3 x4 }: left/right halves of rows 4-7
+        mov             x3,  x0                           // x3 = write pointer; x0 is advanced by the row loads below
+        srshr           v16.8h,    v16.8h,  #3            // dc >>= 3
+        ld1             {v0.8b},   [x0], x2               // load row 0 (8 pixels), x0 += stride
+        srshr           v18.8h,    v18.8h,  #3            // same rounding shift for dc2/dc3
+        ld1             {v1.8b},   [x0], x2               // row 1
+        uaddw           v20.8h,    v16.8h, v0.8b          // row 0: dc + zero-extended pixels, as 16-bit lanes
+        ld1             {v2.8b},   [x0], x2               // row 2
+        uaddw           v0.8h,     v16.8h, v1.8b          // row 1 sum (reuses v0 now that row 0 is consumed)
+        ld1             {v3.8b},   [x0], x2               // row 3
+        uaddw           v22.8h,    v16.8h, v2.8b          // row 2 sum
+        ld1             {v4.8b},   [x0], x2               // row 4
+        uaddw           v2.8h,     v16.8h, v3.8b          // row 3 sum
+        ld1             {v5.8b},   [x0], x2               // row 5
+        uaddw           v24.8h,    v18.8h, v4.8b          // row 4 sum (rows 4-7 use v18)
+        ld1             {v6.8b},   [x0], x2               // row 6
+        uaddw           v4.8h,     v18.8h, v5.8b          // row 5 sum
+        ld1             {v7.8b},   [x0], x2               // row 7
+        uaddw           v26.8h,    v18.8h, v6.8b          // row 6 sum
+        sqxtun          v20.8b,    v20.8h                 // narrow row 0 to bytes with unsigned saturation
+        uaddw           v6.8h,     v18.8h, v7.8b          // row 7 sum
+        sqxtun          v21.8b,    v0.8h                  // narrow row 1
+        sqxtun          v22.8b,    v22.8h                 // narrow row 2
+        st1             {v20.8b},  [x3], x2               // write back row 0, x3 += stride
+        sqxtun          v23.8b,    v2.8h                  // narrow row 3
+        st1             {v21.8b},  [x3], x2               // write back row 1
+        sqxtun          v24.8b,    v24.8h                 // narrow row 4
+        st1             {v22.8b},  [x3], x2               // write back row 2
+        sqxtun          v25.8b,    v4.8h                  // narrow row 5
+        st1             {v23.8b},  [x3], x2               // write back row 3
+        sqxtun          v26.8b,    v26.8h                 // narrow row 6
+        st1             {v24.8b},  [x3], x2               // write back row 4
+        sqxtun          v27.8b,    v6.8h                  // narrow row 7
+        st1             {v25.8b},  [x3], x2               // write back row 5
+        st1             {v26.8b},  [x3], x2               // write back row 6
+        st1             {v27.8b},  [x3], x2               // write back row 7
+
+        ret
+endfunc
+
 function ff_vp8_idct_dc_add4y_neon, export=1
         movi            v0.16b,  #0
         mov             x3,  #32