From patchwork Fri Feb 1 09:12:40 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [01/19] libavcodec: vp8 neon optimizations for aarch64 X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64420 Message-Id: <1549012378-32118-1-git-send-email-martin@martin.st> To: libav-devel@libav.org Cc: =?utf-8?b?TWFnbnVzIFLDtsO2cw==?= Date: Fri, 1 Feb 2019 11:12:40 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development From: Magnus Röös Partial port of the ARM Neon for aarch64. Benchmarks from fate: benchmarking with Linux Perf Monitoring API nop: 58.6 checkasm: using random seed 1760970128 NEON: - vp8dsp.idct [OK] - vp8dsp.mc [OK] - vp8dsp.loopfilter [OK] checkasm: all 21 tests passed vp8_idct_add_c: 201.6 vp8_idct_add_neon: 83.1 vp8_idct_dc_add_c: 107.6 vp8_idct_dc_add_neon: 33.8 vp8_idct_dc_add4y_c: 426.4 vp8_idct_dc_add4y_neon: 59.4 vp8_loop_filter8uv_h_c: 688.1 vp8_loop_filter8uv_h_neon: 216.3 vp8_loop_filter8uv_inner_h_c: 649.3 vp8_loop_filter8uv_inner_h_neon: 195.3 vp8_loop_filter8uv_inner_v_c: 544.8 vp8_loop_filter8uv_inner_v_neon: 131.3 vp8_loop_filter8uv_v_c: 706.1 vp8_loop_filter8uv_v_neon: 141.1 vp8_loop_filter16y_h_c: 668.8 vp8_loop_filter16y_h_neon: 242.8 vp8_loop_filter16y_inner_h_c: 647.3 vp8_loop_filter16y_inner_h_neon: 224.6 vp8_loop_filter16y_inner_v_c: 647.8 vp8_loop_filter16y_inner_v_neon: 128.8 vp8_loop_filter16y_v_c: 721.8 vp8_loop_filter16y_v_neon: 154.3 vp8_loop_filter_simple_h_c: 387.8 vp8_loop_filter_simple_h_neon: 187.6 vp8_loop_filter_simple_v_c: 384.1 vp8_loop_filter_simple_v_neon: 78.6 vp8_put_epel8_h4v4_c: 3971.1 vp8_put_epel8_h4v4_neon: 855.1 vp8_put_epel8_h4v6_c: 5060.1 vp8_put_epel8_h4v6_neon: 989.6 vp8_put_epel8_h6v4_c: 4320.8 vp8_put_epel8_h6v4_neon: 1007.3 vp8_put_epel8_h6v6_c: 5449.3 vp8_put_epel8_h6v6_neon: 1158.1 vp8_put_epel16_h6_c: 6683.8 vp8_put_epel16_h6_neon: 831.8 vp8_put_epel16_h6v6_c: 11110.8 vp8_put_epel16_h6v6_neon: 2214.8 vp8_put_epel16_v6_c: 7024.8 vp8_put_epel16_v6_neon: 799.6 vp8_put_pixels8_c: 112.8 vp8_put_pixels8_neon: 78.1 vp8_put_pixels16_c: 131.3 vp8_put_pixels16_neon: 129.8 Signed-off-by: Magnus Röös --- libavcodec/aarch64/Makefile | 2 + libavcodec/aarch64/vp8dsp.h | 70 ++ libavcodec/aarch64/vp8dsp_init_aarch64.c | 81 +++ libavcodec/aarch64/vp8dsp_neon.S | 1031 ++++++++++++++++++++++++++++++ libavcodec/vp8dsp.c | 4 + libavcodec/vp8dsp.h | 2 + 6 files changed, 1190 insertions(+) create mode 100644 libavcodec/aarch64/vp8dsp.h create mode 100644 libavcodec/aarch64/vp8dsp_init_aarch64.c create mode 100644 libavcodec/aarch64/vp8dsp_neon.S diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 5c1d118..2555044 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -44,6 +44,8 @@ NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \ aarch64/synth_filter_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o +NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o \ + aarch64/vp8dsp_neon.o NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \ aarch64/vp9lpf_neon.o \ aarch64/vp9mc_neon.o diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h new file mode 100644 index 0000000..8a0c8fb --- /dev/null +++ b/libavcodec/aarch64/vp8dsp.h @@ -0,0 +1,70 @@ +/* + * This file is part of Libav. 
+ * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#ifndef AVCODEC_ARM_VP8DSP_H +#define AVCODEC_ARM_VP8DSP_H + +#include "libavcodec/vp8dsp.h" + +#define VP8_LF_Y(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter16##inner##_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_UV(hv, inner, opt) \ + void ff_vp8_##hv##_loop_filter8uv##inner##_##opt(uint8_t *dstU, \ + uint8_t *dstV, \ + ptrdiff_t stride, \ + int flim_E, int flim_I, \ + int hev_thresh) + +#define VP8_LF_SIMPLE(hv, opt) \ + void ff_vp8_##hv##_loop_filter16_simple_##opt(uint8_t *dst, \ + ptrdiff_t stride, \ + int flim) + +#define VP8_LF_HV(inner, opt) \ + VP8_LF_Y(h, inner, opt); \ + VP8_LF_Y(v, inner, opt); \ + VP8_LF_UV(h, inner, opt); \ + VP8_LF_UV(v, inner, opt) + +#define VP8_LF(opt) \ + VP8_LF_HV(, opt); \ + VP8_LF_HV(_inner, opt); \ + VP8_LF_SIMPLE(h, opt); \ + VP8_LF_SIMPLE(v, opt) + +#define VP8_MC(n, opt) \ + void ff_put_vp8_##n##_##opt(uint8_t *dst, ptrdiff_t dststride, \ + uint8_t *src, ptrdiff_t srcstride, \ + int h, int x, int y) + +#define VP8_EPEL(w, opt) \ + VP8_MC(pixels ## w, opt); \ + VP8_MC(epel ## w ## _h4, opt); \ + VP8_MC(epel ## w ## _h6, opt); \ + VP8_MC(epel ## w ## _v4, opt); \ + VP8_MC(epel ## w ## _h4v4, opt); \ + VP8_MC(epel ## w ## _h6v4, opt); \ + VP8_MC(epel ## w ## _v6, opt); \ + VP8_MC(epel ## w ## _h4v6, opt); \ + VP8_MC(epel ## w ## _h6v6, opt) + +#endif /* AVCODEC_ARM_VP8DSP_H */ diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c new file mode 100644 index 0000000..f93bcfa --- /dev/null +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -0,0 +1,81 @@ +/* + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include +#include +#include +#include +#include + +#include "libavutil/attributes.h" +#include "libavutil/aarch64/cpu.h" +#include "libavcodec/vp8dsp.h" +#include "vp8dsp.h" + +void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); + +void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); + +VP8_LF(neon); + +VP8_EPEL(16, neon); +VP8_EPEL(8, neon); + + +av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) +{ + if (!have_neon(av_get_cpu_flags())) { + return; + } + dsp->put_vp8_epel_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + dsp->put_vp8_epel_pixels_tab[0][0][2] = ff_put_vp8_epel16_h6_neon; + dsp->put_vp8_epel_pixels_tab[0][2][0] = ff_put_vp8_epel16_v6_neon; + dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; +} + +av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) +{ + if (!have_neon(av_get_cpu_flags())) { + return; + } + + dsp->vp8_idct_add = ff_vp8_idct_add_neon; + dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; + dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; + + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; + dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; + dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon; + dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon; + + dsp->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16_inner_neon; + dsp->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16_inner_neon; + dsp->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_neon; + dsp->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_neon; + + dsp->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter16_simple_neon; + dsp->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter16_simple_neon; +} diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S new file mode 100644 index 0000000..771877c --- /dev/null +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -0,0 +1,1031 @@ +/* + * VP8 NEON optimisations + * + * Copyright (c) 2010 Rob Clark + * Copyright (c) 2011 Mans Rullgard + * Copyright (c) 2018 Magnus Röös + * + * This file is part of Libav. + * + * Libav is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * Libav is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public + * License along with Libav; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + */ + +#include "libavutil/aarch64/asm.S" +#include "neon.S" + +function ff_vp8_idct_add_neon, export=1 + ld1 {v0.8b - v3.8b}, [x1] + mov w4, #20091 + movk w4, #35468/2, lsl 16 + dup v4.2s, w4 + + smull v26.4s, v1.4h, v4.4h[0] + smull v27.4s, v3.4h, v4.4h[0] + sqdmulh v20.4h, v1.4h, v4.4h[1] + sqdmulh v23.4h, v3.4h, v4.4h[1] + sqshrn v21.4h, v26.4s, #16 + sqshrn v22.4h, v27.4s, #16 + add v21.4h, v21.4h, v1.4h + add v22.4h, v22.4h, v3.4h + + add v16.4h, v0.4h, v2.4h + sub v17.4h, v0.4h, v2.4h + + add v18.4h, v21.4h, v23.4h + sub v19.4h, v20.4h, v22.4h + + add v0.4h, v16.4h, v18.4h + add v1.4h, v17.4h, v19.4h + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + + transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7 + + movi v29.8h, #0 + smull v26.4s, v1.4h, v4.4h[0] + st1 {v29.8h}, [x1], #16 + smull v27.4s, v3.4h, v4.4h[0] + st1 {v29.16b}, [x1] + sqdmulh v21.4h, v1.4h, v4.4h[1] + sqdmulh v23.4h, v3.4h, v4.4h[1] + sqshrn v20.4h, v26.4s, #16 + sqshrn v22.4h, v27.4s, #16 + add v20.4h, v20.4h, v1.4h + add v22.4h, v22.4h, v3.4h + add v16.4h, v0.4h, v2.4h + sub v17.4h, v0.4h, v2.4h + + add v18.4h, v20.4h, v23.4h + ld1 {v24.d}[0], [x0], x2 + zip1 v16.2d, v16.2d, v17.2d + sub v19.4h, v21.4h, v22.4h + ld1 {v25.d}[0], [x0], x2 + zip1 v18.2d, v18.2d, v19.2d + add v0.8h, v16.8h, v18.8h + ld1 {v25.d}[1], [x0], x2 + sub v1.8h, v16.8h, v18.8h + ld1 {v24.d}[1], [x0], x2 + srshr v0.8h, v0.8h, #3 + trn1 v24.4s, v24.4s, v25.4s + srshr v1.8h, v1.8h, #3 + sub x0, x0, x2, lsl #2 + + ext v1.16b, v1.16b, v1.16b, #8 + trn1 v3.2d, v0.2d, v1.2d + trn2 v0.2d, v0.2d, v1.2d + trn1 v1.8h, v3.8h, v0.8h + trn2 v3.8h, v3.8h, v0.8h + uzp1 v0.4s, v1.4s, v3.4s + uzp2 v1.4s, v3.4s, v1.4s + + uaddw v0.8h, v0.8h, v24.8b + uaddw2 v1.8h, v1.8h, v24.16b + sqxtun v0.8b, v0.8h + sqxtun2 v0.16b, v1.8h + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v0.s}[3], [x0], x2 + st1 {v0.s}[2], [x0], x2 + + ret +endfunc + +function ff_vp8_idct_dc_add4y_neon, export=1 + movi v0.16b, #0 + mov x3, #32 + ld1r {v16.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v17.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + zip1 v16.2d, v16.2d, v17.2d + ld1r {v18.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v19.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + zip1 v18.2d, v18.2d, v19.2d + srshr v16.8h, v16.8h, #3 // dc >>= 3 + ld1 {v0.16b}, [x0], x2 + srshr v18.8h, v18.8h, #3 + ld1 {v1.16b}, [x0], x2 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v2.16b}, [x0], x2 + uaddw2 v0.8h, v18.8h, v0.16b + ld1 {v3.16b}, [x0], x2 + uaddw v21.8h, v16.8h, v1.8b + uaddw2 v1.8h, v18.8h, v1.16b + uaddw v22.8h, v16.8h, v2.8b + uaddw2 v2.8h, v18.8h, v2.16b + uaddw v23.8h, v16.8h, v3.8b + uaddw2 v3.8h, v18.8h, v3.16b + sub x0, x0, x2, lsl #2 + sqxtun v20.8b, v20.8h + sqxtun2 v20.16b, v0.8h + sqxtun v21.8b, v21.8h + sqxtun2 v21.16b, v1.8h + sqxtun v22.8b, v22.8h + st1 {v20.16b}, [x0], x2 + sqxtun2 v22.16b, v2.8h + st1 {v21.16b}, [x0], x2 + sqxtun v23.8b, v23.8h + st1 {v22.16b}, [x0], x2 + sqxtun2 v23.16b, v3.8h + st1 {v23.16b}, [x0], x2 + + ret +endfunc + +function ff_vp8_idct_dc_add_neon, export=1 + mov w3, #0 + ld1r {v2.8h}, [x1] + strh w3, [x1] + srshr v2.8h, v2.8h, #3 + ld1 {v0.s}[0], [x0], x2 + ld1 {v0.s}[1], [x0], x2 + uaddw v3.8h, v2.8h, v0.8b + ld1 {v1.s}[0], [x0], x2 + ld1 {v1.s}[1], [x0], x2 + uaddw v4.8h, v2.8h, v1.8b + sqxtun v0.8b, v3.8h + sqxtun v1.8b, v4.8h + sub x0, x0, x2, 
lsl #2 + st1 {v0.s}[0], [x0], x2 + st1 {v0.s}[1], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v1.s}[1], [x0], x2 + ret +endfunc + +// Register layout: +// P3..Q3 -> v0..v7 +// flim_E -> v22 +// flim_I -> v23 +// hev_thresh -> x5 +// +.macro vp8_loop_filter, inner=0, simple=0, hev_thresh + .if \simple + uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0) + uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1) + uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2 + ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2 + uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + movi v21.16b, #0x80 + cmhs v16.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim + .else + // calculate hev and normal_limit: + uabd v20.16b, v2.16b, v3.16b // abs(P1-P0) + uabd v21.16b, v5.16b, v4.16b // abs(Q1-Q0) + uabd v18.16b, v0.16b, v1.16b // abs(P3-P2) + uabd v19.16b, v1.16b, v2.16b // abs(P2-P1) + cmhs v16.16b, v23.16b, v20.16b // abs(P1-P0) <= flim_I + cmhs v17.16b, v23.16b, v21.16b // abs(Q1-Q0) <= flim_I + cmhs v18.16b, v23.16b, v18.16b // abs(P3-P2) <= flim_I + cmhs v19.16b, v23.16b, v19.16b // abs(P2-P1) <= flim_I + and v16.16b, v17.16b, v16.16b + uabd v17.16b, v7.16b, v6.16b // abs(Q3-Q2) + and v16.16b, v16.16b, v19.16b + uabd v19.16b, v6.16b, v5.16b // abs(Q2-Q1) + and v16.16b, v16.16b, v18.16b + cmhs v18.16b, v23.16b, v17.16b // abs(Q3-Q2) <= flim_I + cmhs v19.16b, v23.16b, v19.16b // abs(Q2-Q1) <= flim_I + uabd v17.16b, v3.16b, v4.16b // abs(P0-Q0) + uabd v23.16b, v2.16b, v5.16b // abs(P1-Q1) + and v16.16b, v16.16b, v18.16b + uqadd v17.16b, v17.16b, v17.16b // abs(P0-Q0) * 2 + and v16.16b, v16.16b, v19.16b + ushr v18.16b, v23.16b, #1 // abs(P1-Q1) / 2 + dup v23.16b, \hev_thresh // hev_thresh + uqadd v19.16b, v17.16b, v18.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) + cmhi v20.16b, v20.16b, v23.16b // abs(P1-P0) > hev_thresh + cmhs v19.16b, v22.16b, v19.16b // (abs(P0-Q0)*2) + (abs(P1-Q1)/2) <= flim_E + cmhi v22.16b, v21.16b, v23.16b // abs(Q1-Q0) > hev_thresh + and v16.16b, v16.16b, v19.16b + movi v21.16b, #0x80 + orr v17.16b, v20.16b, v22.16b + .endif + + // at this point: + // v16: normal_limit + // v17: hev + + // convert to signed value: + eor v3.16b, v3.16b, v21.16b // PS0 = P0 ^ 0x80 + eor v4.16b, v4.16b, v21.16b // QS0 = Q0 ^ 0x80 + + movi v20.8h, #3 + ssubl v18.8h, v4.8b, v3.8b // QS0 - PS0 + ssubl2 v19.8h, v4.16b, v3.16b // (widened to 16bit) + eor v2.16b, v2.16b, v21.16b // PS1 = P1 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // QS1 = Q1 ^ 0x80 + mul v18.8h, v18.8h, v20.8h // w = 3 * (QS0 - PS0) + mul v19.8h, v19.8h, v20.8h + + sqsub v20.16b, v2.16b, v5.16b // clamp(PS1-QS1) + movi v22.16b, #4 + movi v23.16b, #3 + .if \inner + and v20.16b, v20.16b, v17.16b // if(hev) w += clamp(PS1-QS1) + .endif + saddw v18.8h, v18.8h, v20.8b // w += clamp(PS1-QS1) + saddw2 v19.8h, v19.8h, v20.16b + sqxtn v18.8b, v18.8h // narrow result back into v18 + sqxtn2 v18.16b, v19.8h + .if !\inner && !\simple + eor v1.16b, v1.16b, v21.16b // PS2 = P2 ^ 0x80 + eor v6.16b, v6.16b, v21.16b // QS2 = Q2 ^ 0x80 + .endif + and v18.16b, v18.16b, v16.16b // w &= normal_limit + + // registers used at this point.. 
+ // v0 -> P3 (don't corrupt) + // v1-v6 -> PS2-QS2 + // v7 -> Q3 (don't corrupt) + // v17 -> hev + // v18 -> w + // v21 -> #0x80 + // v22 -> #4 + // v23 -> #3 + // v16, v19, v29 -> unused + // + // filter_common: is4tap==1 + // c1 = clamp(w + 4) >> 3; + // c2 = clamp(w + 3) >> 3; + // Q0 = s2u(QS0 - c1); + // P0 = s2u(PS0 + c2); + + .if \simple + sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + .elseif \inner + // the !is4tap case of filter_common, only used for inner blocks + // c3 = ((c1&~hev) + 1) >> 1; + // Q1 = s2u(QS1 - c3); + // P1 = s2u(PS1 + c3); + sqadd v19.16b, v18.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v18.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + bic v19.16b, v19.16b, v17.16b // c1 & ~hev + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + srshr v19.16b, v19.16b, #1 // c3 >>= 1 + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-c3) + sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+c3) + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + .else + and v20.16b, v18.16b, v17.16b // w & hev + sqadd v19.16b, v20.16b, v22.16b // c1 = clamp((w&hev)+4) + sqadd v20.16b, v20.16b, v23.16b // c2 = clamp((w&hev)+3) + sshr v19.16b, v19.16b, #3 // c1 >>= 3 + sshr v20.16b, v20.16b, #3 // c2 >>= 3 + bic v18.16b, v18.16b, v17.16b // w &= ~hev + sqsub v4.16b, v4.16b, v19.16b // QS0 = clamp(QS0-c1) + sqadd v3.16b, v3.16b, v20.16b // PS0 = clamp(PS0+c2) + + // filter_mbedge: + // a = clamp((27*w + 63) >> 7); + // Q0 = s2u(QS0 - a); + // P0 = s2u(PS0 + a); + // a = clamp((18*w + 63) >> 7); + // Q1 = s2u(QS1 - a); + // P1 = s2u(PS1 + a); + // a = clamp((9*w + 63) >> 7); + // Q2 = s2u(QS2 - a); + // P2 = s2u(PS2 + a); + movi v17.8h, #63 + sshll v22.8h, v18.8b, #3 + sshll2 v23.8h, v18.16b, #3 + saddw v22.8h, v22.8h, v18.8b + saddw2 v23.8h, v23.8h, v18.16b + add v16.8h, v17.8h, v22.8h + add v17.8h, v17.8h, v23.8h // 9*w + 63 + add v19.8h, v16.8h, v22.8h + add v20.8h, v17.8h, v23.8h // 18*w + 63 + add v22.8h, v19.8h, v22.8h + add v23.8h, v20.8h, v23.8h // 27*w + 63 + sqshrn v16.8b, v16.8h, #7 + sqshrn2 v16.16b, v17.8h, #7 // clamp(( 9*w + 63)>>7) + sqshrn v19.8b, v19.8h, #7 + sqshrn2 v19.16b, v20.8h, #7 // clamp((18*w + 63)>>7) + sqshrn v22.8b, v22.8h, #7 + sqshrn2 v22.16b, v23.8h, #7 // clamp((27*w + 63)>>7) + sqadd v1.16b, v1.16b, v16.16b // PS2 = clamp(PS2+a) + sqsub v6.16b, v6.16b, v16.16b // QS2 = clamp(QS2-a) + sqadd v2.16b, v2.16b, v19.16b // PS1 = clamp(PS1+a) + sqsub v5.16b, v5.16b, v19.16b // QS1 = clamp(QS1-a) + sqadd v3.16b, v3.16b, v22.16b // PS0 = clamp(PS0+a) + sqsub v4.16b, v4.16b, v22.16b // QS0 = clamp(QS0-a) + eor v3.16b, v3.16b, v21.16b // P0 = PS0 ^ 0x80 + eor v4.16b, v4.16b, v21.16b // Q0 = QS0 ^ 0x80 + eor v2.16b, v2.16b, v21.16b // P1 = PS1 ^ 0x80 + eor v5.16b, v5.16b, v21.16b // Q1 = QS1 ^ 0x80 + eor v1.16b, v1.16b, v21.16b // P2 = PS2 ^ 
0x80 + eor v6.16b, v6.16b, v21.16b // Q2 = QS2 ^ 0x80 + .endif +.endm + +.macro vp8_v_loop_filter16 name, inner=0, simple=0 +function ff_vp8_v_loop_filter16\name\()_neon, export=1 + sub x0, x0, x1, lsl #1+!\simple + + // Load pixels: + .if !\simple + ld1 {v0.16b}, [x0], x1 // P3 + ld1 {v1.16b}, [x0], x1 // P2 + .endif + ld1 {v2.16b}, [x0], x1 // P1 + ld1 {v3.16b}, [x0], x1 // P0 + ld1 {v4.16b}, [x0], x1 // Q0 + ld1 {v5.16b}, [x0], x1 // Q1 + .if !\simple + ld1 {v6.16b}, [x0], x1 // Q2 + ld1 {v7.16b}, [x0] // Q3 + dup v23.16b, w3 // flim_I + .endif + dup v22.16b, w2 // flim_E + + vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4 + + // back up to P2: dst -= stride * 6 + sub x0, x0, x1, lsl #2 + .if !\simple + sub x0, x0, x1, lsl #1 + + // Store pixels: + st1 {v1.16b}, [x0], x1 // P2 + .endif + st1 {v2.16b}, [x0], x1 // P1 + st1 {v3.16b}, [x0], x1 // P0 + st1 {v4.16b}, [x0], x1 // Q0 + st1 {v5.16b}, [x0], x1 // Q1 + .if !\simple + st1 {v6.16b}, [x0] // Q2 + .endif + + ret +endfunc +.endm + +vp8_v_loop_filter16 +vp8_v_loop_filter16 _inner, inner=1 +vp8_v_loop_filter16 _simple, simple=1 + +.macro vp8_v_loop_filter8uv name, inner=0 +function ff_vp8_v_loop_filter8uv\name\()_neon, export=1 + sub x0, x0, x2, lsl #2 + sub x1, x1, x2, lsl #2 + // Load pixels: + ld1 {v0.d}[0], [x0], x2 // P3 + ld1 {v0.d}[1], [x1], x2 // P3 + ld1 {v1.d}[0], [x0], x2 // P2 + ld1 {v1.d}[1], [x1], x2 // P2 + ld1 {v2.d}[0], [x0], x2 // P1 + ld1 {v2.d}[1], [x1], x2 // P1 + ld1 {v3.d}[0], [x0], x2 // P0 + ld1 {v3.d}[1], [x1], x2 // P0 + ld1 {v4.d}[0], [x0], x2 // Q0 + ld1 {v4.d}[1], [x1], x2 // Q0 + ld1 {v5.d}[0], [x0], x2 // Q1 + ld1 {v5.d}[1], [x1], x2 // Q1 + ld1 {v6.d}[0], [x0], x2 // Q2 + ld1 {v6.d}[1], [x1], x2 // Q2 + ld1 {v7.d}[0], [x0] // Q3 + ld1 {v7.d}[1], [x1] // Q3 + + dup v22.16b, w3 // flim_E + dup v23.16b, w4 // flim_I + + vp8_loop_filter inner=\inner, hev_thresh=w5 + + // back up to P2: u,v -= stride * 6 + sub x0, x0, x2, lsl #2 + sub x1, x1, x2, lsl #2 + sub x0, x0, x2, lsl #1 + sub x1, x1, x2, lsl #1 + + // Store pixels: + + st1 {v1.d}[0], [x0], x2 // P2 + st1 {v1.d}[1], [x1], x2 // P2 + st1 {v2.d}[0], [x0], x2 // P1 + st1 {v2.d}[1], [x1], x2 // P1 + st1 {v3.d}[0], [x0], x2 // P0 + st1 {v3.d}[1], [x1], x2 // P0 + st1 {v4.d}[0], [x0], x2 // Q0 + st1 {v4.d}[1], [x1], x2 // Q0 + st1 {v5.d}[0], [x0], x2 // Q1 + st1 {v5.d}[1], [x1], x2 // Q1 + st1 {v6.d}[0], [x0] // Q2 + st1 {v6.d}[1], [x1] // Q2 + + ret +endfunc +.endm + +vp8_v_loop_filter8uv +vp8_v_loop_filter8uv _inner, inner=1 + +.macro vp8_h_loop_filter16 name, inner=0, simple=0 +function ff_vp8_h_loop_filter16\name\()_neon, export=1 + + sub x0, x0, #4 + // Load pixels: + ld1 {v0.d}[0], [x0], x1 + ld1 {v1.d}[0], [x0], x1 + ld1 {v2.d}[0], [x0], x1 + ld1 {v3.d}[0], [x0], x1 + ld1 {v4.d}[0], [x0], x1 + ld1 {v5.d}[0], [x0], x1 + ld1 {v6.d}[0], [x0], x1 + ld1 {v7.d}[0], [x0], x1 + ld1 {v0.d}[1], [x0], x1 + ld1 {v1.d}[1], [x0], x1 + ld1 {v2.d}[1], [x0], x1 + ld1 {v3.d}[1], [x0], x1 + ld1 {v4.d}[1], [x0], x1 + ld1 {v5.d}[1], [x0], x1 + ld1 {v6.d}[1], [x0], x1 + ld1 {v7.d}[1], [x0], x1 + + transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + dup v22.16b, w2 // flim_E + .if !\simple + dup v23.16b, w3 // flim_I + .endif + + vp8_loop_filter inner=\inner, simple=\simple, hev_thresh=w4 + + sub x0, x0, x1, lsl #4 // backup 16 rows + + transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + // Store pixels: + st1 {v0.d}[0], [x0], x1 + st1 {v1.d}[0], [x0], x1 + st1 {v2.d}[0], [x0], x1 + st1 {v3.d}[0], [x0], x1 + st1 {v4.d}[0], [x0], x1 + st1 
{v5.d}[0], [x0], x1 + st1 {v6.d}[0], [x0], x1 + st1 {v7.d}[0], [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.d}[1], [x0], x1 + st1 {v2.d}[1], [x0], x1 + st1 {v3.d}[1], [x0], x1 + st1 {v4.d}[1], [x0], x1 + st1 {v5.d}[1], [x0], x1 + st1 {v6.d}[1], [x0], x1 + st1 {v7.d}[1], [x0] + + ret +endfunc +.endm + +vp8_h_loop_filter16 +vp8_h_loop_filter16 _inner, inner=1 +vp8_h_loop_filter16 _simple, simple=1 + +.macro vp8_h_loop_filter8uv name, inner=0 +function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 + sub x0, x0, #4 + sub x1, x1, #4 + + // Load pixels: + ld1 {v0.d}[0], [x0], x2 // load u + ld1 {v0.d}[1], [x1], x2 // load v + ld1 {v1.d}[0], [x0], x2 + ld1 {v1.d}[1], [x1], x2 + ld1 {v2.d}[0], [x0], x2 + ld1 {v2.d}[1], [x1], x2 + ld1 {v3.d}[0], [x0], x2 + ld1 {v3.d}[1], [x1], x2 + ld1 {v4.d}[0], [x0], x2 + ld1 {v4.d}[1], [x1], x2 + ld1 {v5.d}[0], [x0], x2 + ld1 {v5.d}[1], [x1], x2 + ld1 {v6.d}[0], [x0], x2 + ld1 {v6.d}[1], [x1], x2 + ld1 {v7.d}[0], [x0], x2 + ld1 {v7.d}[1], [x1], x2 + + transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + dup v22.16b, w3 // flim_E + dup v23.16b, w4 // flim_I + + vp8_loop_filter inner=\inner, hev_thresh=w5 + + sub x0, x0, x2, lsl #3 // backup u 8 rows + sub x1, x1, x2, lsl #3 // backup v 8 rows + + transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + + // Store pixels: + st1 {v0.d}[0], [x0], x2 // load u + st1 {v0.d}[1], [x1], x2 // load v + st1 {v1.d}[0], [x0], x2 + st1 {v1.d}[1], [x1], x2 + st1 {v2.d}[0], [x0], x2 + st1 {v2.d}[1], [x1], x2 + st1 {v3.d}[0], [x0], x2 + st1 {v3.d}[1], [x1], x2 + st1 {v4.d}[0], [x0], x2 + st1 {v4.d}[1], [x1], x2 + st1 {v5.d}[0], [x0], x2 + st1 {v5.d}[1], [x1], x2 + st1 {v6.d}[0], [x0], x2 + st1 {v6.d}[1], [x1], x2 + st1 {v7.d}[0], [x0] + st1 {v7.d}[1], [x1] + + ret + +endfunc +.endm + +vp8_h_loop_filter8uv +vp8_h_loop_filter8uv _inner, inner=1 + + +function ff_put_vp8_pixels16_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.16b}, [x2], x3 + ld1 {v1.16b}, [x2], x3 + ld1 {v2.16b}, [x2], x3 + ld1 {v3.16b}, [x2], x3 + st1 {v0.16b}, [x0], x1 + st1 {v1.16b}, [x0], x1 + st1 {v2.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + bgt 1b + ret +endfunc + +function ff_put_vp8_pixels8_neon, export=1 +1: + subs w4, w4, #4 + ld1 {v0.8b}, [x2], x3 + ld1 {v0.d}[1], [x2], x3 + ld1 {v1.8b}, [x2], x3 + ld1 {v1.d}[1], [x2], x3 + st1 {v0.8b}, [x0], x1 + st1 {v0.d}[1], [x0], x1 + st1 {v1.8b}, [x0], x1 + st1 {v1.d}[1], [x0], x1 + bgt 1b + ret +endfunc + +/* 4/6-tap 8th-pel MC */ + +.macro vp8_epel8_h6 d, s0, s1 + ext v22.8b, \s0\().8b, \s1\().8b, #1 + uxtl v18.8h, \s0\().8b + ext v23.8b, \s0\().8b, \s1\().8b, #2 + uxtl v19.8h, v22.8b + ext v24.8b, \s0\().8b, \s1\().8b, #3 + uxtl v21.8h, v23.8b + ext v25.8b, \s0\().8b, \s1\().8b, #4 + uxtl v22.8h, v24.8b + ext v26.8b, \s0\().8b, \s1\().8b, #5 + uxtl v25.8h, v25.8b + mul v21.8h, v21.8h, v0.8h[2] + uxtl v26.8h, v26.8b + mul v22.8h, v22.8h, v0.8h[3] + mls v21.8h, v19.8h, v0.8h[1] + mls v22.8h, v25.8h, v0.8h[4] + mla v21.8h, v18.8h, v0.8h[0] + mla v22.8h, v26.8h, v0.8h[5] + sqadd v22.8h, v21.8h, v22.8h + sqrshrun \d\().8b, v22.8h, #7 +.endm + +.macro vp8_epel16_h6 d0, v0, v1 + ext v22.16b, \v0\().16b, \v1\().16b, #3 + ext v23.16b, \v0\().16b, \v1\().16b, #4 + uxtl v19.8h, v22.8b + uxtl2 v22.8h, v22.16b + ext v3.16b, \v0\().16b, \v1\().16b, #2 + uxtl v20.8h, v23.8b + uxtl2 v23.8h, v23.16b + ext v16.16b, \v0\().16b, \v1\().16b, #1 + uxtl v18.8h, v3.8b + uxtl2 v3.8h, v3.16b + ext v2.16b, \v0\().16b, \v1\().16b, #5 + uxtl v21.8h, v2.8b + uxtl2 v2.8h, v2.16b + uxtl v17.8h, v16.8b + uxtl2 v16.8h, v16.16b + mul 
v19.8h, v19.8h, v0.8h[3] + mul v18.8h, v18.8h, v0.8h[2] + mul v3.8h, v3.8h, v0.8h[2] + mul v22.8h, v22.8h, v0.8h[3] + mls v19.8h, v20.8h, v0.8h[4] + uxtl v20.8h, \v0\().8b + uxtl2 v1.8h, \v0\().16b + mls v18.8h, v17.8h, v0.8h[1] + mls v3.8h, v16.8h, v0.8h[1] + mls v22.8h, v23.8h, v0.8h[4] + mla v18.8h, v20.8h, v0.8h[0] + mla v19.8h, v21.8h, v0.8h[5] + mla v3.8h, v1.8h, v0.8h[0] + mla v22.8h, v2.8h, v0.8h[5] + sqadd v19.8h, v18.8h, v19.8h + sqadd v22.8h, v3.8h, v22.8h + sqrshrun \d0\().8b, v19.8h, #7 + sqrshrun2 \d0\().16b, v22.8h, #7 +.endm + +.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 + uxtl \s2\().8h, \s2\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s4\().8h, \s4\().8b + uxtl \s0\().8h, \s0\().8b + uxtl \s5\().8h, \s5\().8b + mul \s2\().8h, \s2\().8h, v0.8h[2] + mul \s3\().8h, \s3\().8h, v0.8h[3] + mls \s2\().8h, \s1\().8h, v0.8h[1] + mls \s3\().8h, \s4\().8h, v0.8h[4] + mla \s2\().8h, \s0\().8h, v0.8h[0] + mla \s3\().8h, \s5\().8h, v0.8h[5] + sqadd \s3\().8h, \s2\().8h, \s3\().8h + sqrshrun \d0\().8b, \s3\().8h, #7 +.endm + +.macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 + uxtl \s0\().8h, \s0\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s6\().8h, \s6\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s4\().8h, \s4\().8b + uxtl \s2\().8h, \s2\().8b + uxtl \s5\().8h, \s5\().8b + mul \s0\().8h, \s0\().8h, v0.8h[0] + mul v31.8h , \s3\().8h, v0.8h[3] + mul \s3\().8h, \s3\().8h, v0.8h[2] + mul \s6\().8h, \s6\().8h, v0.8h[5] + + mls \s0\().8h, \s1\().8h, v0.8h[1] + mls v31.8h , \s4\().8h, v0.8h[4] + mls \s3\().8h, \s2\().8h, v0.8h[1] + mls \s6\().8h, \s5\().8h, v0.8h[4] + + mla \s0\().8h, \s2\().8h, v0.8h[2] + mla v31.8h , \s5\().8h, v0.8h[5] + mla \s3\().8h, \s1\().8h, v0.8h[0] + mla \s6\().8h, \s4\().8h, v0.8h[3] + sqadd v31.8h , \s0\().8h, v31.8h + sqadd \s6\().8h, \s3\().8h, \s6\().8h + sqrshrun \d0\().8b, v31.8h, #7 + sqrshrun \d1\().8b, \s6\().8h, #7 +.endm + +.macro vp8_epel8_h4 d, v0, v1 + ext v22.8b, \v0\().8b, \v1\().8b, #1 + uxtl v19.8h, \v0\().8b + ext v23.8b, \v0\().8b, \v1\().8b, #2 + uxtl v20.8h, v22.8b + ext v25.8b, \v0\().8b, \v1\().8b, #3 + uxtl v22.8h, v23.8b + uxtl v25.8h, v25.8b + mul v20.8h, v20.8h, v0.8h[2] + mul v22.8h, v22.8h, v0.8h[3] + mls v20.8h, v19.8h, v0.8h[1] + mls v22.8h, v25.8h, v0.8h[4] + sqadd v22.8h, v20.8h, v22.8h + sqrshrun \d\().8b, v22.8h, #7 +.endm + +.macro vp8_epel8_v4_y2 d0, s0, s1, s2, s3, s4 + uxtl \s0\().8h, \s0\().8b + uxtl \s1\().8h, \s1\().8b + uxtl \s2\().8h, \s2\().8b + uxtl \s3\().8h, \s3\().8b + uxtl \s4\().8h, \s4\().8b + mul v21.8h, \s1\().8h, v0.8h[2] + mul v23.8h, \s2\().8h, v0.8h[3] + mul \s2\().8h, \s2\().8h, v0.8h[2] + mul v22.8h, \s3\().8h, v0.8h[3] + mls v21.8h, \s0\().8h, v0.8h[1] + mls v23.8h, \s3\().8h, v0.8h[4] + mls \s2\().8h, \s1\().8h, v0.8h[1] + mls v22.8h, \s4\().8h, v0.8h[4] + sqadd v21.8h, v21.8h, v23.8h + sqadd \s2\().8h, \s2\().8h, v22.8h + sqrshrun \d0\().8b, v21.8h, #7 + sqrshrun2 \d0\().16b, \s2\().8h, #7 +.endm + + +// note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit +// arithmatic can be used to apply filters +const subpel_filters, align=4 + .short 0, 6, 123, 12, 1, 0, 0, 0 + .short 2, 11, 108, 36, 8, 1, 0, 0 + .short 0, 9, 93, 50, 6, 0, 0, 0 + .short 3, 16, 77, 77, 16, 3, 0, 0 + .short 0, 6, 50, 93, 9, 0, 0, 0 + .short 1, 8, 36, 108, 11, 2, 0, 0 + .short 0, 1, 12, 123, 6, 0, 0, 0 +endconst + +function ff_put_vp8_epel16_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + sxtw x4, w4 + sxtw x6, w6 + movrel x17, subpel_filters-16 + add x6, x17, x6, lsl #4 // y + ld1 
{v0.8h}, [x6] +1: + ld1 {v1.1d - v2.1d}, [x2], x3 + ld1 {v3.1d - v4.1d}, [x2], x3 + ld1 {v16.1d - v17.1d}, [x2], x3 + ld1 {v18.1d - v19.1d}, [x2], x3 + ld1 {v20.1d - v21.1d}, [x2], x3 + ld1 {v22.1d - v23.1d}, [x2], x3 + ld1 {v24.1d - v25.1d}, [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 + vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 + + st1 {v1.1d - v2.1d}, [x0], x1 + st1 {v3.1d - v4.1d}, [x0], x1 + subs x4, x4, #2 + bne 1b + + ret +endfunc + +function ff_put_vp8_epel16_h6_neon, export=1 + sub x2, x2, #2 + sxtw x5, w5 // x + + // first pass (horizontal): + movrel x17, subpel_filters-16 + add x5, x17, x5, lsl #4 // x + ld1 {v0.8h}, [x5] +1: + ld1 {v1.16b, v2.16b}, [x2], x3 + vp8_epel16_h6 v1, v1, v2 + st1 {v1.16b}, [x0], x1 + + subs w4, w4, #1 + bne 1b + ret +endfunc + + +function ff_put_vp8_epel16_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + + // first pass (horizontal): + movrel x17, subpel_filters-16 + sxtw x5, w5 // x + add x16, x17, x5, lsl #4 // x + sub sp, sp, #336+16 + ld1 {v0.8h}, [x16] + add x7, sp, #15 + sxtw x4, w4 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.16b, v2.16b}, [x2], x3 + vp8_epel16_h6 v1, v1, v2 + st1 {v1.16b}, [x7], #16 + subs x16, x16, #1 + bne 1b + + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v16.8b - v19.8b}, [x7], #32 + ld1 {v20.8b - v23.8b}, [x7] + sub x7, x7, #48 + + vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22 + vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23 + trn1 v2.2d, v5.2d, v2.2d + + st1 {v2.16b}, [x0], x1 + subs x4, x4, #1 + bne 2b + + add sp, sp, #336+16 + ret +endfunc + +function ff_put_vp8_epel8_h6v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #2 + sxtw x4, w4 + + // first pass (horizontal): + movrel x17, subpel_filters-16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h6 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v5.8b - v7.8b}, [x7] + + sub x7, x7, #16 + + vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7 + + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc + +function ff_put_vp8_epel8_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + sxtw x4, w4 + + // first pass (horizontal): + movrel x17, subpel_filters-16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #5 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h4 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v4.8b}, [x7], #32 + ld1 {v5.8b - v7.8b}, [x7] + + sub x7, x7, #16 + + vp8_epel8_v6_y2 v1, v2, v1, v2, v3, v4, v5, v6, v7 + + st1 {v1.8b}, [x0], x1 + st1 {v2.8b}, [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc + +function ff_put_vp8_epel8_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + sxtw x4, w4 + + + // first pass (horizontal): + movrel x17, subpel_filters-16 + 
sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #3 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h4 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v2.8b}, [x7], #16 + ld1 {v3.8b - v5.8b}, [x7] + + vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5 + + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc + +function ff_put_vp8_epel8_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + sxtw x4, w4 + + + // first pass (horizontal): + movrel x17, subpel_filters-16 + sxtw x5, w5 + add x5, x17, x5, lsl #4 // x + sub sp, sp, #168+16 + ld1 {v0.8h}, [x5] + add x7, sp, #15 + add x16, x4, #3 // h + bic x7, x7, #15 +1: + ld1 {v1.8b, v2.8b}, [x2], x3 + + vp8_epel8_h6 v1, v1, v2 + + st1 {v1.8b}, [x7], #8 + subs x16, x16, #1 + bne 1b + + // second pass (vertical): + sxtw x6, w6 + add x6, x17, x6, lsl #4 // y + add x7, sp, #15 + ld1 {v0.8h}, [x6] + bic x7, x7, #15 +2: + ld1 {v1.8b - v2.8b}, [x7], #16 + ld1 {v3.8b - v5.8b}, [x7] + + vp8_epel8_v4_y2 v1, v1, v2, v3, v4, v5 + + st1 {v1.d}[0], [x0], x1 + st1 {v1.d}[1], [x0], x1 + subs x4, x4, #2 + bne 2b + + add sp, sp, #168+16 + ret +endfunc diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c index 4e4012f..3c8d1c8 100644 --- a/libavcodec/vp8dsp.c +++ b/libavcodec/vp8dsp.c @@ -685,6 +685,8 @@ av_cold void ff_vp78dsp_init(VP8DSPContext *dsp) ff_vp78dsp_init_ppc(dsp); if (ARCH_X86) ff_vp78dsp_init_x86(dsp); + if (ARCH_AARCH64) + ff_vp78dsp_init_aarch64(dsp); } #if CONFIG_VP7_DECODER @@ -743,5 +745,7 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) ff_vp8dsp_init_arm(dsp); if (ARCH_X86) ff_vp8dsp_init_x86(dsp); + if (ARCH_AARCH64) + ff_vp8dsp_init_aarch64(dsp); } #endif /* CONFIG_VP8_DECODER */ diff --git a/libavcodec/vp8dsp.h b/libavcodec/vp8dsp.h index b248b86..65d8418 100644 --- a/libavcodec/vp8dsp.h +++ b/libavcodec/vp8dsp.h @@ -91,11 +91,13 @@ void ff_put_vp8_pixels4_c(uint8_t *dst, uint8_t *src, ptrdiff_t stride, void ff_vp7dsp_init(VP8DSPContext *c); void ff_vp78dsp_init(VP8DSPContext *c); +void ff_vp78dsp_init_aarch64(VP8DSPContext *c); void ff_vp78dsp_init_arm(VP8DSPContext *c); void ff_vp78dsp_init_ppc(VP8DSPContext *c); void ff_vp78dsp_init_x86(VP8DSPContext *c); void ff_vp8dsp_init(VP8DSPContext *c); +void ff_vp8dsp_init_aarch64(VP8DSPContext *c); void ff_vp8dsp_init_arm(VP8DSPContext *c); void ff_vp8dsp_init_x86(VP8DSPContext *c); From patchwork Fri Feb 1 09:12:41 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [02/19] aarch64: vp8: Fix the include guard X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64416 Message-Id: <1549012378-32118-2-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:41 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development From: Carl Eugen Hoyos --- libavcodec/aarch64/vp8dsp.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h index 8a0c8fb..40d0cae 100644 --- a/libavcodec/aarch64/vp8dsp.h +++ b/libavcodec/aarch64/vp8dsp.h @@ -16,8 +16,8 @@ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA */ -#ifndef AVCODEC_ARM_VP8DSP_H -#define AVCODEC_ARM_VP8DSP_H +#ifndef 
AVCODEC_AARCH64_VP8DSP_H +#define AVCODEC_AARCH64_VP8DSP_H #include "libavcodec/vp8dsp.h" @@ -67,4 +67,4 @@ VP8_MC(epel ## w ## _h4v6, opt); \ VP8_MC(epel ## w ## _h6v6, opt) -#endif /* AVCODEC_ARM_VP8DSP_H */ +#endif /* AVCODEC_AARCH64_VP8DSP_H */ From patchwork Fri Feb 1 09:12:42 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [03/19] aarch64: vp8: Fix assembling with clang X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64417 Message-Id: <1549012378-32118-3-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:42 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development This also partially fixes assembling with MS armasm64 (via gas-preprocessor). --- libavcodec/aarch64/vp8dsp_neon.S | 124 +++++++++++++++++++-------------------- 1 file changed, 62 insertions(+), 62 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 771877c..f371ea7 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -31,10 +31,10 @@ function ff_vp8_idct_add_neon, export=1 movk w4, #35468/2, lsl 16 dup v4.2s, w4 - smull v26.4s, v1.4h, v4.4h[0] - smull v27.4s, v3.4h, v4.4h[0] - sqdmulh v20.4h, v1.4h, v4.4h[1] - sqdmulh v23.4h, v3.4h, v4.4h[1] + smull v26.4s, v1.4h, v4.h[0] + smull v27.4s, v3.4h, v4.h[0] + sqdmulh v20.4h, v1.4h, v4.h[1] + sqdmulh v23.4h, v3.4h, v4.h[1] sqshrn v21.4h, v26.4s, #16 sqshrn v22.4h, v27.4s, #16 add v21.4h, v21.4h, v1.4h @@ -54,12 +54,12 @@ function ff_vp8_idct_add_neon, export=1 transpose_4x4H v0, v1, v2, v3, v24, v5, v6, v7 movi v29.8h, #0 - smull v26.4s, v1.4h, v4.4h[0] + smull v26.4s, v1.4h, v4.h[0] st1 {v29.8h}, [x1], #16 - smull v27.4s, v3.4h, v4.4h[0] + smull v27.4s, v3.4h, v4.h[0] st1 {v29.16b}, [x1] - sqdmulh v21.4h, v1.4h, v4.4h[1] - sqdmulh v23.4h, v3.4h, v4.4h[1] + sqdmulh v21.4h, v1.4h, v4.h[1] + sqdmulh v23.4h, v3.4h, v4.h[1] sqshrn v20.4h, v26.4s, #16 sqshrn v22.4h, v27.4s, #16 add v20.4h, v20.4h, v1.4h @@ -469,7 +469,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1 ld1 {v6.d}[1], [x0], x1 ld1 {v7.d}[1], [x0], x1 - transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 dup v22.16b, w2 // flim_E .if !\simple @@ -480,7 +480,7 @@ function ff_vp8_h_loop_filter16\name\()_neon, export=1 sub x0, x0, x1, lsl #4 // backup 16 rows - transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 // Store pixels: st1 {v0.d}[0], [x0], x1 @@ -531,7 +531,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 ld1 {v7.d}[0], [x0], x2 ld1 {v7.d}[1], [x1], x2 - transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 dup v22.16b, w3 // flim_E dup v23.16b, w4 // flim_I @@ -541,7 +541,7 @@ function ff_vp8_h_loop_filter8uv\name\()_neon, export=1 sub x0, x0, x2, lsl #3 // backup u 8 rows sub x1, x1, x2, lsl #3 // backup v 8 rows - transpose_8x16b v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 + transpose_8x16B v0, v1, v2, v3, v4, v5, v6, v7, v30, v31 // Store pixels: st1 {v0.d}[0], [x0], x2 // load u @@ -613,13 +613,13 @@ endfunc uxtl v22.8h, v24.8b ext v26.8b, \s0\().8b, \s1\().8b, #5 uxtl v25.8h, v25.8b - mul v21.8h, v21.8h, v0.8h[2] + mul v21.8h, v21.8h, v0.h[2] uxtl v26.8h, v26.8b - mul v22.8h, v22.8h, v0.8h[3] - mls v21.8h, v19.8h, v0.8h[1] - mls v22.8h, v25.8h, v0.8h[4] - mla v21.8h, v18.8h, v0.8h[0] - mla 
v22.8h, v26.8h, v0.8h[5] + mul v22.8h, v22.8h, v0.h[3] + mls v21.8h, v19.8h, v0.h[1] + mls v22.8h, v25.8h, v0.h[4] + mla v21.8h, v18.8h, v0.h[0] + mla v22.8h, v26.8h, v0.h[5] sqadd v22.8h, v21.8h, v22.8h sqrshrun \d\().8b, v22.8h, #7 .endm @@ -640,20 +640,20 @@ endfunc uxtl2 v2.8h, v2.16b uxtl v17.8h, v16.8b uxtl2 v16.8h, v16.16b - mul v19.8h, v19.8h, v0.8h[3] - mul v18.8h, v18.8h, v0.8h[2] - mul v3.8h, v3.8h, v0.8h[2] - mul v22.8h, v22.8h, v0.8h[3] - mls v19.8h, v20.8h, v0.8h[4] + mul v19.8h, v19.8h, v0.h[3] + mul v18.8h, v18.8h, v0.h[2] + mul v3.8h, v3.8h, v0.h[2] + mul v22.8h, v22.8h, v0.h[3] + mls v19.8h, v20.8h, v0.h[4] uxtl v20.8h, \v0\().8b uxtl2 v1.8h, \v0\().16b - mls v18.8h, v17.8h, v0.8h[1] - mls v3.8h, v16.8h, v0.8h[1] - mls v22.8h, v23.8h, v0.8h[4] - mla v18.8h, v20.8h, v0.8h[0] - mla v19.8h, v21.8h, v0.8h[5] - mla v3.8h, v1.8h, v0.8h[0] - mla v22.8h, v2.8h, v0.8h[5] + mls v18.8h, v17.8h, v0.h[1] + mls v3.8h, v16.8h, v0.h[1] + mls v22.8h, v23.8h, v0.h[4] + mla v18.8h, v20.8h, v0.h[0] + mla v19.8h, v21.8h, v0.h[5] + mla v3.8h, v1.8h, v0.h[0] + mla v22.8h, v2.8h, v0.h[5] sqadd v19.8h, v18.8h, v19.8h sqadd v22.8h, v3.8h, v22.8h sqrshrun \d0\().8b, v19.8h, #7 @@ -667,12 +667,12 @@ endfunc uxtl \s4\().8h, \s4\().8b uxtl \s0\().8h, \s0\().8b uxtl \s5\().8h, \s5\().8b - mul \s2\().8h, \s2\().8h, v0.8h[2] - mul \s3\().8h, \s3\().8h, v0.8h[3] - mls \s2\().8h, \s1\().8h, v0.8h[1] - mls \s3\().8h, \s4\().8h, v0.8h[4] - mla \s2\().8h, \s0\().8h, v0.8h[0] - mla \s3\().8h, \s5\().8h, v0.8h[5] + mul \s2\().8h, \s2\().8h, v0.h[2] + mul \s3\().8h, \s3\().8h, v0.h[3] + mls \s2\().8h, \s1\().8h, v0.h[1] + mls \s3\().8h, \s4\().8h, v0.h[4] + mla \s2\().8h, \s0\().8h, v0.h[0] + mla \s3\().8h, \s5\().8h, v0.h[5] sqadd \s3\().8h, \s2\().8h, \s3\().8h sqrshrun \d0\().8b, \s3\().8h, #7 .endm @@ -685,20 +685,20 @@ endfunc uxtl \s4\().8h, \s4\().8b uxtl \s2\().8h, \s2\().8b uxtl \s5\().8h, \s5\().8b - mul \s0\().8h, \s0\().8h, v0.8h[0] - mul v31.8h , \s3\().8h, v0.8h[3] - mul \s3\().8h, \s3\().8h, v0.8h[2] - mul \s6\().8h, \s6\().8h, v0.8h[5] - - mls \s0\().8h, \s1\().8h, v0.8h[1] - mls v31.8h , \s4\().8h, v0.8h[4] - mls \s3\().8h, \s2\().8h, v0.8h[1] - mls \s6\().8h, \s5\().8h, v0.8h[4] - - mla \s0\().8h, \s2\().8h, v0.8h[2] - mla v31.8h , \s5\().8h, v0.8h[5] - mla \s3\().8h, \s1\().8h, v0.8h[0] - mla \s6\().8h, \s4\().8h, v0.8h[3] + mul \s0\().8h, \s0\().8h, v0.h[0] + mul v31.8h , \s3\().8h, v0.h[3] + mul \s3\().8h, \s3\().8h, v0.h[2] + mul \s6\().8h, \s6\().8h, v0.h[5] + + mls \s0\().8h, \s1\().8h, v0.h[1] + mls v31.8h , \s4\().8h, v0.h[4] + mls \s3\().8h, \s2\().8h, v0.h[1] + mls \s6\().8h, \s5\().8h, v0.h[4] + + mla \s0\().8h, \s2\().8h, v0.h[2] + mla v31.8h , \s5\().8h, v0.h[5] + mla \s3\().8h, \s1\().8h, v0.h[0] + mla \s6\().8h, \s4\().8h, v0.h[3] sqadd v31.8h , \s0\().8h, v31.8h sqadd \s6\().8h, \s3\().8h, \s6\().8h sqrshrun \d0\().8b, v31.8h, #7 @@ -713,10 +713,10 @@ endfunc ext v25.8b, \v0\().8b, \v1\().8b, #3 uxtl v22.8h, v23.8b uxtl v25.8h, v25.8b - mul v20.8h, v20.8h, v0.8h[2] - mul v22.8h, v22.8h, v0.8h[3] - mls v20.8h, v19.8h, v0.8h[1] - mls v22.8h, v25.8h, v0.8h[4] + mul v20.8h, v20.8h, v0.h[2] + mul v22.8h, v22.8h, v0.h[3] + mls v20.8h, v19.8h, v0.h[1] + mls v22.8h, v25.8h, v0.h[4] sqadd v22.8h, v20.8h, v22.8h sqrshrun \d\().8b, v22.8h, #7 .endm @@ -727,14 +727,14 @@ endfunc uxtl \s2\().8h, \s2\().8b uxtl \s3\().8h, \s3\().8b uxtl \s4\().8h, \s4\().8b - mul v21.8h, \s1\().8h, v0.8h[2] - mul v23.8h, \s2\().8h, v0.8h[3] - mul \s2\().8h, \s2\().8h, v0.8h[2] - mul v22.8h, \s3\().8h, 
v0.8h[3] - mls v21.8h, \s0\().8h, v0.8h[1] - mls v23.8h, \s3\().8h, v0.8h[4] - mls \s2\().8h, \s1\().8h, v0.8h[1] - mls v22.8h, \s4\().8h, v0.8h[4] + mul v21.8h, \s1\().8h, v0.h[2] + mul v23.8h, \s2\().8h, v0.h[3] + mul \s2\().8h, \s2\().8h, v0.h[2] + mul v22.8h, \s3\().8h, v0.h[3] + mls v21.8h, \s0\().8h, v0.h[1] + mls v23.8h, \s3\().8h, v0.h[4] + mls \s2\().8h, \s1\().8h, v0.h[1] + mls v22.8h, \s4\().8h, v0.h[4] sqadd v21.8h, v21.8h, v23.8h sqadd \s2\().8h, \s2\().8h, v22.8h sqrshrun \d0\().8b, v21.8h, #7 From patchwork Fri Feb 1 09:12:43 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [04/19] aarch64: vp8: Fix assembling with armasm64 X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64418 Message-Id: <1549012378-32118-4-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:43 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development --- libavcodec/aarch64/vp8dsp_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index f371ea7..14a9d11 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -28,7 +28,7 @@ function ff_vp8_idct_add_neon, export=1 ld1 {v0.8b - v3.8b}, [x1] mov w4, #20091 - movk w4, #35468/2, lsl 16 + movk w4, #35468/2, lsl #16 dup v4.2s, w4 smull v26.4s, v1.4h, v4.h[0] From patchwork Fri Feb 1 09:12:44 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [05/19] aarch64: vp8: Fix linking for iOS X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64419 Message-Id: <1549012378-32118-5-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:44 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development The mach-o relocations don't allow a negative offset to a symbol; use the third movrel parameter to handle this issue transparently. 
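For reference, with the offset passed as a separate parameter, the movrel macro (from libavutil/aarch64/asm.S) can apply it outside of the relocation. A simplified sketch of what "movrel x17, subpel_filters, -16" can expand to on Mach-O targets (the actual macro expansion may differ):

    adrp    x17, subpel_filters@PAGE           // page address of the symbol
    add     x17, x17, subpel_filters@PAGEOFF   // low 12 bits of the address
    sub     x17, x17, #16                      // offset applied as a plain
                                               // subtraction, so no negative
                                               // addend in the relocation

Writing subpel_filters-16 directly in the source would instead require encoding the negative offset into the relocation itself, which Mach-O rejects.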
--- libavcodec/aarch64/vp8dsp_neon.S | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 14a9d11..eb22c42 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -759,7 +759,7 @@ function ff_put_vp8_epel16_v6_neon, export=1 sxtw x4, w4 sxtw x6, w6 - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 add x6, x17, x6, lsl #4 // y ld1 {v0.8h}, [x6] 1: @@ -788,7 +788,7 @@ function ff_put_vp8_epel16_h6_neon, export=1 sxtw x5, w5 // x // first pass (horizontal): - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 add x5, x17, x5, lsl #4 // x ld1 {v0.8h}, [x5] 1: @@ -807,7 +807,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 sub x2, x2, #2 // first pass (horizontal): - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 sxtw x5, w5 // x add x16, x17, x5, lsl #4 // x sub sp, sp, #336+16 @@ -854,7 +854,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1 sxtw x4, w4 // first pass (horizontal): - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 sxtw x5, w5 add x5, x17, x5, lsl #4 // x sub sp, sp, #168+16 @@ -900,7 +900,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1 sxtw x4, w4 // first pass (horizontal): - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 sxtw x5, w5 add x5, x17, x5, lsl #4 // x sub sp, sp, #168+16 @@ -947,7 +947,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1 // first pass (horizontal): - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 sxtw x5, w5 add x5, x17, x5, lsl #4 // x sub sp, sp, #168+16 @@ -992,7 +992,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 // first pass (horizontal): - movrel x17, subpel_filters-16 + movrel x17, subpel_filters, -16 sxtw x5, w5 add x5, x17, x5, lsl #4 // x sub sp, sp, #168+16 From patchwork Fri Feb 1 09:12:45 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [06/19] aarch64: vp8: Use the proper aarch64 form for conditional branches X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64421 Message-Id: <1549012378-32118-6-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:45 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development The previous form also does seem to assemble on current tools, but I think it might fail on some older aarch64 tools. 
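For example, the loop branches now read:

    subs    w4, w4, #4          // decrement the row counter, set flags
    b.gt    1b                  // canonical A64 conditional branch form

The dotless "bgt 1b" spelling is an alias that current assemblers happen to accept, but it is not guaranteed to be understood everywhere.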
--- libavcodec/aarch64/vp8dsp_neon.S | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index eb22c42..c19ab0d 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -581,7 +581,7 @@ function ff_put_vp8_pixels16_neon, export=1 st1 {v1.16b}, [x0], x1 st1 {v2.16b}, [x0], x1 st1 {v3.16b}, [x0], x1 - bgt 1b + b.gt 1b ret endfunc @@ -596,7 +596,7 @@ function ff_put_vp8_pixels8_neon, export=1 st1 {v0.d}[1], [x0], x1 st1 {v1.8b}, [x0], x1 st1 {v1.d}[1], [x0], x1 - bgt 1b + b.gt 1b ret endfunc @@ -778,7 +778,7 @@ function ff_put_vp8_epel16_v6_neon, export=1 st1 {v1.1d - v2.1d}, [x0], x1 st1 {v3.1d - v4.1d}, [x0], x1 subs x4, x4, #2 - bne 1b + b.ne 1b ret endfunc @@ -797,7 +797,7 @@ function ff_put_vp8_epel16_h6_neon, export=1 st1 {v1.16b}, [x0], x1 subs w4, w4, #1 - bne 1b + b.ne 1b ret endfunc @@ -821,7 +821,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 vp8_epel16_h6 v1, v1, v2 st1 {v1.16b}, [x7], #16 subs x16, x16, #1 - bne 1b + b.ne 1b // second pass (vertical): @@ -842,7 +842,7 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 st1 {v2.16b}, [x0], x1 subs x4, x4, #1 - bne 2b + b.ne 2b add sp, sp, #336+16 ret @@ -869,7 +869,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1 st1 {v1.8b}, [x7], #8 subs x16, x16, #1 - bne 1b + b.ne 1b // second pass (vertical): sxtw x6, w6 @@ -888,7 +888,7 @@ function ff_put_vp8_epel8_h6v6_neon, export=1 st1 {v1.8b}, [x0], x1 st1 {v2.8b}, [x0], x1 subs x4, x4, #2 - bne 2b + b.ne 2b add sp, sp, #168+16 ret @@ -915,7 +915,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1 st1 {v1.8b}, [x7], #8 subs x16, x16, #1 - bne 1b + b.ne 1b // second pass (vertical): sxtw x6, w6 @@ -934,7 +934,7 @@ function ff_put_vp8_epel8_h4v6_neon, export=1 st1 {v1.8b}, [x0], x1 st1 {v2.8b}, [x0], x1 subs x4, x4, #2 - bne 2b + b.ne 2b add sp, sp, #168+16 ret @@ -962,7 +962,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1 st1 {v1.8b}, [x7], #8 subs x16, x16, #1 - bne 1b + b.ne 1b // second pass (vertical): sxtw x6, w6 @@ -979,7 +979,7 @@ function ff_put_vp8_epel8_h4v4_neon, export=1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 subs x4, x4, #2 - bne 2b + b.ne 2b add sp, sp, #168+16 ret @@ -1007,7 +1007,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 st1 {v1.8b}, [x7], #8 subs x16, x16, #1 - bne 1b + b.ne 1b // second pass (vertical): sxtw x6, w6 @@ -1024,7 +1024,7 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 st1 {v1.d}[0], [x0], x1 st1 {v1.d}[1], [x0], x1 subs x4, x4, #2 - bne 2b + b.ne 2b add sp, sp, #168+16 ret From patchwork Fri Feb 1 09:12:46 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [07/19] vp8dsp: Move the aarch64 dsp init call into alphabetical order X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64422 Message-Id: <1549012378-32118-7-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:46 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development --- libavcodec/vp8dsp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/vp8dsp.c b/libavcodec/vp8dsp.c index 3c8d1c8..ac9a6af 100644 --- a/libavcodec/vp8dsp.c +++ b/libavcodec/vp8dsp.c @@ -679,14 +679,14 @@ av_cold void ff_vp78dsp_init(VP8DSPContext *dsp) VP78_BILINEAR_MC_FUNC(1, 8); VP78_BILINEAR_MC_FUNC(2, 4); + if (ARCH_AARCH64) + ff_vp78dsp_init_aarch64(dsp); if (ARCH_ARM) ff_vp78dsp_init_arm(dsp); if 
(ARCH_PPC) ff_vp78dsp_init_ppc(dsp); if (ARCH_X86) ff_vp78dsp_init_x86(dsp); - if (ARCH_AARCH64) - ff_vp78dsp_init_aarch64(dsp); } #if CONFIG_VP7_DECODER @@ -741,11 +741,11 @@ av_cold void ff_vp8dsp_init(VP8DSPContext *dsp) dsp->vp8_v_loop_filter_simple = vp8_v_loop_filter_simple_c; dsp->vp8_h_loop_filter_simple = vp8_h_loop_filter_simple_c; + if (ARCH_AARCH64) + ff_vp8dsp_init_aarch64(dsp); if (ARCH_ARM) ff_vp8dsp_init_arm(dsp); if (ARCH_X86) ff_vp8dsp_init_x86(dsp); - if (ARCH_AARCH64) - ff_vp8dsp_init_aarch64(dsp); } #endif /* CONFIG_VP8_DECODER */ From patchwork Fri Feb 1 09:12:47 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [08/19] aarch64: vp8: Remove superfluous includes X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64423 Message-Id: <1549012378-32118-8-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:47 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development --- libavcodec/aarch64/vp8dsp_init_aarch64.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index f93bcfa..3fb254a 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -17,10 +17,6 @@ */ #include -#include -#include -#include -#include #include "libavutil/attributes.h" #include "libavutil/aarch64/cpu.h" From patchwork Fri Feb 1 09:12:48 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [09/19] aarch64: vp8: Move the vp8dsp makefile entries to the right places X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64424 Message-Id: <1549012378-32118-9-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:48 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development Even if NEON would be disabled, the init functions should be built as they are called as long as ARCH_AARCH64 is set. These functions are part of a generic DSP subsytem, not tied directly to one decoder. (They should be built if the vp7 decoder is enabled, even if the vp8 decoder is disabled.) 
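The resulting layout keeps the C init code unconditional for the architecture and leaves only the assembly guarded by NEON:

    OBJS-$(CONFIG_VP8DSP)      += aarch64/vp8dsp_init_aarch64.o
    NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o

The init functions themselves check have_neon(av_get_cpu_flags()) at runtime, so calling them is safe even when NEON is unavailable.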
--- libavcodec/aarch64/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libavcodec/aarch64/Makefile b/libavcodec/aarch64/Makefile index 2555044..7228eae 100644 --- a/libavcodec/aarch64/Makefile +++ b/libavcodec/aarch64/Makefile @@ -11,6 +11,7 @@ OBJS-$(CONFIG_MDCT) += aarch64/mdct_init.o OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_init.o OBJS-$(CONFIG_NEON_CLOBBER_TEST) += aarch64/neontest.o OBJS-$(CONFIG_VIDEODSP) += aarch64/videodsp_init.o +OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o # decoders/encoders OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_init.o @@ -39,13 +40,12 @@ NEON-OBJS-$(CONFIG_HPELDSP) += aarch64/hpeldsp_neon.o NEON-OBJS-$(CONFIG_IMDCT15) += aarch64/imdct15_neon.o NEON-OBJS-$(CONFIG_MDCT) += aarch64/mdct_neon.o NEON-OBJS-$(CONFIG_MPEGAUDIODSP) += aarch64/mpegaudiodsp_neon.o +NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_neon.o # decoders/encoders NEON-OBJS-$(CONFIG_DCA_DECODER) += aarch64/dcadsp_neon.o \ aarch64/synth_filter_neon.o NEON-OBJS-$(CONFIG_VORBIS_DECODER) += aarch64/vorbisdsp_neon.o -NEON-OBJS-$(CONFIG_VP8DSP) += aarch64/vp8dsp_init_aarch64.o \ - aarch64/vp8dsp_neon.o NEON-OBJS-$(CONFIG_VP9_DECODER) += aarch64/vp9itxfm_neon.o \ aarch64/vp9lpf_neon.o \ aarch64/vp9mc_neon.o From patchwork Fri Feb 1 09:12:49 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [10/19] aarch64: vp8: Reorder the function pointer inits to match the arm original X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64426 Message-Id: <1549012378-32118-10-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:49 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development --- libavcodec/aarch64/vp8dsp_init_aarch64.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index 3fb254a..da54efd 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -46,10 +46,10 @@ av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; - dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; - dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; - dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; + dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; } av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) @@ -62,8 +62,8 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; - dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; + dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; dsp->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_neon; dsp->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_neon; From patchwork Fri Feb 1 09:12:50 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [11/19] aarch64: vp8: Fix a typo in a comment X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= 
X-Patchwork-Id: 64425 Message-Id: <1549012378-32118-11-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:50 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development --- libavcodec/aarch64/vp8dsp_neon.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index c19ab0d..2b5b049 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -743,7 +743,7 @@ endfunc // note: worst case sum of all 6-tap filter values * 255 is 0x7f80 so 16 bit -// arithmatic can be used to apply filters +// arithmetic can be used to apply filters const subpel_filters, align=4 .short 0, 6, 123, 12, 1, 0, 0, 0 .short 2, 11, 108, 36, 8, 1, 0, 0 From patchwork Fri Feb 1 09:12:51 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Subject: [12/19] aarch64: vp8: Port vp8_luma_dc_wht and vp8_idct_dc_add4uv from arm version X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64429 Message-Id: <1549012378-32118-12-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:51 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development Cortex A53 A72 A73 vp8_luma_dc_wht_c: 115.7 75.7 90.7 vp8_luma_dc_wht_neon: 60.7 41.2 45.7 vp8_idct_dc_add4uv_c: 376.1 262.9 282.5 vp8_idct_dc_add4uv_neon: 52.0 29.0 37.0 --- libavcodec/aarch64/vp8dsp_init_aarch64.c | 3 + libavcodec/aarch64/vp8dsp_neon.S | 109 +++++++++++++++++++++++++++++++ 2 files changed, 112 insertions(+) diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index da54efd..8f060dc 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -28,6 +28,7 @@ void ff_vp8_luma_dc_wht_neon(int16_t block[4][4][16], int16_t dc[16]); void ff_vp8_idct_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp8_idct_dc_add_neon(uint8_t *dst, int16_t block[16], ptrdiff_t stride); void ff_vp8_idct_dc_add4y_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); +void ff_vp8_idct_dc_add4uv_neon(uint8_t *dst, int16_t block[4][16], ptrdiff_t stride); VP8_LF(neon); @@ -57,10 +58,12 @@ av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) if (!have_neon(av_get_cpu_flags())) { return; } + dsp->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_neon; dsp->vp8_idct_add = ff_vp8_idct_add_neon; dsp->vp8_idct_dc_add = ff_vp8_idct_dc_add_neon; dsp->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_neon; + dsp->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_neon; dsp->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16_neon; dsp->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16_neon; diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 2b5b049..4ea62c0 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -4,6 +4,7 @@ * Copyright (c) 2010 Rob Clark * Copyright (c) 2011 Mans Rullgard * Copyright (c) 2018 Magnus Röös + * Copyright (c) 2019 Martin Storsjo * * This file is part of Libav. 
* @@ -25,6 +26,62 @@ #include "libavutil/aarch64/asm.S" #include "neon.S" +function ff_vp8_luma_dc_wht_neon, export=1 + ld1 {v0.4h - v3.4h}, [x1] + movi v30.8h, #0 + + add v4.4h, v0.4h, v3.4h + add v6.4h, v1.4h, v2.4h + st1 {v30.8h}, [x1], #16 + sub v7.4h, v1.4h, v2.4h + sub v5.4h, v0.4h, v3.4h + st1 {v30.8h}, [x1] + add v0.4h, v4.4h, v6.4h + add v1.4h, v5.4h, v7.4h + sub v2.4h, v4.4h, v6.4h + sub v3.4h, v5.4h, v7.4h + + movi v16.4h, #3 + + transpose_4x4H v0, v1, v2, v3, v4, v5, v6, v7 + + add v0.4h, v0.4h, v16.4h + + add v4.4h, v0.4h, v3.4h + add v6.4h, v1.4h, v2.4h + sub v7.4h, v1.4h, v2.4h + sub v5.4h, v0.4h, v3.4h + add v0.4h, v4.4h, v6.4h + add v1.4h, v5.4h, v7.4h + sub v2.4h, v4.4h, v6.4h + sub v3.4h, v5.4h, v7.4h + + sshr v0.4h, v0.4h, #3 + sshr v1.4h, v1.4h, #3 + sshr v2.4h, v2.4h, #3 + sshr v3.4h, v3.4h, #3 + + mov x3, #32 + st1 {v0.h}[0], [x0], x3 + st1 {v1.h}[0], [x0], x3 + st1 {v2.h}[0], [x0], x3 + st1 {v3.h}[0], [x0], x3 + st1 {v0.h}[1], [x0], x3 + st1 {v1.h}[1], [x0], x3 + st1 {v2.h}[1], [x0], x3 + st1 {v3.h}[1], [x0], x3 + st1 {v0.h}[2], [x0], x3 + st1 {v1.h}[2], [x0], x3 + st1 {v2.h}[2], [x0], x3 + st1 {v3.h}[2], [x0], x3 + st1 {v0.h}[3], [x0], x3 + st1 {v1.h}[3], [x0], x3 + st1 {v2.h}[3], [x0], x3 + st1 {v3.h}[3], [x0], x3 + + ret +endfunc + function ff_vp8_idct_add_neon, export=1 ld1 {v0.8b - v3.8b}, [x1] mov w4, #20091 @@ -102,6 +159,58 @@ function ff_vp8_idct_add_neon, export=1 ret endfunc +function ff_vp8_idct_dc_add4uv_neon, export=1 + movi v0.4h, #0 + mov x3, #32 + ld1r {v16.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v17.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v18.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ld1r {v19.4h}, [x1] + st1 {v0.h}[0], [x1], x3 + ins v16.d[1], v17.d[0] + ins v18.d[1], v19.d[0] + mov x3, x0 + srshr v16.8h, v16.8h, #3 // dc >>= 3 + ld1 {v0.8b}, [x0], x2 + srshr v18.8h, v18.8h, #3 + ld1 {v1.8b}, [x0], x2 + uaddw v20.8h, v16.8h, v0.8b + ld1 {v2.8b}, [x0], x2 + uaddw v0.8h, v16.8h, v1.8b + ld1 {v3.8b}, [x0], x2 + uaddw v22.8h, v16.8h, v2.8b + ld1 {v4.8b}, [x0], x2 + uaddw v2.8h, v16.8h, v3.8b + ld1 {v5.8b}, [x0], x2 + uaddw v24.8h, v18.8h, v4.8b + ld1 {v6.8b}, [x0], x2 + uaddw v4.8h, v18.8h, v5.8b + ld1 {v7.8b}, [x0], x2 + uaddw v26.8h, v18.8h, v6.8b + sqxtun v20.8b, v20.8h + uaddw v6.8h, v18.8h, v7.8b + sqxtun v21.8b, v0.8h + sqxtun v22.8b, v22.8h + st1 {v20.8b}, [x3], x2 + sqxtun v23.8b, v2.8h + st1 {v21.8b}, [x3], x2 + sqxtun v24.8b, v24.8h + st1 {v22.8b}, [x3], x2 + sqxtun v25.8b, v4.8h + st1 {v23.8b}, [x3], x2 + sqxtun v26.8b, v26.8h + st1 {v24.8b}, [x3], x2 + sqxtun v27.8b, v6.8h + st1 {v25.8b}, [x3], x2 + st1 {v26.8b}, [x3], x2 + st1 {v27.8b}, [x3], x2 + + ret +endfunc + function ff_vp8_idct_dc_add4y_neon, export=1 movi v0.16b, #0 mov x3, #32 From patchwork Fri Feb 1 09:12:52 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [13/19] aarch64: vp8: Port missing epel8 functions from arm version X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64427 Message-Id: <1549012378-32118-13-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:52 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development Cortex A53 A72 A73 vp8_put_epel8_h4_c: 2594.8 1159.6 1374.8 vp8_put_epel8_h4_neon: 506.4 244.2 314.0 vp8_put_epel8_h6_c: 3445.8 1677.1 1811.3 vp8_put_epel8_h6_neon: 634.4 371.7 433.0 vp8_put_epel8_v4_c: 2614.0 1174.8 1378.0 vp8_put_epel8_v4_neon: 321.0 221.7 235.8 vp8_put_epel8_v6_c: 3635.5 1703.0 2079.2 vp8_put_epel8_v6_neon: 416.9 
317.0 295.5 --- libavcodec/aarch64/vp8dsp_init_aarch64.c | 4 ++ libavcodec/aarch64/vp8dsp_neon.S | 87 ++++++++++++++++++++++++++++++++ 2 files changed, 91 insertions(+) diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index 8f060dc..1878d8e 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -47,8 +47,12 @@ av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) dsp->put_vp8_epel_pixels_tab[0][2][2] = ff_put_vp8_epel16_h6v6_neon; dsp->put_vp8_epel_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_epel_pixels_tab[1][0][1] = ff_put_vp8_epel8_h4_neon; + dsp->put_vp8_epel_pixels_tab[1][0][2] = ff_put_vp8_epel8_h6_neon; + dsp->put_vp8_epel_pixels_tab[1][1][0] = ff_put_vp8_epel8_v4_neon; dsp->put_vp8_epel_pixels_tab[1][1][1] = ff_put_vp8_epel8_h4v4_neon; dsp->put_vp8_epel_pixels_tab[1][1][2] = ff_put_vp8_epel8_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon; dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; } diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 4ea62c0..c5badc4 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -957,6 +957,51 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 ret endfunc +function ff_put_vp8_epel8_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x2], x3 + ld1 {v6.8b}, [x2], x3 + ld1 {v7.8b}, [x2], x3 + ld1 {v28.8b}, [x2] + + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.8b}, [x0], x1 + st1 {v3.8b}, [x0], x1 + subs w4, w4, #2 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel8_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b, v3.8b}, [x2], x3 + + vp8_epel8_h6 v2, v2, v3 + + st1 {v2.8b}, [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + function ff_put_vp8_epel8_h6v6_neon, export=1 sub x2, x2, x3, lsl #1 sub x2, x2, #2 @@ -1003,6 +1048,48 @@ function ff_put_vp8_epel8_h6v6_neon, export=1 ret endfunc +function ff_put_vp8_epel8_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1 {v2.8b}, [x2], x3 + ld1 {v3.8b}, [x2], x3 + ld1 {v4.8b}, [x2], x3 + ld1 {v5.8b}, [x2], x3 + ld1 {v6.8b}, [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.d}[0], [x0], x1 + st1 {v2.d}[1], [x0], x1 + subs w4, w4, #2 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel8_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + + vp8_epel8_h4 v2, v2, v3 + + st1 {v2.8b}, [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + function ff_put_vp8_epel8_h4v6_neon, export=1 sub x2, x2, x3, lsl #1 sub x2, x2, #1 From patchwork Fri Feb 1 09:12:53 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [14/19] aarch64: vp8: Port epel4 functions from arm version X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64428 Message-Id: <1549012378-32118-14-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:53 
+0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development Cortex A53 A72 A73 vp8_put_epel4_h4_c: 631.4 291.7 367.8 vp8_put_epel4_h4_neon: 241.0 131.0 155.7 vp8_put_epel4_h4v4_c: 967.5 529.3 667.7 vp8_put_epel4_h4v4_neon: 429.3 241.8 279.7 vp8_put_epel4_h4v6_c: 1374.7 657.5 864.5 vp8_put_epel4_h4v6_neon: 515.5 295.5 334.7 vp8_put_epel4_h6_c: 851.0 421.0 486.0 vp8_put_epel4_h6_neon: 321.5 195.0 217.7 vp8_put_epel4_h6v4_c: 1111.3 621.1 781.2 vp8_put_epel4_h6v4_neon: 539.2 328.0 365.3 vp8_put_epel4_h6v6_c: 1561.3 763.3 999.7 vp8_put_epel4_h6v6_neon: 645.5 401.0 434.7 vp8_put_epel4_v4_c: 663.8 298.3 357.0 vp8_put_epel4_v4_neon: 116.0 81.5 72.5 vp8_put_epel4_v6_c: 870.5 437.0 507.4 vp8_put_epel4_v6_neon: 147.7 108.8 92.0 --- libavcodec/aarch64/vp8dsp_init_aarch64.c | 10 ++ libavcodec/aarch64/vp8dsp_neon.S | 284 +++++++++++++++++++++++++++++++ 2 files changed, 294 insertions(+) diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index 1878d8e..478f849 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -34,6 +34,7 @@ VP8_LF(neon); VP8_EPEL(16, neon); VP8_EPEL(8, neon); +VP8_EPEL(4, neon); av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) @@ -55,6 +56,15 @@ av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) dsp->put_vp8_epel_pixels_tab[1][2][0] = ff_put_vp8_epel8_v6_neon; dsp->put_vp8_epel_pixels_tab[1][2][1] = ff_put_vp8_epel8_h4v6_neon; dsp->put_vp8_epel_pixels_tab[1][2][2] = ff_put_vp8_epel8_h6v6_neon; + + dsp->put_vp8_epel_pixels_tab[2][0][1] = ff_put_vp8_epel4_h4_neon; + dsp->put_vp8_epel_pixels_tab[2][0][2] = ff_put_vp8_epel4_h6_neon; + dsp->put_vp8_epel_pixels_tab[2][1][0] = ff_put_vp8_epel4_v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][1] = ff_put_vp8_epel4_h4v4_neon; + dsp->put_vp8_epel_pixels_tab[2][1][2] = ff_put_vp8_epel4_h6v4_neon; + dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; + dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; } av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index c5badc4..7fe2466 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -1225,3 +1225,287 @@ function ff_put_vp8_epel8_h6v4_neon, export=1 add sp, sp, #168+16 ret endfunc + +function ff_put_vp8_epel4_v6_neon, export=1 + sub x2, x2, x3, lsl #1 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2], x3 + ld1r {v7.2s}, [x2], x3 + ld1r {v28.2s}, [x2] + sub x2, x2, x3, lsl #2 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2], x3 + ld1 {v7.s}[1], [x2], x3 + ld1 {v28.s}[1], [x2] + sub x2, x2, x3, lsl #2 + + vp8_epel8_v6_y2 v2, v3, v2, v3, v4, v5, v6, v7, v28 + + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h6_neon, export=1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h6v6_neon, export=1 + sub x2, x2, x3, 
lsl #1 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc + +function ff_put_vp8_epel4_h4v6_neon, export=1 + sub x2, x2, x3, lsl #1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #52 + add w8, w4, #5 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1 {v6.8b}, [x9], #8 + ld1r {v28.2s}, [x9] + sub x9, x9, #16 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v7.8b}, [x9], #8 + ld1 {v28.s}[1], [x9] + sub x9, x9, #16 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + trn1 v3.2s, v6.2s, v7.2s + trn2 v7.2s, v6.2s, v7.2s + vp8_epel8_v6_y2 v2, v3, v1, v4, v2, v5, v3, v7, v28 + st1 {v2.s}[0], [x0], x1 + st1 {v3.s}[0], [x0], x1 + st1 {v2.s}[1], [x0], x1 + st1 {v3.s}[1], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #52 + ret +endfunc + +function ff_put_vp8_epel4_h6v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #2 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b,v3.8b}, [x2], x3 + vp8_epel8_h6 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc + +function ff_put_vp8_epel4_h4_neon, export=1 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v2 + st1 {v2.s}[0], [x0], x1 + subs w4, w4, #1 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_v4_neon, export=1 + sub x2, x2, x3 + + movrel x7, subpel_filters, -16 + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] +1: + ld1r {v2.2s}, [x2], x3 + ld1r {v3.2s}, [x2], x3 + ld1r {v4.2s}, [x2], x3 + ld1r {v5.2s}, [x2], x3 + ld1r {v6.2s}, [x2] + sub x2, x2, x3, lsl #1 + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + ld1 {v4.s}[1], [x2], x3 + ld1 {v5.s}[1], [x2], x3 + ld1 {v6.s}[1], [x2] + sub x2, x2, x3, lsl #1 + + vp8_epel8_v4_y2 v2, v2, v3, v4, v5, v6 + + st1 {v2.s}[0], [x0], x1 + st1 {v2.s}[2], [x0], x1 + st1 {v2.s}[1], 
[x0], x1 + st1 {v2.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 1b + + ret +endfunc + +function ff_put_vp8_epel4_h4v4_neon, export=1 + sub x2, x2, x3 + sub x2, x2, #1 + + movrel x7, subpel_filters, -16 + add x5, x7, w5, uxtw #4 + ld1 {v0.8h}, [x5] + + sub sp, sp, #44 + add w8, w4, #3 + mov x9, sp +1: + ld1 {v2.8b}, [x2], x3 + vp8_epel8_h4 v2, v2, v3 + st1 {v2.s}[0], [x9], #4 + subs w8, w8, #1 + b.ne 1b + + add x6, x7, w6, uxtw #4 + ld1 {v0.8h}, [x6] + mov x9, sp +2: + ld1 {v2.8b,v3.8b}, [x9], #16 + ld1r {v6.2s}, [x9] + sub x9, x9, #8 + ld1 {v4.8b,v5.8b}, [x9], #16 + ld1 {v6.s}[1], [x9] + sub x9, x9, #8 + trn1 v1.2s, v2.2s, v4.2s + trn2 v4.2s, v2.2s, v4.2s + trn1 v2.2s, v3.2s, v5.2s + trn2 v5.2s, v3.2s, v5.2s + vp8_epel8_v4_y2 v1, v1, v4, v2, v5, v6 + st1 {v1.s}[0], [x0], x1 + st1 {v1.s}[2], [x0], x1 + st1 {v1.s}[1], [x0], x1 + st1 {v1.s}[3], [x0], x1 + subs w4, w4, #4 + b.ne 2b + + add sp, sp, #44 + ret +endfunc From patchwork Fri Feb 1 09:12:54 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [15/19] aarch64: vp8: Port bilin functions from arm version X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64432 Message-Id: <1549012378-32118-15-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:54 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development Cortex A53 A72 A73 vp8_put_bilin4_h_c: 303.8 102.2 161.8 vp8_put_bilin4_h_neon: 100.0 40.9 41.2 vp8_put_bilin4_hv_c: 322.8 201.0 305.9 vp8_put_bilin4_hv_neon: 156.8 72.6 77.0 vp8_put_bilin4_v_c: 304.7 101.7 166.5 vp8_put_bilin4_v_neon: 82.7 41.2 33.0 vp8_put_bilin8_h_c: 1192.7 352.5 623.8 vp8_put_bilin8_h_neon: 213.5 70.2 87.8 vp8_put_bilin8_hv_c: 1098.6 769.2 1041.9 vp8_put_bilin8_hv_neon: 324.0 123.5 146.0 vp8_put_bilin8_v_c: 1193.9 350.4 617.7 vp8_put_bilin8_v_neon: 183.9 60.7 64.7 vp8_put_bilin16_h_c: 2353.1 671.2 1223.3 vp8_put_bilin16_h_neon: 261.9 140.7 145.0 vp8_put_bilin16_hv_c: 2453.2 1470.9 2355.2 vp8_put_bilin16_hv_neon: 383.9 196.0 217.0 vp8_put_bilin16_v_c: 2349.3 669.8 1251.2 vp8_put_bilin16_v_neon: 202.9 110.7 96.2 --- libavcodec/aarch64/vp8dsp.h | 5 + libavcodec/aarch64/vp8dsp_init_aarch64.c | 32 ++++ libavcodec/aarch64/vp8dsp_neon.S | 292 +++++++++++++++++++++++++++++++ 3 files changed, 329 insertions(+) diff --git a/libavcodec/aarch64/vp8dsp.h b/libavcodec/aarch64/vp8dsp.h index 40d0cae..616252e 100644 --- a/libavcodec/aarch64/vp8dsp.h +++ b/libavcodec/aarch64/vp8dsp.h @@ -67,4 +67,9 @@ VP8_MC(epel ## w ## _h4v6, opt); \ VP8_MC(epel ## w ## _h6v6, opt) +#define VP8_BILIN(w, opt) \ + VP8_MC(bilin ## w ## _h, opt); \ + VP8_MC(bilin ## w ## _v, opt); \ + VP8_MC(bilin ## w ## _hv, opt) + #endif /* AVCODEC_AARCH64_VP8DSP_H */ diff --git a/libavcodec/aarch64/vp8dsp_init_aarch64.c b/libavcodec/aarch64/vp8dsp_init_aarch64.c index 478f849..53fbfcd 100644 --- a/libavcodec/aarch64/vp8dsp_init_aarch64.c +++ b/libavcodec/aarch64/vp8dsp_init_aarch64.c @@ -36,6 +36,9 @@ VP8_EPEL(16, neon); VP8_EPEL(8, neon); VP8_EPEL(4, neon); +VP8_BILIN(16, neon); +VP8_BILIN(8, neon); +VP8_BILIN(4, neon); av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) { @@ -65,6 +68,35 @@ av_cold void ff_vp78dsp_init_aarch64(VP8DSPContext *dsp) dsp->put_vp8_epel_pixels_tab[2][2][0] = ff_put_vp8_epel4_v6_neon; dsp->put_vp8_epel_pixels_tab[2][2][1] = ff_put_vp8_epel4_h4v6_neon; dsp->put_vp8_epel_pixels_tab[2][2][2] = ff_put_vp8_epel4_h6v6_neon; + + dsp->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_neon; + 
dsp->put_vp8_bilinear_pixels_tab[0][0][1] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][0][2] = ff_put_vp8_bilin16_h_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][1][2] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][0] = ff_put_vp8_bilin16_v_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][1] = ff_put_vp8_bilin16_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[0][2][2] = ff_put_vp8_bilin16_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][1] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][0][2] = ff_put_vp8_bilin8_h_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][1][2] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][0] = ff_put_vp8_bilin8_v_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][1] = ff_put_vp8_bilin8_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[1][2][2] = ff_put_vp8_bilin8_hv_neon; + + dsp->put_vp8_bilinear_pixels_tab[2][0][1] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][0][2] = ff_put_vp8_bilin4_h_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][1][2] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][0] = ff_put_vp8_bilin4_v_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][1] = ff_put_vp8_bilin4_hv_neon; + dsp->put_vp8_bilinear_pixels_tab[2][2][2] = ff_put_vp8_bilin4_hv_neon; } av_cold void ff_vp8dsp_init_aarch64(VP8DSPContext *dsp) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 7fe2466..604be8a 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -1509,3 +1509,295 @@ function ff_put_vp8_epel4_h4v4_neon, export=1 add sp, sp, #44 ret endfunc + +/* Bilinear MC */ + +function ff_put_vp8_bilin16_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b,v4.8b}, [x2], x3 + ext v5.8b, v3.8b, v4.8b, #1 + ext v4.8b, v2.8b, v3.8b, #1 + umull v16.8h, v2.8b, v1.8b + umlal v16.8h, v4.8b, v0.8b + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v5.8b, v0.8b + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v6.8h, #3 + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin16_v_neon, export=1 + mov w7, #8 + dup v0.16b, w6 + sub w6, w7, w6 + dup v1.16b, w6 + + ld1 {v2.16b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v4.16b}, [x2], x3 + umull v6.8h, v2.8b, v1.8b + umlal v6.8h, v4.8b, v0.8b + umull2 v16.8h, v2.16b, v1.16b + umlal2 v16.8h, v4.16b, v0.16b + ld1 {v2.16b}, [x2], x3 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v2.8b, v0.8b + umull2 v20.8h, v4.16b, v1.16b + umlal2 v20.8h, v2.16b, v0.16b + rshrn v4.8b, v6.8h, #3 + rshrn2 v4.16b, v16.8h, #3 + rshrn v6.8b, v18.8h, #3 + rshrn2 v6.16b, v20.8h, #3 + st1 {v4.16b}, [x0], x1 + st1 {v6.16b}, [x0], x1 + b.gt 
1b + + ret +endfunc + +function ff_put_vp8_bilin16_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.16b, w6 // my + sub w6, w7, w6 + dup v3.16b, w6 + + ld1 {v4.8b,v5.8b,v6.8b}, [x2], x3 + + ext v7.8b, v5.8b, v6.8b, #1 + ext v6.8b, v4.8b, v5.8b, #1 + umull v16.8h, v4.8b, v1.8b + umlal v16.8h, v6.8b, v0.8b + umull v18.8h, v5.8b, v1.8b + umlal v18.8h, v7.8b, v0.8b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v18.8b,v19.8b,v20.8b}, [x2], x3 + ext v21.8b, v19.8b, v20.8b, #1 + ext v20.8b, v18.8b, v19.8b, #1 + umull v22.8h, v18.8b, v1.8b + umlal v22.8h, v20.8b, v0.8b + ld1 {v26.8b,v27.8b,v28.8b}, [x2], x3 + umull v24.8h, v19.8b, v1.8b + umlal v24.8h, v21.8b, v0.8b + ext v29.8b, v27.8b, v28.8b, #1 + ext v28.8b, v26.8b, v27.8b, #1 + umull v16.8h, v26.8b, v1.8b + umlal v16.8h, v28.8b, v0.8b + umull v18.8h, v27.8b, v1.8b + umlal v18.8h, v29.8b, v0.8b + rshrn v6.8b, v22.8h, #3 + rshrn2 v6.16b, v24.8h, #3 + umull v24.8h, v4.8b, v3.8b + umlal v24.8h, v6.8b, v2.8b + umull2 v30.8h, v4.16b, v3.16b + umlal2 v30.8h, v6.16b, v2.16b + rshrn v4.8b, v16.8h, #3 + rshrn2 v4.16b, v18.8h, #3 + umull v20.8h, v6.8b, v3.8b + umlal v20.8h, v4.8b, v2.8b + umull2 v22.8h, v6.16b, v3.16b + umlal2 v22.8h, v4.16b, v2.16b + rshrn v24.8b, v24.8h, #3 + rshrn2 v24.16b, v30.8h, #3 + st1 {v24.16b}, [x0], x1 + rshrn v20.8b, v20.8h, #3 + rshrn2 v20.16b, v22.8h, #3 + st1 {v20.16b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b,v3.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v16.8b, v16.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v16.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1 {v2.8b}, [x2], x3 +1: + subs w4, w4, #2 + ld1 {v3.8b}, [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + ld1 {v2.8b}, [x2], x3 + umull v6.8h, v3.8b, v1.8b + umlal v6.8h, v2.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + rshrn v6.8b, v6.8h, #3 + st1 {v4.8b}, [x0], x1 + st1 {v6.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin8_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b,v7.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + ld1 {v4.8b,v5.8b}, [x2], x3 + ext v5.8b, v4.8b, v5.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v22.8b, v3.8b + umlal v20.8h, v16.8b, v2.8b + rshrn v22.8b, v18.8h, #3 + umull v24.8h, v16.8b, v3.8b + umlal v24.8h, v22.8b, v2.8b + rshrn v20.8b, v20.8h, #3 + st1 {v20.8b}, [x0], x1 + rshrn v23.8b, v24.8h, #3 + st1 {v23.8b}, [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_h_neon, export=1 + mov w7, #8 + dup v0.8b, w5 + sub w5, w7, w5 + dup v1.8b, w5 +1: + subs w4, w4, #2 + ld1 {v2.8b}, [x2], x3 + ext v3.8b, v2.8b, v3.8b, #1 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v7.8b, #1 + trn1 v2.2s, v2.2s, 
v6.2s + trn1 v3.2s, v3.2s, v7.2s + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_v_neon, export=1 + mov w7, #8 + dup v0.8b, w6 + sub w6, w7, w6 + dup v1.8b, w6 + + ld1r {v2.2s}, [x2], x3 +1: + ld1r {v3.2s}, [x2] + ld1 {v2.s}[1], [x2], x3 + ld1 {v3.s}[1], [x2], x3 + umull v4.8h, v2.8b, v1.8b + umlal v4.8h, v3.8b, v0.8b + trn2 v2.2s, v3.2s, v2.2s + rshrn v4.8b, v4.8h, #3 + st1 {v4.s}[0], [x0], x1 + st1 {v4.s}[1], [x0], x1 + subs w4, w4, #2 + b.gt 1b + + ret +endfunc + +function ff_put_vp8_bilin4_hv_neon, export=1 + mov w7, #8 + dup v0.8b, w5 // mx + sub w5, w7, w5 + dup v1.8b, w5 + dup v2.8b, w6 // my + sub w6, w7, w6 + dup v3.8b, w6 + + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + umull v18.8h, v4.8b, v1.8b + umlal v18.8h, v5.8b, v0.8b + rshrn v22.8b, v18.8h, #3 +1: + subs w4, w4, #2 + ld1 {v6.8b}, [x2], x3 + ext v7.8b, v6.8b, v6.8b, #1 + ld1 {v4.8b}, [x2], x3 + ext v5.8b, v4.8b, v4.8b, #1 + trn1 v6.2s, v6.2s, v4.2s + trn1 v7.2s, v7.2s, v5.2s + umull v16.8h, v6.8b, v1.8b + umlal v16.8h, v7.8b, v0.8b + rshrn v16.8b, v16.8h, #3 + umull v20.8h, v16.8b, v2.8b + trn1 v22.2s, v22.2s, v16.2s + umlal v20.8h, v22.8b, v3.8b + rev64 v22.2s, v16.2s + rshrn v20.8b, v20.8h, #3 + st1 {v20.s}[0], [x0], x1 + st1 {v20.s}[1], [x0], x1 + b.gt 1b + + ret +endfunc From patchwork Fri Feb 1 09:12:55 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [16/19] arm: vp8: Optimize put_epel16_h6v6 with vp8_epel8_v6_y2 X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64430 Message-Id: <1549012378-32118-16-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:55 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development This makes it similar to put_epel16_v6, and gives a 10-25% speedup of this function. 
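The speedup comes from producing two output rows per loop iteration: vertically adjacent rows share five of their six source rows, so each source row only has to be loaded and widened (vmovl) once instead of twice. In scalar terms the pairing looks like the following sketch (hypothetical reference code, not part of the patch; filter6 applies the VP8 six-tap filter with its +,-,+,+,-,+ tap signs and the rounding 7-bit shift with unsigned saturation that vqrshrun #7 performs):

    #include <stddef.h>
    #include <stdint.h>

    static uint8_t filter6(const uint8_t *s, ptrdiff_t stride, const int16_t *f)
    {
        int sum = f[0] * s[-2 * stride] - f[1] * s[-1 * stride]
                + f[2] * s[ 0 * stride] + f[3] * s[ 1 * stride]
                - f[4] * s[ 2 * stride] + f[5] * s[ 3 * stride];
        sum = (sum + 64) >> 7;                       /* rounding shift */
        return sum < 0 ? 0 : sum > 255 ? 255 : (uint8_t)sum;
    }

    /* the _y2 pairing: rows y and y+1 overlap in five of six source rows,
       which the vector code exploits by reusing the widened inputs */
    static void epel_v6_y2(uint8_t *dst, ptrdiff_t dststride,
                           const uint8_t *src, ptrdiff_t srcstride,
                           int width, const int16_t *f)
    {
        for (int x = 0; x < width; x++) {
            dst[x]             = filter6(src + x,             srcstride, f);
            dst[x + dststride] = filter6(src + x + srcstride, srcstride, f);
        }
    }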
Before: Cortex A7 A8 A9 A53 A72 vp8_put_epel16_h6v6_neon: 3058.0 2218.5 2459.8 2183.0 1572.2 After: vp8_put_epel16_h6v6_neon: 2670.8 1934.2 2244.4 1729.4 1503.9 --- libavcodec/arm/vp8dsp_neon.S | 41 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 28 deletions(-) diff --git a/libavcodec/arm/vp8dsp_neon.S b/libavcodec/arm/vp8dsp_neon.S index f43b4f7..b707d19 100644 --- a/libavcodec/arm/vp8dsp_neon.S +++ b/libavcodec/arm/vp8dsp_neon.S @@ -773,23 +773,6 @@ endfunc vqrshrun.s16 \d1, q14, #7 .endm -.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 - vmovl.u8 q10, \s2 - vmovl.u8 q11, \s3 - vmovl.u8 q9, \s1 - vmovl.u8 q12, \s4 - vmovl.u8 q8, \s0 - vmovl.u8 q13, \s5 - vmul.u16 q10, q10, d0[2] - vmul.u16 q11, q11, d0[3] - vmls.u16 q10, q9, d0[1] - vmls.u16 q11, q12, d1[0] - vmla.u16 q10, q8, d0[0] - vmla.u16 q11, q13, d1[1] - vqadd.s16 q11, q10, q11 - vqrshrun.s16 \d0, q11, #7 -.endm - .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 vmovl.u8 q10, \s0 vmovl.u8 q11, \s3 @@ -909,12 +892,12 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 sub r2, r2, r3, lsl #1 sub r2, r2, #2 push {r4,lr} - vpush {d8-d9} + vpush {d8-d15} @ first pass (horizontal): - ldr r4, [sp, #28] @ mx + ldr r4, [sp, #64+8+4] @ mx movrel lr, subpel_filters-16 - ldr r12, [sp, #24] @ h + ldr r12, [sp, #64+8+0] @ h add r4, lr, r4, lsl #4 sub sp, sp, #336+16 vld1.16 {q0}, [r4,:128] @@ -931,9 +914,9 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 bne 1b @ second pass (vertical): - ldr r4, [sp, #336+16+32] @ my + ldr r4, [sp, #336+16+64+8+8] @ my movrel lr, subpel_filters-16 - ldr r12, [sp, #336+16+24] @ h + ldr r12, [sp, #336+16+64+8+0] @ h add r4, lr, r4, lsl #4 add lr, sp, #15 vld1.16 {q0}, [r4,:128] @@ -941,18 +924,20 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 2: vld1.8 {d2-d5}, [lr,:128]! vld1.8 {d6-d9}, [lr,:128]! - vld1.8 {d28-d31},[lr,:128] - sub lr, lr, #48 + vld1.8 {d10-d13},[lr,:128]! + vld1.8 {d14-d15},[lr,:128] + sub lr, lr, #64 - vp8_epel8_v6 d2, d2, d4, d6, d8, d28, d30 - vp8_epel8_v6 d3, d3, d5, d7, d9, d29, d31 + vp8_epel8_v6_y2 d2, d4, d2, d4, d6, d8, d10, d12, d14 + vp8_epel8_v6_y2 d3, d5, d3, d5, d7, d9, d11, d13, d15 vst1.8 {d2-d3}, [r0,:128], r1 - subs r12, r12, #1 + vst1.8 {d4-d5}, [r0,:128], r1 + subs r12, r12, #2 bne 2b add sp, sp, #336+16 - vpop {d8-d9} + vpop {d8-d15} pop {r4,pc} endfunc From patchwork Fri Feb 1 09:12:56 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [17/19] aarch64: vp8: Optimize put_epel16_h6v6 with vp8_epel8_v6_y2 X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64431 Message-Id: <1549012378-32118-17-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:56 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development This makes it similar to put_epel16_v6, and gives a large speedup on Cortex A53, a minor speedup on A72 and a very minor slowdown on A73. 
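On aarch64 the pairing has the same effect, and the two 8-pixel halves of each 16-pixel output row are glued together with trn1 on the .2d arrangement before the store. In intrinsics terms that packing step is simply a vcombine of the two low halves (an illustrative equivalence, not code from the patch):

    #include <arm_neon.h>

    /* same effect as: trn1 vR.2d, vA.2d, vB.2d
       (takes element 0 of each source register) */
    static uint8x16_t pack_row(uint8x8_t a, uint8x8_t b)
    {
        return vcombine_u8(a, b);
    }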
Before: Cortex A53 A72 A73 vp8_put_epel16_h6v6_neon: 2211.4 1586.5 1431.7 After: vp8_put_epel16_h6v6_neon: 1736.9 1522.0 1448.1 --- libavcodec/aarch64/vp8dsp_neon.S | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 604be8a..139b380 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -769,23 +769,6 @@ endfunc sqrshrun2 \d0\().16b, v22.8h, #7 .endm -.macro vp8_epel8_v6 d0, s0, s1, s2, s3, s4, s5 - uxtl \s2\().8h, \s2\().8b - uxtl \s3\().8h, \s3\().8b - uxtl \s1\().8h, \s1\().8b - uxtl \s4\().8h, \s4\().8b - uxtl \s0\().8h, \s0\().8b - uxtl \s5\().8h, \s5\().8b - mul \s2\().8h, \s2\().8h, v0.h[2] - mul \s3\().8h, \s3\().8h, v0.h[3] - mls \s2\().8h, \s1\().8h, v0.h[1] - mls \s3\().8h, \s4\().8h, v0.h[4] - mla \s2\().8h, \s0\().8h, v0.h[0] - mla \s3\().8h, \s5\().8h, v0.h[5] - sqadd \s3\().8h, \s2\().8h, \s3\().8h - sqrshrun \d0\().8b, \s3\().8h, #7 -.endm - .macro vp8_epel8_v6_y2 d0, d1, s0, s1, s2, s3, s4, s5, s6 uxtl \s0\().8h, \s0\().8b uxtl \s3\().8h, \s3\().8b @@ -942,15 +925,18 @@ function ff_put_vp8_epel16_h6v6_neon, export=1 2: ld1 {v1.8b - v4.8b}, [x7], #32 ld1 {v16.8b - v19.8b}, [x7], #32 - ld1 {v20.8b - v23.8b}, [x7] - sub x7, x7, #48 + ld1 {v20.8b - v23.8b}, [x7], #32 + ld1 {v24.8b - v25.8b}, [x7] + sub x7, x7, #64 - vp8_epel8_v6 v5, v1, v3, v16, v18, v20, v22 - vp8_epel8_v6 v2, v2, v4, v17, v19, v21, v23 - trn1 v2.2d, v5.2d, v2.2d + vp8_epel8_v6_y2 v1, v3, v1, v3, v16, v18, v20, v22, v24 + vp8_epel8_v6_y2 v2, v4, v2, v4, v17, v19, v21, v23, v25 + trn1 v1.2d, v1.2d, v2.2d + trn1 v3.2d, v3.2d, v4.2d - st1 {v2.16b}, [x0], x1 - subs x4, x4, #1 + st1 {v1.16b}, [x0], x1 + st1 {v3.16b}, [x0], x1 + subs x4, x4, #2 b.ne 2b add sp, sp, #336+16 From patchwork Fri Feb 1 09:12:57 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [18/19] aarch64: vp8: Skip saturating in shrn in ff_vp8_idct_add_neon X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64433 Message-Id: <1549012378-32118-18-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:57 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development The original arm version didn't do saturation here. This probably doesn't make any difference for performance, but it reduces the differences between the arm and aarch64 versions.
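A quick range check (my arithmetic, not from the commit message) shows why dropping the saturation is valid: the narrowed value comes from an smull by the IDCT constant 20091 held in v4.h[0], followed by a 16-bit narrowing shift, and for any 16-bit input x, |x * 20091| <= 32768 * 20091, so (x * 20091) >> 16 has magnitude at most 10046 and always fits the narrowed 16-bit lane; the saturating form could never trigger.

    /* hypothetical exhaustive self-check of the bound, not part of the
       patch; assumes the usual arithmetic right shift of negative values */
    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        for (int32_t x = -32768; x <= 32767; x++) {
            int32_t v = (x * 20091) >> 16;  /* what smull + shrn #16 computes */
            assert(v >= INT16_MIN && v <= INT16_MAX);
        }
        return 0;
    }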
--- libavcodec/aarch64/vp8dsp_neon.S | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index 139b380..cac4558 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -92,8 +92,8 @@ function ff_vp8_idct_add_neon, export=1 smull v27.4s, v3.4h, v4.h[0] sqdmulh v20.4h, v1.4h, v4.h[1] sqdmulh v23.4h, v3.4h, v4.h[1] - sqshrn v21.4h, v26.4s, #16 - sqshrn v22.4h, v27.4s, #16 + shrn v21.4h, v26.4s, #16 + shrn v22.4h, v27.4s, #16 add v21.4h, v21.4h, v1.4h add v22.4h, v22.4h, v3.4h @@ -117,8 +117,8 @@ function ff_vp8_idct_add_neon, export=1 st1 {v29.16b}, [x1] sqdmulh v21.4h, v1.4h, v4.h[1] sqdmulh v23.4h, v3.4h, v4.h[1] - sqshrn v20.4h, v26.4s, #16 - sqshrn v22.4h, v27.4s, #16 + shrn v20.4h, v26.4s, #16 + shrn v22.4h, v27.4s, #16 add v20.4h, v20.4h, v1.4h add v22.4h, v22.4h, v3.4h add v16.4h, v0.4h, v2.4h From patchwork Fri Feb 1 09:12:58 2019 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit Subject: [19/19] aarch64: vp8: Optimize vp8_idct_add_neon for aarch64 X-Patchwork-Submitter: =?utf-8?q?Martin_Storsj=C3=B6?= X-Patchwork-Id: 64434 Message-Id: <1549012378-32118-19-git-send-email-martin@martin.st> To: libav-devel@libav.org Date: Fri, 1 Feb 2019 11:12:58 +0200 From: =?utf-8?q?Martin_Storsj=C3=B6?= List-Id: libav development The previous version was a pretty exact translation of the arm version. This version does do some unnecessary arithmetic (it does more operations on vectors that are only half filled; it does 4 uaddw and 4 sqxtun instead of 2 of each), but it reduces the overhead of packing data together (which could be done for free in the arm version). This gives a decent speedup on Cortex A53, a minor speedup on A72 and a very minor slowdown on Cortex A73.
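The reconstruction step behind those uaddw/sqxtun pairs is, per pixel, dst = clip_uint8(dst + residual); the rewrite simply performs it on four half-filled row vectors instead of two packed ones. A scalar sketch of one such row (illustrative only, with the clip spelled out):

    #include <stdint.h>

    static uint8_t clip_uint8(int v)
    {
        return v < 0 ? 0 : v > 255 ? 255 : (uint8_t)v; /* sqxtun, per lane */
    }

    /* one 4-pixel row: uaddw widens dst and adds the residual,
       sqxtun narrows back with unsigned saturation */
    static void add_residual_row4(uint8_t *dst, const int16_t *res)
    {
        for (int i = 0; i < 4; i++)
            dst[i] = clip_uint8(dst[i] + res[i]);
    }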
Before: Cortex A53 A72 A73 vp8_idct_add_neon: 79.7 67.5 65.0 After: vp8_idct_add_neon: 67.7 64.8 66.7 --- libavcodec/aarch64/vp8dsp_neon.S | 49 ++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/libavcodec/aarch64/vp8dsp_neon.S b/libavcodec/aarch64/vp8dsp_neon.S index cac4558..47fdc21 100644 --- a/libavcodec/aarch64/vp8dsp_neon.S +++ b/libavcodec/aarch64/vp8dsp_neon.S @@ -125,36 +125,37 @@ function ff_vp8_idct_add_neon, export=1 sub v17.4h, v0.4h, v2.4h add v18.4h, v20.4h, v23.4h - ld1 {v24.d}[0], [x0], x2 - zip1 v16.2d, v16.2d, v17.2d - sub v19.4h, v21.4h, v22.4h - ld1 {v25.d}[0], [x0], x2 - zip1 v18.2d, v18.2d, v19.2d - add v0.8h, v16.8h, v18.8h - ld1 {v25.d}[1], [x0], x2 - sub v1.8h, v16.8h, v18.8h - ld1 {v24.d}[1], [x0], x2 - srshr v0.8h, v0.8h, #3 - trn1 v24.4s, v24.4s, v25.4s - srshr v1.8h, v1.8h, #3 + ld1 {v24.s}[0], [x0], x2 + sub v19.4h, v21.4h, v22.4h + ld1 {v25.s}[0], [x0], x2 + add v0.4h, v16.4h, v18.4h + add v1.4h, v17.4h, v19.4h + ld1 {v26.s}[0], [x0], x2 + sub v3.4h, v16.4h, v18.4h + sub v2.4h, v17.4h, v19.4h + ld1 {v27.s}[0], [x0], x2 + srshr v0.4h, v0.4h, #3 + srshr v1.4h, v1.4h, #3 + srshr v2.4h, v2.4h, #3 + srshr v3.4h, v3.4h, #3 + sub x0, x0, x2, lsl #2 - ext v1.16b, v1.16b, v1.16b, #8 - trn1 v3.2d, v0.2d, v1.2d - trn2 v0.2d, v0.2d, v1.2d - trn1 v1.8h, v3.8h, v0.8h - trn2 v3.8h, v3.8h, v0.8h - uzp1 v0.4s, v1.4s, v3.4s - uzp2 v1.4s, v3.4s, v1.4s + transpose_4x4H v0, v1, v2, v3, v5, v6, v7, v16 uaddw v0.8h, v0.8h, v24.8b - uaddw2 v1.8h, v1.8h, v24.16b + uaddw v1.8h, v1.8h, v25.8b + uaddw v2.8h, v2.8h, v26.8b + uaddw v3.8h, v3.8h, v27.8b sqxtun v0.8b, v0.8h - sqxtun2 v0.16b, v1.8h + sqxtun v1.8b, v1.8h + sqxtun v2.8b, v2.8h + sqxtun v3.8b, v3.8h + st1 {v0.s}[0], [x0], x2 - st1 {v0.s}[1], [x0], x2 - st1 {v0.s}[3], [x0], x2 - st1 {v0.s}[2], [x0], x2 + st1 {v1.s}[0], [x0], x2 + st1 {v2.s}[0], [x0], x2 + st1 {v3.s}[0], [x0], x2 ret endfunc
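A closing note on the structural change in this last patch: instead of zipping the four IDCT rows into two 128-bit registers and untangling them afterwards with the ext/trn/uzp chain, the rows stay in four half-filled registers and go through the shared transpose_4x4H macro. What that macro computes is an in-register 4x4 transpose of 16-bit lanes; a scalar model of it (illustrative, not the macro itself):

    #include <stdint.h>

    /* exchange element (r,c) with (c,r) across four 4-lane rows */
    static void transpose_4x4h(int16_t row[4][4])
    {
        for (int r = 0; r < 4; r++)
            for (int c = r + 1; c < 4; c++) {
                int16_t t = row[r][c];
                row[r][c] = row[c][r];
                row[c][r] = t;
            }
    }

Keeping the rows separate costs a few extra half-width uaddw/sqxtun operations, but it removes the shuffle overhead, which is what buys the speedup reported above on the in-order Cortex A53.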