arm: Avoid using .dn register aliases

Message ID 1494315678-2204-1-git-send-email-martin@martin.st
State Committed
Commit d7320ca3ed10f0d35b3740fa03341161e74275ea
Headers show

Commit Message

Martin Storsjö May 9, 2017, 7:41 a.m.
clang now (in the upcoming 5.0 version) is capable of building our
arm assembly without relying on gas-preprocessor. The VC1 MC assembly
was only built and used if the chosen assembler supported the .dn
directives though. This meant that VC1 decoding got a speed regression
on clang, unless the user manually chose to use gas-preprocessor again.

By avoiding the .dn register aliases, we can build the VC1 MC
assembly with the latest clang version.

Support for the .dn/.qn directives in clang/LLVM isn't actively planned,
see https://bugs.llvm.org/show_bug.cgi?id=18199.

This partially reverts 896a5bff64264f4d01ed98eacc97a67260c1e17e.
---
 configure                         |  5 -----
 libavcodec/arm/vc1dsp_init_neon.c |  4 ----
 libavcodec/arm/vc1dsp_neon.S      | 21 +++++----------------
 3 files changed, 5 insertions(+), 25 deletions(-)

Comments

Diego Biurrun May 9, 2017, 9:29 a.m. | #1
On Tue, May 09, 2017 at 10:41:18AM +0300, Martin Storsjö wrote:
> clang now (in the upcoming 5.0 version) is capable of building our
> arm assembly without relying on gas-preprocessor. The VC1 MC assembly
> was only built and used if the chosen assembler supported the .dn
> directives though. This meant that VC1 decoding got a speed regression
> on clang, unless the user manually chose using gas-preprocessor again.
> 
> By avoiding using the .dn register aliases, we can build the VC1 MC
> assembly with the latest clang version.
> 
> Support for the .dn/.qn directives in clang/LLVM isn't actively planned,
> see https://bugs.llvm.org/show_bug.cgi?id=18199.
> 
> This partially reverts 896a5bff64264f4d01ed98eacc97a67260c1e17e.
> ---
>  configure                         |  5 -----
>  libavcodec/arm/vc1dsp_init_neon.c |  4 ----
>  libavcodec/arm/vc1dsp_neon.S      | 21 +++++----------------
>  3 files changed, 5 insertions(+), 25 deletions(-)

probably OK

Diego

Patch

diff --git a/configure b/configure
index d70e615..d6c44cf 100755
--- a/configure
+++ b/configure
@@ -1662,7 +1662,6 @@  SYSTEM_FUNCS="
 
 TOOLCHAIN_FEATURES="
     as_arch_directive
-    as_dn_directive
     as_fpu_directive
     as_func
     as_object_arch
@@ -4380,10 +4379,6 @@  EOF
     check_as <<EOF && enable as_arch_directive
 .arch armv7-a
 EOF
-    check_as <<EOF && enable as_dn_directive
-ra .dn d0.i16
-.unreq ra
-EOF
     check_as <<EOF && enable as_fpu_directive
 .fpu neon
 EOF
diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 08c07c4..1c06597 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -22,8 +22,6 @@ 
 #include "libavcodec/vc1dsp.h"
 #include "vc1dsp.h"
 
-#include "config.h"
-
 void ff_vc1_inv_trans_8x8_neon(int16_t *block);
 void ff_vc1_inv_trans_4x8_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 void ff_vc1_inv_trans_8x4_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
@@ -93,7 +91,6 @@  av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->vc1_inv_trans_4x4_dc = ff_vc1_inv_trans_4x4_dc_neon;
 
     dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_pixels8x8_neon;
-    if (HAVE_AS_DN_DIRECTIVE) {
     dsp->put_vc1_mspel_pixels_tab[ 1] = ff_put_vc1_mspel_mc10_neon;
     dsp->put_vc1_mspel_pixels_tab[ 2] = ff_put_vc1_mspel_mc20_neon;
     dsp->put_vc1_mspel_pixels_tab[ 3] = ff_put_vc1_mspel_mc30_neon;
@@ -109,7 +106,6 @@  av_cold void ff_vc1dsp_init_neon(VC1DSPContext *dsp)
     dsp->put_vc1_mspel_pixels_tab[13] = ff_put_vc1_mspel_mc13_neon;
     dsp->put_vc1_mspel_pixels_tab[14] = ff_put_vc1_mspel_mc23_neon;
     dsp->put_vc1_mspel_pixels_tab[15] = ff_put_vc1_mspel_mc33_neon;
-    }
 
     dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_neon;
     dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_neon;
diff --git a/libavcodec/arm/vc1dsp_neon.S b/libavcodec/arm/vc1dsp_neon.S
index 1653a4c..ff88fe2 100644
--- a/libavcodec/arm/vc1dsp_neon.S
+++ b/libavcodec/arm/vc1dsp_neon.S
@@ -663,7 +663,6 @@  function ff_vc1_inv_trans_4x4_neon, export=1
         bx              lr
 endfunc
 
-#if HAVE_AS_DN_DIRECTIVE
 @ The absolute value of multiplication constants from vc1_mspel_filter and vc1_mspel_{ver,hor}_filter_16bits.
 @ The sign is embedded in the code below that carries out the multiplication (mspel_filter{,.16}).
 #define MSPEL_MODE_1_MUL_CONSTANTS  4 53 18 3
@@ -689,22 +688,18 @@  endfunc
 
 @ Setup constants in registers for a subsequent use of mspel_filter{,.16}.
 .macro mspel_constants typesize reg_a reg_b reg_c reg_d filter_a filter_b filter_c filter_d reg_add filter_add_register
-  @ Define double-word register aliases. Typesize should be i8 or i16.
-  ra .dn \reg_a\().\typesize
-  rb .dn \reg_b\().\typesize
-  rc .dn \reg_c\().\typesize
-  rd .dn \reg_d\().\typesize
+  @ Typesize should be i8 or i16.
 
   @ Only set the register if the value is not 1 and unique
   .if \filter_a != 1
-        vmov            ra,  #\filter_a              @ ra = filter_a
+        vmov.\typesize  \reg_a,  #\filter_a          @ reg_a = filter_a
   .endif
-        vmov            rb,  #\filter_b              @ rb = filter_b
+        vmov.\typesize  \reg_b,  #\filter_b          @ reg_b = filter_b
   .if \filter_b != \filter_c
-        vmov            rc,  #\filter_c              @ rc = filter_c
+        vmov.\typesize  \reg_c,  #\filter_c          @ reg_c = filter_c
   .endif
   .if \filter_d != 1
-        vmov            rd,  #\filter_d              @ rd = filter_d
+        vmov.\typesize  \reg_d,  #\filter_d          @ reg_d = filter_d
   .endif
   @ vdup to double the size of typesize
   .ifc \typesize,i8
@@ -712,11 +707,6 @@  endfunc
   .else
         vdup.32         \reg_add,  \filter_add_register     @ reg_add = filter_add_register
   .endif
-
-  .unreq ra
-  .unreq rb
-  .unreq rc
-  .unreq rd
 .endm
 
 @ After mspel_constants has been used, do the filtering.
@@ -987,7 +977,6 @@  PUT_VC1_MSPEL_MC_V_ONLY(2)
 PUT_VC1_MSPEL_MC_V_ONLY(3)
 
 #undef PUT_VC1_MSPEL_MC_V_ONLY
-#endif
 
 function ff_put_pixels8x8_neon, export=1
         vld1.64         {d0}, [r1], r2