[2/2] x86: check for AV_CPU_FLAG_AVXSLOW where useful

Message ID 1432619110-5336-2-git-send-email-jamrial@gmail.com
State New
Headers show

Commit Message

James Almer May 26, 2015, 5:45 a.m.
Signed-off-by: James Almer <jamrial@gmail.com>
---
The FMA4 functions from libavresample's audio_mix need to be handled
differently. Disabling them when AVXSLOW is set would be pointless, since
no current CPU has both FMA4 and a fast float execution unit. So
I'm thinking about duplicating them and doing the following:

FMA3 YMM/XMM for current Intel CPUs (basically, renaming the existing
functions)
FMA4 XMM for current AMD CPUs (regardless of x86_32 or x86_64).

I'll see about implementing that in the coming days.

 libavcodec/x86/dcadsp_init.c           |  4 ++--
 libavcodec/x86/dct_init.c              |  2 +-
 libavcodec/x86/fft_init.c              |  2 +-
 libavfilter/x86/af_volume_init.c       |  2 +-
 libavresample/x86/audio_convert_init.c | 10 ++++++----
 libavresample/x86/audio_mix_init.c     | 10 ++++++----
 libavresample/x86/dither_init.c        |  4 ++--
 libavutil/x86/float_dsp_init.c         |  2 +-
 libavutil/x86/lls_init.c               |  2 +-
 9 files changed, 21 insertions(+), 17 deletions(-)

Comments

Luca Barbato May 26, 2015, 9:49 a.m. | #1
On 26/05/15 07:45, James Almer wrote:
> Signed-off-by: James Almer <jamrial@gmail.com>
> ---
> The FMA4 functions from libavresample's audio_mix need to be handled
> differently. Disabling them if avxslow is true is pointless since no
> CPU out there currently has FMA4 and a fast float execution unit. So
> I'm thinking about duplicating them and doing:
> 
> FMA3 YMM/XMM for current Intel CPUs (Basically, renaming the existing
> functions)
> FMA4 XMM for current AMD stuff (Regardless of x86_32 or x86_64).
> 
> I'll see about implementing that in the coming days.
> 

I'd add an EXTERNAL_AVX_FAST macro, since I notice that

`if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW))` is
repeated a few times. Thanks a lot for checking all the functions!

lu

Patch

diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 9acb818..8deb6d6 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -98,10 +98,10 @@  av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->synth_filter_float = synth_filter_sse2;
     }
-    if (EXTERNAL_AVX(cpu_flags)) {
+    if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
         s->synth_filter_float = synth_filter_avx;
     }
-    if (EXTERNAL_FMA3(cpu_flags)) {
+    if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
         s->synth_filter_float = synth_filter_fma3;
     }
 #endif /* HAVE_YASM */
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index 7bda5e8..660d118 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -34,6 +34,6 @@  av_cold void ff_dct_init_x86(DCTContext *s)
         s->dct32 = ff_dct32_float_sse;
     if (EXTERNAL_SSE2(cpu_flags))
         s->dct32 = ff_dct32_float_sse2;
-    if (EXTERNAL_AVX(cpu_flags))
+    if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW))
         s->dct32 = ff_dct32_float_avx;
 }
diff --git a/libavcodec/x86/fft_init.c b/libavcodec/x86/fft_init.c
index 7ca72c5..840f348 100644
--- a/libavcodec/x86/fft_init.c
+++ b/libavcodec/x86/fft_init.c
@@ -48,7 +48,7 @@  av_cold void ff_fft_init_x86(FFTContext *s)
         s->fft_calc    = ff_fft_calc_sse;
         s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
     }
-    if (EXTERNAL_AVX(cpu_flags) && s->nbits >= 5) {
+    if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW) && s->nbits >= 5) {
         /* AVX for SB */
         s->imdct_half      = ff_imdct_half_avx;
         s->fft_calc        = ff_fft_calc_avx;
diff --git a/libavfilter/x86/af_volume_init.c b/libavfilter/x86/af_volume_init.c
index c59e0ed..f70bafa 100644
--- a/libavfilter/x86/af_volume_init.c
+++ b/libavfilter/x86/af_volume_init.c
@@ -52,7 +52,7 @@  av_cold void ff_volume_init_x86(VolumeContext *vol)
             vol->scale_samples = ff_scale_samples_s32_ssse3_atom;
             vol->samples_align = 4;
         }
-        if (EXTERNAL_AVX(cpu_flags)) {
+        if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
             vol->scale_samples = ff_scale_samples_s32_avx;
             vol->samples_align = 8;
         }
diff --git a/libavresample/x86/audio_convert_init.c b/libavresample/x86/audio_convert_init.c
index d85ca84..1aab0f7 100644
--- a/libavresample/x86/audio_convert_init.c
+++ b/libavresample/x86/audio_convert_init.c
@@ -227,10 +227,12 @@  av_cold void ff_audio_convert_init_x86(AudioConvert *ac)
                                   6, 16, 4, "SSE4", ff_conv_fltp_to_flt_6ch_sse4);
     }
     if (EXTERNAL_AVX(cpu_flags)) {
-        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S32,
-                                  0, 32, 16, "AVX", ff_conv_s32_to_flt_avx);
-        ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_FLT,
-                                  0, 32, 32, "AVX", ff_conv_flt_to_s32_avx);
+        if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+            ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_FLT, AV_SAMPLE_FMT_S32,
+                                      0, 32, 16, "AVX", ff_conv_s32_to_flt_avx);
+            ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S32, AV_SAMPLE_FMT_FLT,
+                                      0, 32, 32, "AVX", ff_conv_flt_to_s32_avx);
+        }
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P,
                                   2, 16, 16, "AVX", ff_conv_s16p_to_s16_2ch_avx);
         ff_audio_convert_set_func(ac, AV_SAMPLE_FMT_S16, AV_SAMPLE_FMT_S16P,
diff --git a/libavresample/x86/audio_mix_init.c b/libavresample/x86/audio_mix_init.c
index 7fc530e..4fc2749 100644
--- a/libavresample/x86/audio_mix_init.c
+++ b/libavresample/x86/audio_mix_init.c
@@ -196,10 +196,12 @@  av_cold void ff_audio_mix_init_x86(AudioMix *am)
                               1, 2, 16, 8, "SSE4", ff_mix_1_to_2_s16p_flt_sse4);
     }
     if (EXTERNAL_AVX(cpu_flags)) {
-        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,
-                              2, 1, 32, 16, "AVX", ff_mix_2_to_1_fltp_flt_avx);
-        ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,
-                              1, 2, 32, 8, "AVX", ff_mix_1_to_2_fltp_flt_avx);
+        if (!(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
+            ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,
+                                  2, 1, 32, 16, "AVX", ff_mix_2_to_1_fltp_flt_avx);
+            ff_audio_mix_set_func(am, AV_SAMPLE_FMT_FLTP, AV_MIX_COEFF_TYPE_FLT,
+                                  1, 2, 32, 8, "AVX", ff_mix_1_to_2_fltp_flt_avx);
+        }
         ff_audio_mix_set_func(am, AV_SAMPLE_FMT_S16P, AV_MIX_COEFF_TYPE_FLT,
                               1, 2, 16, 8, "AVX", ff_mix_1_to_2_s16p_flt_avx);
     }
diff --git a/libavresample/x86/dither_init.c b/libavresample/x86/dither_init.c
index 8349d5e..0f0e7db 100644
--- a/libavresample/x86/dither_init.c
+++ b/libavresample/x86/dither_init.c
@@ -46,14 +46,14 @@  av_cold void ff_dither_init_x86(DitherDSPContext *ddsp,
         if (EXTERNAL_SSE2(cpu_flags)) {
             ddsp->dither_int_to_float = ff_dither_int_to_float_rectangular_sse2;
         }
-        if (EXTERNAL_AVX(cpu_flags)) {
+        if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
             ddsp->dither_int_to_float = ff_dither_int_to_float_rectangular_avx;
         }
     } else {
         if (EXTERNAL_SSE2(cpu_flags)) {
             ddsp->dither_int_to_float = ff_dither_int_to_float_triangular_sse2;
         }
-        if (EXTERNAL_AVX(cpu_flags)) {
+        if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
             ddsp->dither_int_to_float = ff_dither_int_to_float_triangular_avx;
         }
     }
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index a04d91c..900f423 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -146,7 +146,7 @@  av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
     if (EXTERNAL_SSE2(cpu_flags)) {
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_sse2;
     }
-    if (EXTERNAL_AVX(cpu_flags)) {
+    if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
         fdsp->vector_fmul = ff_vector_fmul_avx;
         fdsp->vector_fmac_scalar = ff_vector_fmac_scalar_avx;
         fdsp->vector_dmul_scalar = ff_vector_dmul_scalar_avx;
diff --git a/libavutil/x86/lls_init.c b/libavutil/x86/lls_init.c
index 888bc54..1e70f11 100644
--- a/libavutil/x86/lls_init.c
+++ b/libavutil/x86/lls_init.c
@@ -35,7 +35,7 @@  av_cold void ff_init_lls_x86(LLSModel *m)
         if (m->indep_count >= 4)
             m->evaluate_lls = ff_evaluate_lls_sse2;
     }
-    if (EXTERNAL_AVX(cpu_flags)) {
+    if (EXTERNAL_AVX(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW)) {
         m->update_lls = ff_update_lls_avx;
     }
 }