[1/3] x86/synth_filter: add synth_filter_sse

Message ID 1393899904-13802-2-git-send-email-jamrial@gmail.com
State New
Headers show

Commit Message

James Almer March 4, 2014, 2:25 a.m.
The new SSE version is built only on x86_32 targets; the SSE2 version is still built for all targets.

Signed-off-by: James Almer <jamrial@gmail.com>
---
 libavcodec/x86/dcadsp.asm    | 55 +++++++++++++++++++++++++++++++++-----------
 libavcodec/x86/dcadsp_init.c | 44 +++++++++++++++++++++--------------
 2 files changed, 69 insertions(+), 30 deletions(-)

Comments

Christophe Gisquet March 14, 2014, 10:56 a.m. | #1
Hi,

2014-03-04 3:25 GMT+01:00 James Almer <jamrial@gmail.com>:
> -INIT_XMM sse2
> +%macro SETZERO 1
> +%if cpuflag(sse2)
> +    pxor          %1, %1
> +%else
> +    xorps         %1, %1, %1
> +%endif
> +%endmacro
> +
> +%macro SHUF 2
> +%if cpuflag(sse2)
> +    pshufd        %1, %2, q0123
> +%else
> +    mova          %1, %2
> +    shufps        %1, %1, q0123
> +%endif
> +%endmacro

We already discussed this, and indeed it is worth having SSE2
(integer) instructions instead of pure (float) SSE ones for the SSE2
version as they are actually faster. OK from me then for the asm.

Not sure if the C part still applies cleanly, but this should be minor.
James Almer March 15, 2014, 9:52 p.m. | #2
On 14/03/14 7:56 AM, Christophe Gisquet wrote:
> Hi,
> 
> 2014-03-04 3:25 GMT+01:00 James Almer <jamrial@gmail.com>:
>> -INIT_XMM sse2
>> +%macro SETZERO 1
>> +%if cpuflag(sse2)
>> +    pxor          %1, %1
>> +%else
>> +    xorps         %1, %1, %1
>> +%endif
>> +%endmacro
>> +
>> +%macro SHUF 2
>> +%if cpuflag(sse2)
>> +    pshufd        %1, %2, q0123
>> +%else
>> +    mova          %1, %2
>> +    shufps        %1, %1, q0123
>> +%endif
>> +%endmacro
> 
> We already discussed this, and indeed it is worth having SSE2
> (integer) instructions instead of pure (float) SSE ones for the SSE2
> version as they are actually faster. OK from me then for the asm.
> 
> Not sure if the C part still applies cleanly, but this should be minor.

It doesn't. I'll rebase and send the patchset again with some other changes 
later.

Patch

diff --git a/libavcodec/x86/dcadsp.asm b/libavcodec/x86/dcadsp.asm
index 56039ba..970ec3d 100644
--- a/libavcodec/x86/dcadsp.asm
+++ b/libavcodec/x86/dcadsp.asm
@@ -199,15 +199,31 @@  INIT_XMM sse
 DCA_LFE_FIR 0
 DCA_LFE_FIR 1
 
-INIT_XMM sse2
+%macro SETZERO 1                  ; SETZERO reg: clear reg by xor-ing it with itself
+%if cpuflag(sse2)
+    pxor          %1, %1          ; sse2 build: integer-domain xor
+%else
+    xorps         %1, %1, %1      ; non-sse2 build: float-domain xor; NOTE(review): 3-operand form presumably relies on the x86inc instruction wrappers - confirm
+%endif
+%endmacro
+
+%macro SHUF 2                     ; SHUF dst, src: dst = src shuffled with selector q0123 (callers pass a memory source)
+%if cpuflag(sse2)
+    pshufd        %1, %2, q0123   ; sse2 build: single instruction with separate source and destination
+%else
+    mova          %1, %2          ; non-sse2 build: copy the source into dst first ...
+    shufps        %1, %1, q0123   ; ... then shuffle dst in place with the same selector
+%endif
+%endmacro
+
 %macro INNER_LOOP   1
     ; reading backwards:  ptr1 = synth_buf + j + i; ptr2 = synth_buf + j - i
     ;~ a += window[i + j]      * (-synth_buf[15 - i + j])
     ;~ b += window[i + j + 16] * (synth_buf[i + j])
-    pshufd        m5, [ptr2 + j + (15 - 3) * 4], q0123
+    SHUF          m5, [ptr2 + j + (15 - 3) * 4]
     mova          m6, [ptr1 + j]
 %if ARCH_X86_64
-    pshufd       m11, [ptr2 + j + (15 - 3) * 4 - mmsize], q0123
+    SHUF         m11, [ptr2 + j + (15 - 3) * 4 - mmsize]
     mova         m12, [ptr1 + j + mmsize]
 %endif
     mulps         m6, [win  + %1 + j + 16 * 4]
@@ -224,10 +240,10 @@  INIT_XMM sse2
 %endif
     ;~ c += window[i + j + 32] * (synth_buf[16 + i + j])
     ;~ d += window[i + j + 48] * (synth_buf[31 - i + j])
-    pshufd        m6, [ptr2 + j + (31 - 3) * 4], q0123
+    SHUF          m6, [ptr2 + j + (31 - 3) * 4]
     mova          m5, [ptr1 + j + 16 * 4]
 %if ARCH_X86_64
-    pshufd       m12, [ptr2 + j + (31 - 3) * 4 - mmsize], q0123
+    SHUF         m12, [ptr2 + j + (31 - 3) * 4 - mmsize]
     mova         m11, [ptr1 + j + mmsize + 16 * 4]
 %endif
     mulps         m5, [win  + %1 + j + 32 * 4]
@@ -245,20 +261,25 @@  INIT_XMM sse2
     sub            j, 64 * 4
 %endmacro
 
-; void ff_synth_filter_inner_sse2(float *synth_buf, float synth_buf2[32],
-;                                 const float window[512], float out[32],
-;                                 intptr_t offset, float scale)
+; void ff_synth_filter_inner_<opt>(float *synth_buf, float synth_buf2[32],
+;                                  const float window[512], float out[32],
+;                                  intptr_t offset, float scale)
+%macro SYNTH_FILTER 0
 cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
                               synth_buf, synth_buf2, window, out, off, scale
 %define scale m0
 %if ARCH_X86_32 || WIN64
+%if cpuflag(sse2)
     movd       scale, scalem
+%else
+    movss      scale, scalem
+%endif
 ; Make sure offset is in a register and not on the stack
 %define OFFQ  r4q
 %else
 %define OFFQ  offq
 %endif
-    pshufd        m0, m0, 0
+    SPLATD        m0
     ; prepare inner counter limit 1
     mov          r5q, 480
     sub          r5q, offmp
@@ -274,8 +295,8 @@  cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %endif
 .mainloop
     ; m1 = a  m2 = b  m3 = c  m4 = d
-    pxor          m3, m3
-    pxor          m4, m4
+    SETZERO       m3
+    SETZERO       m4
     mova          m1, [buf2 + i]
     mova          m2, [buf2 + i + 16 * 4]
 %if ARCH_X86_32
@@ -292,8 +313,8 @@  cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
 %define ptr2     r7q ; must be loaded
 %define win      r8q
 %define j        r9q
-    pxor          m9, m9
-    pxor         m10, m10
+    SETZERO       m9
+    SETZERO      m10
     mova          m7, [buf2 + i + mmsize]
     mova          m8, [buf2 + i + mmsize + 16 * 4]
     lea          win, [windowq + i]
@@ -350,3 +371,11 @@  cglobal synth_filter_inner, 0, 6 + 4 * ARCH_X86_64, 7 + 6 * ARCH_X86_64, \
     sub            i, (ARCH_X86_64 + 1) * mmsize
     jge    .mainloop
     RET
+%endmacro
+
+%if ARCH_X86_32                   ; sse variant is assembled only when ARCH_X86_32 is set
+INIT_XMM sse
+SYNTH_FILTER
+%endif
+INIT_XMM sse2                     ; sse2 variant is assembled unconditionally
+SYNTH_FILTER
diff --git a/libavcodec/x86/dcadsp_init.c b/libavcodec/x86/dcadsp_init.c
index 3821892..f8dd9b1 100644
--- a/libavcodec/x86/dcadsp_init.c
+++ b/libavcodec/x86/dcadsp_init.c
@@ -56,29 +56,39 @@  av_cold void ff_dcadsp_init_x86(DCADSPContext *s)
     }
 }
 
-void ff_synth_filter_inner_sse2(float *synth_buf_ptr, float synth_buf2[32],
-                                const float window[512],
-                                float out[32], intptr_t offset, float scale);
+#define SYNTH_FILTER_FUNC(opt) /* prototype of asm kernel + its C wrapper */   \
+void ff_synth_filter_inner_##opt(float *synth_buf_ptr, float synth_buf2[32],   \
+                                 const float window[512],                      \
+                                 float out[32], intptr_t offset, float scale); \
+static void synth_filter_##opt(FFTContext *imdct,                              \
+                               float *synth_buf_ptr, int *synth_buf_offset,    \
+                               float synth_buf2[32], const float window[512],  \
+                               float out[32], const float in[32], float scale) \
+{                                                                              \
+    float *synth_buf= synth_buf_ptr + *synth_buf_offset;                       \
+    /* call the context's imdct_half hook on (synth_buf, in) */                \
+    imdct->imdct_half(imdct, synth_buf, in);                                   \
+    /* pass the buffers and current offset to the asm kernel */                \
+    ff_synth_filter_inner_##opt(synth_buf, synth_buf2, window,                 \
+                                out, *synth_buf_offset, scale);                \
+    /* subtract 32 from the offset and mask with 511 */                        \
+    *synth_buf_offset = (*synth_buf_offset - 32) & 511;                        \
+}                                                                              \
 
-static void synth_filter_sse2(FFTContext *imdct,
-                              float *synth_buf_ptr, int *synth_buf_offset,
-                              float synth_buf2[32], const float window[512],
-                              float out[32], const float in[32], float scale)
-{
-    float *synth_buf= synth_buf_ptr + *synth_buf_offset;
-
-    imdct->imdct_half(imdct, synth_buf, in);
-
-    ff_synth_filter_inner_sse2(synth_buf, synth_buf2, window,
-                               out, *synth_buf_offset, scale);
-
-    *synth_buf_offset = (*synth_buf_offset - 32) & 511;
-}
+#if ARCH_X86_32 /* sse wrapper is only defined when ARCH_X86_32 is nonzero */
+SYNTH_FILTER_FUNC(sse)
+#endif
+SYNTH_FILTER_FUNC(sse2) /* sse2 wrapper is always defined */
 
 av_cold void ff_synth_filter_init_x86(SynthFilterContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
+    if (EXTERNAL_SSE(cpu_flags)) {
+        s->synth_filter_float = synth_filter_sse;
+    }
+#endif
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->synth_filter_float = synth_filter_sse2;
     }