ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder.

Message ID 1300393949-18949-1-git-send-email-justin.ruggles@gmail.com
State Committed
Commit 0f999cfddb0746602288eabddf38679fd25a2ff7
Headers show

Commit Message

Justin Ruggles March 17, 2011, 8:32 p.m.
---
Changed to put 1<<24 float x 4 in .rodata

I had previously tested unrolling more, but there have been some
changes in the code since then.  So I tested again.  3DNow and
SSE2 versions benefit from more unrolling, but SSE version does
not.

 libavcodec/ac3dsp.c         |   21 +++++++-
 libavcodec/ac3dsp.h         |   17 ++++++-
 libavcodec/ac3enc.c         |    2 +-
 libavcodec/ac3enc_float.c   |    5 +-
 libavcodec/x86/ac3dsp.asm   |  115 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_mmx.c |   15 +++++-
 6 files changed, 166 insertions(+), 9 deletions(-)

Comments

Ronald Bultje March 17, 2011, 8:34 p.m. | #1
Hi,

On Thu, Mar 17, 2011 at 4:32 PM, Justin Ruggles
<justin.ruggles@gmail.com> wrote:
> ---
> Changed to put 1<<24 float x 4 in .rodata
>
> I had previously tested unrolling more, but there have been some
> changes in the code since then.  So I tested again.  3DNow and
> SSE2 versions benefit from more unrolling, but SSE version does
> not.
>
>  libavcodec/ac3dsp.c         |   21 +++++++-
>  libavcodec/ac3dsp.h         |   17 ++++++-
>  libavcodec/ac3enc.c         |    2 +-
>  libavcodec/ac3enc_float.c   |    5 +-
>  libavcodec/x86/ac3dsp.asm   |  115 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/ac3dsp_mmx.c |   15 +++++-
>  6 files changed, 166 insertions(+), 9 deletions(-)

ASM looks good to me. C code also looks fine, and Mans already
reviewed that sufficiently.

Ronald

Patch

diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 4d9db9b..9bfa730 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -85,13 +85,30 @@  static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
     } while (len > 0);
 }
 
-av_cold void ff_ac3dsp_init(AC3DSPContext *c)
+static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
+{
+    const float scale = 1 << 24;         /* 2^24: multiplier applied to every input sample */
+    do {                                 /* manually unrolled: 8 conversions per pass */
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        len -= 8;                        /* len is unsigned: it must be a non-zero multiple of 8 */
+    } while (len > 0);                   /* otherwise the subtraction wraps and the loop overruns */
+}
+
+av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
 {
     c->ac3_exponent_min = ac3_exponent_min_c;
     c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
     c->ac3_lshift_int16 = ac3_lshift_int16_c;
     c->ac3_rshift_int32 = ac3_rshift_int32_c;
+    c->float_to_fixed24 = float_to_fixed24_c;
 
     if (HAVE_MMX)
-        ff_ac3dsp_init_x86(c);
+        ff_ac3dsp_init_x86(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 31a0af3..0a2dedf 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -68,9 +68,22 @@  typedef struct AC3DSPContext {
      *               constraints: range [0,31]
      */
     void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
+
+    /**
+     * Convert an array of float in range [-1.0,1.0] to int32_t with range
+     * [-(1<<24),(1<<24)]
+     *
+     * @param dst destination array of int32_t.
+     *            constraints: 16-byte aligned
+     * @param src source array of float.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 32 greater than zero
+     */
+    void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
 } AC3DSPContext;
 
-void ff_ac3dsp_init    (AC3DSPContext *c);
-void ff_ac3dsp_init_x86(AC3DSPContext *c);
+void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 4c01fe3..5b76ae6 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -1843,7 +1843,7 @@  static av_cold int ac3_encode_init(AVCodecContext *avctx)
     avctx->coded_frame= avcodec_alloc_frame();
 
     dsputil_init(&s->dsp, avctx);
-    ff_ac3dsp_init(&s->ac3dsp);
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     return 0;
 init_fail:
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 8668b2e..4b13e4c 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -103,9 +103,8 @@  static int normalize_samples(AC3EncodeContext *s)
  */
 static void scale_coefficients(AC3EncodeContext *s)
 {
-    int i;
-    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
-        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
+    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
+                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
 }
 
 
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index e281791..8b7e826 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -22,6 +22,11 @@ 
 %include "x86inc.asm"
 %include "x86util.asm"
 
+SECTION_RODATA
+
+; 16777216.0f - used in ff_float_to_fixed24()
+pf_1_24: times 4 dd 0x4B800000
+
 SECTION .text
 
 ;-----------------------------------------------------------------------------
@@ -178,3 +183,113 @@  INIT_MMX
 AC3_SHIFT r, 32, psrad, mmx
 INIT_XMM
 AC3_SHIFT r, 32, psrad, sse2
+
+;-----------------------------------------------------------------------------
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; The 3DNow! version is not bit-identical because pf2id uses truncation rather
+; than round-to-nearest; ff_ac3dsp_init_x86() only selects it when !bit_exact.
+INIT_MMX
+cglobal float_to_fixed24_3dnow, 3,3,0, dst, src, len
+    movq   m0, [pf_1_24]        ; m0 = two copies of the scale factor 16777216.0f
+.loop:
+    movq   m1, [srcq   ]        ; load 8 input floats, two per MMX register
+    movq   m2, [srcq+8 ]
+    movq   m3, [srcq+16]
+    movq   m4, [srcq+24]
+    pfmul  m1, m0               ; multiply each float by the scale factor
+    pfmul  m2, m0
+    pfmul  m3, m0
+    pfmul  m4, m0
+    pf2id  m1, m1               ; float -> int32 (truncating, see comment above)
+    pf2id  m2, m2
+    pf2id  m3, m3
+    pf2id  m4, m4
+    movq  [dstq   ], m1         ; store the 8 converted int32 values
+    movq  [dstq+8 ], m2
+    movq  [dstq+16], m3
+    movq  [dstq+24], m4
+    add  srcq, 32               ; 8 elements * 4 bytes consumed from src
+    add  dstq, 32               ; 8 elements * 4 bytes written to dst
+    sub  lend, 8                ; 8 elements handled per iteration
+    ja .loop                    ; unsigned test: repeat while the count stayed above zero
+    REP_RET                     ; NOTE(review): MMX regs were written and no femms/emms is issued here - confirm callers reset FPU state
+
+INIT_XMM
+cglobal float_to_fixed24_sse, 3,3,3, dst, src, len
+    movaps     m0, [pf_1_24]    ; m0 = four copies of the scale factor 16777216.0f
+.loop:
+    movaps     m1, [srcq   ]    ; load 8 input floats (aligned loads)
+    movaps     m2, [srcq+16]
+    mulps      m1, m0           ; multiply each float by the scale factor
+    mulps      m2, m0
+    cvtps2pi  mm0, m1           ; convert the low two floats of m1 to int32 in an MMX register
+    movhlps    m1, m1           ; bring the high two floats of m1 down to the low half
+    cvtps2pi  mm1, m1           ; ...and convert those as well
+    cvtps2pi  mm2, m2           ; same low/high sequence for m2
+    movhlps    m2, m2
+    cvtps2pi  mm3, m2
+    movq  [dstq   ], mm0        ; store the 8 converted int32 values
+    movq  [dstq+ 8], mm1
+    movq  [dstq+16], mm2
+    movq  [dstq+24], mm3
+    add      srcq, 32           ; 8 elements * 4 bytes consumed from src
+    add      dstq, 32           ; 8 elements * 4 bytes written to dst
+    sub      lend, 8            ; 8 elements handled per iteration
+    ja .loop                    ; unsigned test: repeat while the count stayed above zero
+    REP_RET                     ; NOTE(review): cvtps2pi wrote mm0-mm3 and no emms is issued here - confirm callers reset FPU state
+
+INIT_XMM
+cglobal float_to_fixed24_sse2, 3,3,9, dst, src, len
+    movaps     m0, [pf_1_24]    ; m0 = four copies of the scale factor 16777216.0f
+.loop:
+    movaps     m1, [srcq    ]   ; load 16 input floats (aligned loads)
+    movaps     m2, [srcq+16 ]
+    movaps     m3, [srcq+32 ]
+    movaps     m4, [srcq+48 ]
+%ifdef m8                      ; m8 exists: unroll to 32 floats per iteration
+    movaps     m5, [srcq+64 ]
+    movaps     m6, [srcq+80 ]
+    movaps     m7, [srcq+96 ]
+    movaps     m8, [srcq+112]
+%endif
+    mulps      m1, m0           ; multiply each float by the scale factor
+    mulps      m2, m0
+    mulps      m3, m0
+    mulps      m4, m0
+%ifdef m8
+    mulps      m5, m0
+    mulps      m6, m0
+    mulps      m7, m0
+    mulps      m8, m0
+%endif
+    cvtps2dq   m1, m1           ; packed float -> int32 conversion, four lanes at once
+    cvtps2dq   m2, m2
+    cvtps2dq   m3, m3
+    cvtps2dq   m4, m4
+%ifdef m8
+    cvtps2dq   m5, m5
+    cvtps2dq   m6, m6
+    cvtps2dq   m7, m7
+    cvtps2dq   m8, m8
+%endif
+    movdqa  [dstq    ], m1      ; aligned stores of the converted int32 values
+    movdqa  [dstq+16 ], m2
+    movdqa  [dstq+32 ], m3
+    movdqa  [dstq+48 ], m4
+%ifdef m8
+    movdqa  [dstq+64 ], m5
+    movdqa  [dstq+80 ], m6
+    movdqa  [dstq+96 ], m7
+    movdqa  [dstq+112], m8
+    add      srcq, 128          ; 32 elements * 4 bytes
+    add      dstq, 128
+    sub      lend, 32           ; len is a 32-bit unsigned int: count in lend, the upper half of lenq is not guaranteed to be zero
+%else
+    add      srcq, 64           ; 16 elements * 4 bytes
+    add      dstq, 64
+    sub      lend, 16           ; same 32-bit counter as the 3DNow!/SSE versions
+%endif
+    ja .loop                    ; unsigned test: repeat while the count stayed above zero
+    REP_RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 835b106..97d0657 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -38,7 +38,11 @@  extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in
 extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
 extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
 
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
 {
     int mm_flags = av_get_cpu_flags();
 
@@ -49,13 +53,22 @@  av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
         c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
     }
+    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+        if (!bit_exact) {
+            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+        }
+    }
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+        c->float_to_fixed24 = ff_float_to_fixed24_sse;
+    }
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
         if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
             c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
             c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;