ac3enc: add float_to_fixed24() with x86-optimized versions to AC3DSPContext and use in scale_coefficients() for the floating-point AC-3 encoder.

Message ID 1300245816-19473-1-git-send-email-justin.ruggles@gmail.com
State Superseded
Headers show

Commit Message

Justin Ruggles March 16, 2011, 3:23 a.m.
---
 libavcodec/ac3dsp.c         |   21 ++++++++++-
 libavcodec/ac3dsp.h         |   17 ++++++++-
 libavcodec/ac3enc.c         |    2 +-
 libavcodec/ac3enc_float.c   |    5 +--
 libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
 libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
 6 files changed, 131 insertions(+), 9 deletions(-)

Comments

Justin Ruggles March 16, 2011, 3:35 a.m. | #1
On 03/15/2011 11:23 PM, Justin Ruggles wrote:

> ---
>  libavcodec/ac3dsp.c         |   21 ++++++++++-
>  libavcodec/ac3dsp.h         |   17 ++++++++-
>  libavcodec/ac3enc.c         |    2 +-
>  libavcodec/ac3enc_float.c   |    5 +--
>  libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
>  6 files changed, 131 insertions(+), 9 deletions(-)


benchmarks:

       Athlon64  Atom 330
--------------------------
current: 163252   674434
      C: 130315   462646
  3DNow:  51371     n/a
    SSE:  51344   102766
   SSE2:  51455    80588

-Justin
Mans Rullgard March 16, 2011, 3:38 a.m. | #2
Justin Ruggles <justin.ruggles@gmail.com> writes:

> ---
>  libavcodec/ac3dsp.c         |   21 ++++++++++-
>  libavcodec/ac3dsp.h         |   17 ++++++++-
>  libavcodec/ac3enc.c         |    2 +-
>  libavcodec/ac3enc_float.c   |    5 +--
>  libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
>  6 files changed, 131 insertions(+), 9 deletions(-)
>
>
> diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
> index 4d9db9b..4862031 100644
> --- a/libavcodec/ac3dsp.c
> +++ b/libavcodec/ac3dsp.c
> @@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
>      } while (len > 0);
>  }
>  
> -av_cold void ff_ac3dsp_init(AC3DSPContext *c)
> +static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
> +{
> +    const float scale = (float)(1 << 24);

That cast shouldn't be needed.

> +    do {
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        len -= 8;
> +    } while (len > 0);
> +}
> +
> +av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
>  {
>      c->ac3_exponent_min = ac3_exponent_min_c;
>      c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
>      c->ac3_lshift_int16 = ac3_lshift_int16_c;
>      c->ac3_rshift_int32 = ac3_rshift_int32_c;
> +    c->float_to_fixed24 = float_to_fixed24_c;
>  
>      if (HAVE_MMX)
> -        ff_ac3dsp_init_x86(c);
> +        ff_ac3dsp_init_x86(c, bit_exact);
>  }
> diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
> index 31a0af3..f197349 100644
> --- a/libavcodec/ac3dsp.h
> +++ b/libavcodec/ac3dsp.h
> @@ -68,9 +68,22 @@ typedef struct AC3DSPContext {
>       *               constraints: range [0,31]
>       */
>      void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
> +
> +    /**
> +     * Convert an array of float in range [-1.0,1.0] to int32_t with range
> +     * [-(1<<24),(1<<24)]
> +     *
> +     * @param dst destination array of int32_t.
> +     *            constraints: 16-byte aligned
> +     * @param src source array of float.
> +     *            constraints: 16-byte aligned
> +     * @param len number of elements to convert.
> +     *            constraints: multiple of 8 greater than zero
> +     */
> +    void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
>  } AC3DSPContext;
>  
> -void ff_ac3dsp_init    (AC3DSPContext *c);
> -void ff_ac3dsp_init_x86(AC3DSPContext *c);
> +void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
> +void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
>  
>  #endif /* AVCODEC_AC3DSP_H */
> diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
> index 4c01fe3..5b76ae6 100644
> --- a/libavcodec/ac3enc.c
> +++ b/libavcodec/ac3enc.c
> @@ -1843,7 +1843,7 @@ static av_cold int ac3_encode_init(AVCodecContext *avctx)
>      avctx->coded_frame= avcodec_alloc_frame();
>  
>      dsputil_init(&s->dsp, avctx);
> -    ff_ac3dsp_init(&s->ac3dsp);
> +    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
>  
>      return 0;
>  init_fail:
> diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
> index 8668b2e..4b13e4c 100644
> --- a/libavcodec/ac3enc_float.c
> +++ b/libavcodec/ac3enc_float.c
> @@ -103,9 +103,8 @@ static int normalize_samples(AC3EncodeContext *s)
>   */
>  static void scale_coefficients(AC3EncodeContext *s)
>  {
> -    int i;
> -    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
> -        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
> +    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
> +                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
>  }
>  

C parts look good otherwise.

> diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
> index e281791..1fe6fad 100644
> --- a/libavcodec/x86/ac3dsp.asm
> +++ b/libavcodec/x86/ac3dsp.asm

I'm too tired to attempt a review of this.
Vitor Sessak March 16, 2011, 6:49 p.m. | #3
On 03/16/2011 04:23 AM, Justin Ruggles wrote:
> ---
>   libavcodec/ac3dsp.c         |   21 ++++++++++-
>   libavcodec/ac3dsp.h         |   17 ++++++++-
>   libavcodec/ac3enc.c         |    2 +-
>   libavcodec/ac3enc_float.c   |    5 +--
>   libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
>   libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
>   6 files changed, 131 insertions(+), 9 deletions(-)

>
> diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
> index 4d9db9b..4862031 100644
> --- a/libavcodec/ac3dsp.c
> +++ b/libavcodec/ac3dsp.c
> @@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
>      } while (len > 0);
>  }
>
> -av_cold void ff_ac3dsp_init(AC3DSPContext *c)
> +static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
> +{
> +    const float scale = (float)(1 << 24);
> +    do {
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        *dst++ = lrintf(*src++ * scale);
> +        len -= 8;
> +    } while (len > 0);
> +}

Why not make the function more general by making scale a parameter (and 
changing the function name correspondingly)?

-Vitor
Mans Rullgard March 16, 2011, 6:54 p.m. | #4
Vitor Sessak <vitor1001@gmail.com> writes:

> On 03/16/2011 04:23 AM, Justin Ruggles wrote:
>> ---
>>   libavcodec/ac3dsp.c         |   21 ++++++++++-
>>   libavcodec/ac3dsp.h         |   17 ++++++++-
>>   libavcodec/ac3enc.c         |    2 +-
>>   libavcodec/ac3enc_float.c   |    5 +--
>>   libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
>>   libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
>>   6 files changed, 131 insertions(+), 9 deletions(-)
>
>>
>> diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
>> index 4d9db9b..4862031 100644
>> --- a/libavcodec/ac3dsp.c
>> +++ b/libavcodec/ac3dsp.c
>> @@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
>>      } while (len > 0);
>>  }
>>
>> -av_cold void ff_ac3dsp_init(AC3DSPContext *c)
>> +static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
>> +{
>> +    const float scale = (float)(1 << 24);
>> +    do {
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        *dst++ = lrintf(*src++ * scale);
>> +        len -= 8;
>> +    } while (len > 0);
>> +}
>
> Why not make the function more general by making scale a parameter
> (and changing the function name correspondingly)?

We want to be able to use the NEON float to fixed-point conversion
instructions which require a constant for the number of fractional bits.
Justin Ruggles March 16, 2011, 8:27 p.m. | #5
On 03/16/2011 02:54 PM, Måns Rullgård wrote:

> Vitor Sessak <vitor1001@gmail.com> writes:
> 
>> On 03/16/2011 04:23 AM, Justin Ruggles wrote:
>>> ---
>>>   libavcodec/ac3dsp.c         |   21 ++++++++++-
>>>   libavcodec/ac3dsp.h         |   17 ++++++++-
>>>   libavcodec/ac3enc.c         |    2 +-
>>>   libavcodec/ac3enc_float.c   |    5 +--
>>>   libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
>>>   libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
>>>   6 files changed, 131 insertions(+), 9 deletions(-)
>>
>>>
>>> diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
>>> index 4d9db9b..4862031 100644
>>> --- a/libavcodec/ac3dsp.c
>>> +++ b/libavcodec/ac3dsp.c
>>> @@ -85,13 +85,30 @@ static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
>>>      } while (len > 0);
>>>  }
>>>
>>> -av_cold void ff_ac3dsp_init(AC3DSPContext *c)
>>> +static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
>>> +{
>>> +    const float scale = (float)(1 << 24);
>>> +    do {
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        *dst++ = lrintf(*src++ * scale);
>>> +        len -= 8;
>>> +    } while (len > 0);
>>> +}
>>
>> Why not make the function more general by making scale a parameter
>> (and changing the function name correspondingly)?
> 
> We want to be able to use the NEON float to fixed-point conversion
> instructions which require a constant for the number of fractional bits.


If we need a general function we can always add that separately in
FmtConvertContext.

-Justin
Ronald Bultje March 17, 2011, 3:17 p.m. | #6
Hi,

On Tue, Mar 15, 2011 at 11:23 PM, Justin Ruggles
<justin.ruggles@gmail.com> wrote:
> ---
>  libavcodec/ac3dsp.c         |   21 ++++++++++-
>  libavcodec/ac3dsp.h         |   17 ++++++++-
>  libavcodec/ac3enc.c         |    2 +-
>  libavcodec/ac3enc_float.c   |    5 +--
>  libavcodec/x86/ac3dsp.asm   |   80 +++++++++++++++++++++++++++++++++++++++++++
>  libavcodec/x86/ac3dsp_mmx.c |   15 +++++++-
>  6 files changed, 131 insertions(+), 9 deletions(-)
[..]
> +    mov      r3d, 0x4B800000 ; 16777216.0f
> +    movd      m0, r3d
> +    punpckldq m0, m0

Minor nit: I tend to just splat the data in memory and use movq/movdqa
directly, which saves a register. Not important because it's outside the
loop.

> +cglobal float_to_fixed24_sse2, 4,4,5, dst, src, len
[..]
> +.loop:
[..]
> +    add    srcq, 64
> +    add    dstq, 64
> +    sub    lenq, 16
> +    ja .loop

I assume you've tested various unroll versions to see which is best
here - or, a different question: what is a typical value of len in this
function? The actual asm in the loops looks great, nice work.

Ronald

Patch

diff --git a/libavcodec/ac3dsp.c b/libavcodec/ac3dsp.c
index 4d9db9b..4862031 100644
--- a/libavcodec/ac3dsp.c
+++ b/libavcodec/ac3dsp.c
@@ -85,13 +85,30 @@  static void ac3_rshift_int32_c(int32_t *src, unsigned int len,
     } while (len > 0);
 }
 
-av_cold void ff_ac3dsp_init(AC3DSPContext *c)
+static void float_to_fixed24_c(int32_t *dst, const float *src, unsigned int len)
+{
+    const float scale = (float)(1 << 24);
+    do {
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        *dst++ = lrintf(*src++ * scale);
+        len -= 8;
+    } while (len > 0);
+}
+
+av_cold void ff_ac3dsp_init(AC3DSPContext *c, int bit_exact)
 {
     c->ac3_exponent_min = ac3_exponent_min_c;
     c->ac3_max_msb_abs_int16 = ac3_max_msb_abs_int16_c;
     c->ac3_lshift_int16 = ac3_lshift_int16_c;
     c->ac3_rshift_int32 = ac3_rshift_int32_c;
+    c->float_to_fixed24 = float_to_fixed24_c;
 
     if (HAVE_MMX)
-        ff_ac3dsp_init_x86(c);
+        ff_ac3dsp_init_x86(c, bit_exact);
 }
diff --git a/libavcodec/ac3dsp.h b/libavcodec/ac3dsp.h
index 31a0af3..f197349 100644
--- a/libavcodec/ac3dsp.h
+++ b/libavcodec/ac3dsp.h
@@ -68,9 +68,22 @@  typedef struct AC3DSPContext {
      *               constraints: range [0,31]
      */
     void (*ac3_rshift_int32)(int32_t *src, unsigned int len, unsigned int shift);
+
+    /**
+     * Convert an array of float in range [-1.0,1.0] to int32_t with range
+     * [-(1<<24),(1<<24)]
+     *
+     * @param dst destination array of int32_t.
+     *            constraints: 16-byte aligned
+     * @param src source array of float.
+     *            constraints: 16-byte aligned
+     * @param len number of elements to convert.
+     *            constraints: multiple of 8 greater than zero
+     */
+    void (*float_to_fixed24)(int32_t *dst, const float *src, unsigned int len);
 } AC3DSPContext;
 
-void ff_ac3dsp_init    (AC3DSPContext *c);
-void ff_ac3dsp_init_x86(AC3DSPContext *c);
+void ff_ac3dsp_init    (AC3DSPContext *c, int bit_exact);
+void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact);
 
 #endif /* AVCODEC_AC3DSP_H */
diff --git a/libavcodec/ac3enc.c b/libavcodec/ac3enc.c
index 4c01fe3..5b76ae6 100644
--- a/libavcodec/ac3enc.c
+++ b/libavcodec/ac3enc.c
@@ -1843,7 +1843,7 @@  static av_cold int ac3_encode_init(AVCodecContext *avctx)
     avctx->coded_frame= avcodec_alloc_frame();
 
     dsputil_init(&s->dsp, avctx);
-    ff_ac3dsp_init(&s->ac3dsp);
+    ff_ac3dsp_init(&s->ac3dsp, avctx->flags & CODEC_FLAG_BITEXACT);
 
     return 0;
 init_fail:
diff --git a/libavcodec/ac3enc_float.c b/libavcodec/ac3enc_float.c
index 8668b2e..4b13e4c 100644
--- a/libavcodec/ac3enc_float.c
+++ b/libavcodec/ac3enc_float.c
@@ -103,9 +103,8 @@  static int normalize_samples(AC3EncodeContext *s)
  */
 static void scale_coefficients(AC3EncodeContext *s)
 {
-    int i;
-    for (i = 0; i < AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels; i++)
-        s->fixed_coef_buffer[i] = SCALE_FLOAT(s->mdct_coef_buffer[i], 24);
+    s->ac3dsp.float_to_fixed24(s->fixed_coef_buffer, s->mdct_coef_buffer,
+                               AC3_MAX_COEFS * AC3_MAX_BLOCKS * s->channels);
 }
 
 
diff --git a/libavcodec/x86/ac3dsp.asm b/libavcodec/x86/ac3dsp.asm
index e281791..1fe6fad 100644
--- a/libavcodec/x86/ac3dsp.asm
+++ b/libavcodec/x86/ac3dsp.asm
@@ -178,3 +178,83 @@  INIT_MMX
 AC3_SHIFT r, 32, psrad, mmx
 INIT_XMM
 AC3_SHIFT r, 32, psrad, sse2
+
+;-----------------------------------------------------------------------------
+; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
+;-----------------------------------------------------------------------------
+
+; The 3DNow! version is not bit-identical because pf2id uses truncation rather
+; than round-to-nearest.
+INIT_MMX
+cglobal float_to_fixed24_3dnow, 3,4,0, dst, src, len
+    mov      r3d, 0x4B800000 ; 16777216.0f
+    movd      m0, r3d
+    punpckldq m0, m0
+.loop:
+    movq      m1, [srcq  ]
+    movq      m2, [srcq+8]
+    pfmul     m1, m0
+    pfmul     m2, m0
+    pf2id     m1, m1
+    pf2id     m2, m2
+    movq [dstq  ], m1
+    movq [dstq+8], m2
+    add     srcq, 16
+    add     dstq, 16
+    sub     lend, 4
+    ja .loop
+    REP_RET
+
+INIT_XMM
+cglobal float_to_fixed24_sse, 4,4,3, dst, src, len
+    mov      r3d, 0x4B800000 ; 16777216.0f
+    movd      m0, r3d
+    shufps    m0, m0, 0
+.loop:
+    movaps    m1, [srcq   ]
+    movaps    m2, [srcq+16]
+    mulps     m1, m0
+    mulps     m2, m0
+    cvtps2pi mm0, m1
+    movhlps   m1, m1
+    cvtps2pi mm1, m1
+    cvtps2pi mm2, m2
+    movhlps   m2, m2
+    cvtps2pi mm3, m2
+    movq  [dstq   ], mm0
+    movq  [dstq+ 8], mm1
+    movq  [dstq+16], mm2
+    movq  [dstq+24], mm3
+    add     srcq, 32
+    add     dstq, 32
+    sub     lend, 8
+    ja .loop
+    REP_RET
+
+INIT_XMM
+cglobal float_to_fixed24_sse2, 4,4,5, dst, src, len
+    mov      r3d, 0x4B800000 ; 16777216.0f
+    movd      m0, r3d
+    shufps    m0, m0, 0
+.loop:
+    movaps   m1, [srcq   ]
+    movaps   m2, [srcq+16]
+    movaps   m3, [srcq+32]
+    movaps   m4, [srcq+48]
+    mulps    m1, m0
+    mulps    m2, m0
+    mulps    m3, m0
+    mulps    m4, m0
+    cvtps2dq m1, m1
+    cvtps2dq m2, m2
+    cvtps2dq m3, m3
+    cvtps2dq m4, m4
+    movaps [dstq   ], m1
+    movaps [dstq+16], m2
+    movaps [dstq+32], m3
+    movaps [dstq+48], m4
+    add    srcq, 64
+    add    dstq, 64
+    sub    lenq, 16
+    ja .loop
+    REP_RET
diff --git a/libavcodec/x86/ac3dsp_mmx.c b/libavcodec/x86/ac3dsp_mmx.c
index 835b106..97d0657 100644
--- a/libavcodec/x86/ac3dsp_mmx.c
+++ b/libavcodec/x86/ac3dsp_mmx.c
@@ -38,7 +38,11 @@  extern void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned in
 extern void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
 extern void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
 
-av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
+extern void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse  (int32_t *dst, const float *src, unsigned int len);
+extern void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
+
+av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
 {
     int mm_flags = av_get_cpu_flags();
 
@@ -49,13 +53,22 @@  av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c)
         c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
         c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
     }
+    if (mm_flags & AV_CPU_FLAG_3DNOW && HAVE_AMD3DNOW) {
+        if (!bit_exact) {
+            c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
+        }
+    }
     if (mm_flags & AV_CPU_FLAG_MMX2 && HAVE_MMX2) {
         c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
     }
+    if (mm_flags & AV_CPU_FLAG_SSE && HAVE_SSE) {
+        c->float_to_fixed24 = ff_float_to_fixed24_sse;
+    }
     if (mm_flags & AV_CPU_FLAG_SSE2 && HAVE_SSE) {
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
+        c->float_to_fixed24 = ff_float_to_fixed24_sse2;
         if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
             c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
             c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;