[2/2] dca: change the core to work with integer coefficients.

Message ID 1449146710-12408-1-git-send-email-alexandra@khirnov.net
State Superseded
Headers show

Commit Message

Alexandra Hájková Dec. 3, 2015, 12:45 p.m.
The dca core decoder converts integer coefficients read from the
bitstream to floats just after reading them (along with dequantization).
All the other steps of the audio reconstruction are done with floats
which makes the output for the DTS lossless extension (XLL)
actually lossy.
This patch changes the dca core to work with integer coefficients
till QMF. At this point the integer coefficients are transformed to floats.
The coefficients for the LFE channel (lfe_data) are not touched.
This is the first step for the really lossless XLL decoding.
---

the output channel waveforms were compared in Audacity with the waveforms of the
"before this patch" state and were considered the same

 libavcodec/dca.h    |   6 +--
 libavcodec/dcadec.c | 117 ++++++++++++++++++++++++++++++++++------------------
 2 files changed, 79 insertions(+), 44 deletions(-)

Comments

Janne Grunau Dec. 9, 2015, 9:20 p.m. | #1
On 2015-12-03 13:45:09 +0100, Alexandra Hájková wrote:
> The dca core decoder converts integer coefficients read from the
> bitstream to floats just after reading them (along with dequantization).
> All the other steps of the audio reconstruction are done with floats
> which makes the output for the DTS lossless extension (XLL)
> actually lossy.
> This patch changes the dca core to work with integer coefficients
> till QMF. At this point the integer coefficients are transformed to floats.
> The coefficients for the LFE channel (lfe_data) are not touched.
> This is the first step for the really lossless XLL decoding.
> ---
> 
> the output channel waveforms were compared in Audacity with the waveforms of the
> "before this patch" state and were considered the same
> 
>  libavcodec/dca.h    |   6 +--
>  libavcodec/dcadec.c | 117 ++++++++++++++++++++++++++++++++++------------------
>  2 files changed, 79 insertions(+), 44 deletions(-)
> 
> diff --git a/libavcodec/dca.h b/libavcodec/dca.h
> index 6548d75..9947878 100644
> --- a/libavcodec/dca.h
> +++ b/libavcodec/dca.h
> @@ -139,7 +139,7 @@ typedef struct DCAAudioHeader {
>      int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code book
>      int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX];    ///< bit allocation quantizer select
>      int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///< quantization index codebook select
> -    float scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];   ///< scale factor adjustment
> +    int scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];     ///< scale factor adjustment
>  
>      int subframes;              ///< number of subframes
>      int total_channels;         ///< number of channels including extensions
> @@ -147,10 +147,10 @@ typedef struct DCAAudioHeader {
>  } DCAAudioHeader;
>  
>  typedef struct DCAChan {
> -    DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
> +    DECLARE_ALIGNED(32, int, subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
>  
>      /* Subband samples history (for ADPCM) */
> -    DECLARE_ALIGNED(16, float, subband_samples_hist)[DCA_SUBBANDS][4];
> +    DECLARE_ALIGNED(16, int, subband_samples_hist)[DCA_SUBBANDS][4];

these two should probably be int32_t instead of int to match the code

>      int hist_index;
>  
>      /* Half size is sufficient for core decoding, but for 96 kHz data
> diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
> index 7e94638..80da622 100644
> --- a/libavcodec/dcadec.c
> +++ b/libavcodec/dcadec.c
> @@ -44,6 +44,7 @@
>  #include "dcadata.h"
>  #include "dcadsp.h"
>  #include "dcahuff.h"
> +#include "dcamath.h"
>  #include "fft.h"
>  #include "fmtconvert.h"
>  #include "get_bits.h"
> @@ -225,7 +226,7 @@ static inline void get_array(GetBitContext *gb, int *dst, int len, int bits)
>  static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
>  {
>      int i, j;
> -    static const float adj_table[4] = { 1.0, 1.1250, 1.2500, 1.4375 };
> +    static const int adj_table[4] = { 16, 18, 20, 23 };
>      static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
>      static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
>  
> @@ -785,14 +786,26 @@ static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
>  static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
>  static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
>  

This is probably also a candidate for SIMD optimizations and should go
in dcadsp.c

> +static void dequantize(int *samples, int step_size, int scale) {
> +    int64_t step = (int64_t)step_size * scale;
> +    int shift, i;
> +    int32_t step_scale;
> +
> +    if (step > (1 << 23))
> +        shift = av_log2(step >> 23) + 1;
> +    else
> +        shift = 0;
> +    step_scale = (int32_t)(step >> shift);
> +
> +    for (i = 0; i < SAMPLES_PER_SUBBAND; i++)
> +        samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale, 22 - shift));
> +}
> +
>  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>  {
>      int k, l;
>      int subsubframe = s->current_subsubframe;
> -
> -    const float *quant_step_table;
> -
> -    LOCAL_ALIGNED_16(int32_t, block, [SAMPLES_PER_SUBBAND * DCA_SUBBANDS]);
> +    const int *quant_step_table;
>  
>      /*
>       * Audio data
> @@ -800,13 +813,13 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>  
>      /* Select quantization step size table */
>      if (s->bit_rate_index == 0x1f)
> -        quant_step_table = ff_dca_lossless_quant_d;
> +        quant_step_table = ff_dca_lossless_quant;
>      else
> -        quant_step_table = ff_dca_lossy_quant_d;
> +        quant_step_table = ff_dca_lossy_quant;
>  
>      for (k = base_channel; k < s->audio_header.prim_channels; k++) {
> -        float (*subband_samples)[8] = s->dca_chan[k].subband_samples[block_index];
> -        float rscale[DCA_SUBBANDS];
> +        int (*subband_samples)[8] = s->dca_chan[k].subband_samples[block_index];
> +        int64_t rscale[DCA_SUBBANDS];
>  
>          if (get_bits_left(&s->gb) < 0)
>              return AVERROR_INVALIDDATA;
> @@ -817,7 +830,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>              /* Select the mid-tread linear quantizer */
>              int abits = s->dca_chan[k].bitalloc[l];
>  
> -            float quant_step_size = quant_step_table[abits];
> +            int quant_step_size = quant_step_table[abits];
>  
>              /*
>               * Determine quantization index code book and its type
> @@ -831,12 +844,13 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>               */
>              if (!abits) {
>                  rscale[l] = 0;
> -                memset(block + SAMPLES_PER_SUBBAND * l, 0, SAMPLES_PER_SUBBAND * sizeof(block[0]));
> +                memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
> +                       sizeof(subband_samples[l][0]));
>              } else {
>                  /* Deal with transients */
>                  int sfi = s->dca_chan[k].transition_mode[l] &&
>                      subsubframe >= s->dca_chan[k].transition_mode[l];
> -                rscale[l] = quant_step_size * s->dca_chan[k].scale_factor[l][sfi] *
> +                rscale[l] = s->dca_chan[k].scale_factor[l][sfi] *
>                              s->audio_header.scalefactor_adj[k][sel];
>  
>                  if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) {
> @@ -850,7 +864,7 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>                          block_code1 = get_bits(&s->gb, size);
>                          block_code2 = get_bits(&s->gb, size);
>                          err         = decode_blockcodes(block_code1, block_code2,
> -                                                        levels, block + SAMPLES_PER_SUBBAND * l);
> +                                                        levels, subband_samples[l]);
>                          if (err) {
>                              av_log(s->avctx, AV_LOG_ERROR,
>                                     "ERROR: block code look-up failed\n");
> @@ -859,20 +873,18 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>                      } else {
>                          /* no coding */
>                          for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> -                            block[SAMPLES_PER_SUBBAND * l + m] = get_sbits(&s->gb, abits - 3);
> +                            subband_samples[l][m] = get_sbits(&s->gb, abits - 3);
>                      }
>                  } else {
>                      /* Huffman coded */
>                      for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
> -                        block[SAMPLES_PER_SUBBAND * l + m] = get_bitalloc(&s->gb,
> -                                                        &dca_smpl_bitalloc[abits], sel);
> +                        subband_samples[l][m] = get_bitalloc(&s->gb,
> +                                                             &dca_smpl_bitalloc[abits], sel);
>                  }
>              }
> +            dequantize(subband_samples[l], quant_step_size, rscale[l]);
>          }
>  
> -        s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, subband_samples[0],
> -                                               block, rscale, SAMPLES_PER_SUBBAND * s->audio_header.vq_start_subband[k]);
> -
>          for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
>              int m;
>              /*
> @@ -882,25 +894,25 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>                  int n;
>                  if (s->predictor_history)
>                      subband_samples[l][0] += (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> -                                                 s->dca_chan[k].subband_samples_hist[l][3] +
> -                                                 ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> -                                                 s->dca_chan[k].subband_samples_hist[l][2] +
> -                                                 ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> -                                                 s->dca_chan[k].subband_samples_hist[l][1] +
> -                                                 ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> -                                                 s->dca_chan[k].subband_samples_hist[l][0]) *
> -                                                (1.0f / 8192);
> +                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
> +                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
> +                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
> +                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
> +                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
> +                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
> +                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
> +                                              (1 << 12) >> 13;
>                  for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
> -                    float sum = ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> -                                subband_samples[l][m - 1];
> +                    int64_t sum = ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
> +                                  (int64_t)subband_samples[l][m - 1];
>                      for (n = 2; n <= 4; n++)
>                          if (m >= n)
>                              sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> -                                   subband_samples[l][m - n];
> +                                   (int64_t)subband_samples[l][m - n];
>                          else if (s->predictor_history)
>                              sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
> -                                   s->dca_chan[k].subband_samples_hist[l][m - n + 4];
> -                    subband_samples[l][m] += sum * 1.0f / 8192;
> +                                   (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
> +                    subband_samples[l][m] += (int)(sum + (1 << 12) >> 13);
>                  }
>              }
>  
> @@ -914,17 +926,22 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>           * Decode VQ encoded high frequencies
>           */
>          if (s->audio_header.subband_activity[k] > s->audio_header.vq_start_subband[k]) {
> +            int i, j;
> +
>              if (!s->debug_flag & 0x01) {
>                  av_log(s->avctx, AV_LOG_DEBUG,
>                         "Stream with high frequencies VQ coding\n");
>                  s->debug_flag |= 0x01;
>              }
>  
> -            s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
> -                                ff_dca_high_freq_vq, subsubframe * SAMPLES_PER_SUBBAND,
> -                                s->dca_chan[k].scale_factor,
> -                                s->audio_header.vq_start_subband[k],
> -                                s->audio_header.subband_activity[k]);
> +            // this should be SIMDified

please move it to dcadsp.c and call it through a function pointer then

> +            for (j = s->audio_header.vq_start_subband[k]; j < s->audio_header.subband_activity[k]; j++) {
> +                /* 1 vector -> 32 samples but we only need the 8 samples
> +                 * for this subsubframe. */
> +                const int8_t *ptr = &ff_dca_high_freq_vq[s->dca_chan[k].high_freq_vq[j]][subsubframe * SAMPLES_PER_SUBBAND];
> +                for (i = 0; i < 8; i++)
> +                    subband_samples[j][i] = ptr[i] * s->dca_chan[k].scale_factor[j][0] + 8 >> 4;
> +            }
>          }
>      }
>  
> @@ -942,8 +959,14 @@ static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
>  static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
>  {
>      int k;
> +    float param[DCA_SUBBANDS];
> +
> +    for (k = 0; k < DCA_SUBBANDS; k++)
> +        param[k] = 1;

this is a little pointless, we don't seem to have SIMD code for int32_t 
to float conversion but that is no good reason to multiply by 1.0

>  
>      if (upsample) {
> +        LOCAL_ALIGNED_16(float, samples, [64], [SAMPLES_PER_SUBBAND]);
> +
>          if (!s->qmf64_table) {
>              s->qmf64_table = qmf64_precompute();
>              if (!s->qmf64_table)
> @@ -952,21 +975,33 @@ static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
>  
>          /* 64 subbands QMF */
>          for (k = 0; k < s->audio_header.prim_channels; k++) {
> -            float (*subband_samples)[SAMPLES_PER_SUBBAND] = s->dca_chan[k].subband_samples[block_index];
> +            int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> +                s->dca_chan[k].subband_samples[block_index];
> +
> +            s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, samples[0],
> +                                                   subband_samples[0], param,
> +                                                   64 * SAMPLES_PER_SUBBAND);

int32_to_float_fmul_scalar can be used instead if the factors in param
are all identical; better yet, add int32_to_float to fmtconvert.c so that
we can avoid the pointless multiplication.

>              if (s->channel_order_tab[k] >= 0)
> -                qmf_64_subbands(s, k, subband_samples,
> +                qmf_64_subbands(s, k, samples,
>                                  s->samples_chanptr[s->channel_order_tab[k]],
>                                  /* Upsampling needs a factor 2 here. */
>                                  M_SQRT2 / 32768.0);
>          }
>      } else {
>          /* 32 subbands QMF */
> +        LOCAL_ALIGNED_16(float, samples, [32], [SAMPLES_PER_SUBBAND]);
> +
>          for (k = 0; k < s->audio_header.prim_channels; k++) {
> -            float (*subband_samples)[SAMPLES_PER_SUBBAND] = s->dca_chan[k].subband_samples[block_index];
> +            int (*subband_samples)[SAMPLES_PER_SUBBAND] =
> +                s->dca_chan[k].subband_samples[block_index];
> +
> +            s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, samples[0],
> +                                                   subband_samples[0], param,
> +                                                   32 * SAMPLES_PER_SUBBAND);

same here

>  
>              if (s->channel_order_tab[k] >= 0)
> -                qmf_32_subbands(s, k, subband_samples,
> +                qmf_32_subbands(s, k, samples,
>                                  s->samples_chanptr[s->channel_order_tab[k]],
>                                  M_SQRT1_2 / 32768.0);
>          }

A 20% slowdown in decoding performance looks high for this change.  Did
you run the tests on a 32-bit or 64-bit system?

Janne

Patch

diff --git a/libavcodec/dca.h b/libavcodec/dca.h
index 6548d75..9947878 100644
--- a/libavcodec/dca.h
+++ b/libavcodec/dca.h
@@ -139,7 +139,7 @@  typedef struct DCAAudioHeader {
     int scalefactor_huffman[DCA_PRIM_CHANNELS_MAX]; ///< scale factor code book
     int bitalloc_huffman[DCA_PRIM_CHANNELS_MAX];    ///< bit allocation quantizer select
     int quant_index_huffman[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX]; ///< quantization index codebook select
-    float scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];   ///< scale factor adjustment
+    int scalefactor_adj[DCA_PRIM_CHANNELS_MAX][DCA_ABITS_MAX];     ///< scale factor adjustment
 
     int subframes;              ///< number of subframes
     int total_channels;         ///< number of channels including extensions
@@ -147,10 +147,10 @@  typedef struct DCAAudioHeader {
 } DCAAudioHeader;
 
 typedef struct DCAChan {
-    DECLARE_ALIGNED(32, float, subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
+    DECLARE_ALIGNED(32, int, subband_samples)[DCA_BLOCKS_MAX][DCA_SUBBANDS][8];
 
     /* Subband samples history (for ADPCM) */
-    DECLARE_ALIGNED(16, float, subband_samples_hist)[DCA_SUBBANDS][4];
+    DECLARE_ALIGNED(16, int, subband_samples_hist)[DCA_SUBBANDS][4];
     int hist_index;
 
     /* Half size is sufficient for core decoding, but for 96 kHz data
diff --git a/libavcodec/dcadec.c b/libavcodec/dcadec.c
index 7e94638..80da622 100644
--- a/libavcodec/dcadec.c
+++ b/libavcodec/dcadec.c
@@ -44,6 +44,7 @@ 
 #include "dcadata.h"
 #include "dcadsp.h"
 #include "dcahuff.h"
+#include "dcamath.h"
 #include "fft.h"
 #include "fmtconvert.h"
 #include "get_bits.h"
@@ -225,7 +226,7 @@  static inline void get_array(GetBitContext *gb, int *dst, int len, int bits)
 static int dca_parse_audio_coding_header(DCAContext *s, int base_channel)
 {
     int i, j;
-    static const float adj_table[4] = { 1.0, 1.1250, 1.2500, 1.4375 };
+    static const int adj_table[4] = { 16, 18, 20, 23 };
     static const int bitlen[11] = { 0, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3 };
     static const int thr[11]    = { 0, 1, 3, 3, 3, 3, 7, 7, 7, 7, 7 };
 
@@ -785,14 +786,26 @@  static int decode_blockcodes(int code1, int code2, int levels, int32_t *values)
 static const uint8_t abits_sizes[7]  = { 7, 10, 12, 13, 15, 17, 19 };
 static const uint8_t abits_levels[7] = { 3,  5,  7,  9, 13, 17, 25 };
 
+static void dequantize(int *samples, int step_size, int scale) {
+    int64_t step = (int64_t)step_size * scale;
+    int shift, i;
+    int32_t step_scale;
+
+    if (step > (1 << 23))
+        shift = av_log2(step >> 23) + 1;
+    else
+        shift = 0;
+    step_scale = (int32_t)(step >> shift);
+
+    for (i = 0; i < SAMPLES_PER_SUBBAND; i++)
+        samples[i] = dca_clip23(dca_norm((int64_t)samples[i] * step_scale, 22 - shift));
+}
+
 static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
 {
     int k, l;
     int subsubframe = s->current_subsubframe;
-
-    const float *quant_step_table;
-
-    LOCAL_ALIGNED_16(int32_t, block, [SAMPLES_PER_SUBBAND * DCA_SUBBANDS]);
+    const int *quant_step_table;
 
     /*
      * Audio data
@@ -800,13 +813,13 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
 
     /* Select quantization step size table */
     if (s->bit_rate_index == 0x1f)
-        quant_step_table = ff_dca_lossless_quant_d;
+        quant_step_table = ff_dca_lossless_quant;
     else
-        quant_step_table = ff_dca_lossy_quant_d;
+        quant_step_table = ff_dca_lossy_quant;
 
     for (k = base_channel; k < s->audio_header.prim_channels; k++) {
-        float (*subband_samples)[8] = s->dca_chan[k].subband_samples[block_index];
-        float rscale[DCA_SUBBANDS];
+        int (*subband_samples)[8] = s->dca_chan[k].subband_samples[block_index];
+        int64_t rscale[DCA_SUBBANDS];
 
         if (get_bits_left(&s->gb) < 0)
             return AVERROR_INVALIDDATA;
@@ -817,7 +830,7 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
             /* Select the mid-tread linear quantizer */
             int abits = s->dca_chan[k].bitalloc[l];
 
-            float quant_step_size = quant_step_table[abits];
+            int quant_step_size = quant_step_table[abits];
 
             /*
              * Determine quantization index code book and its type
@@ -831,12 +844,13 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
              */
             if (!abits) {
                 rscale[l] = 0;
-                memset(block + SAMPLES_PER_SUBBAND * l, 0, SAMPLES_PER_SUBBAND * sizeof(block[0]));
+                memset(subband_samples[l], 0, SAMPLES_PER_SUBBAND *
+                       sizeof(subband_samples[l][0]));
             } else {
                 /* Deal with transients */
                 int sfi = s->dca_chan[k].transition_mode[l] &&
                     subsubframe >= s->dca_chan[k].transition_mode[l];
-                rscale[l] = quant_step_size * s->dca_chan[k].scale_factor[l][sfi] *
+                rscale[l] = s->dca_chan[k].scale_factor[l][sfi] *
                             s->audio_header.scalefactor_adj[k][sel];
 
                 if (abits >= 11 || !dca_smpl_bitalloc[abits].vlc[sel].table) {
@@ -850,7 +864,7 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
                         block_code1 = get_bits(&s->gb, size);
                         block_code2 = get_bits(&s->gb, size);
                         err         = decode_blockcodes(block_code1, block_code2,
-                                                        levels, block + SAMPLES_PER_SUBBAND * l);
+                                                        levels, subband_samples[l]);
                         if (err) {
                             av_log(s->avctx, AV_LOG_ERROR,
                                    "ERROR: block code look-up failed\n");
@@ -859,20 +873,18 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
                     } else {
                         /* no coding */
                         for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
-                            block[SAMPLES_PER_SUBBAND * l + m] = get_sbits(&s->gb, abits - 3);
+                            subband_samples[l][m] = get_sbits(&s->gb, abits - 3);
                     }
                 } else {
                     /* Huffman coded */
                     for (m = 0; m < SAMPLES_PER_SUBBAND; m++)
-                        block[SAMPLES_PER_SUBBAND * l + m] = get_bitalloc(&s->gb,
-                                                        &dca_smpl_bitalloc[abits], sel);
+                        subband_samples[l][m] = get_bitalloc(&s->gb,
+                                                             &dca_smpl_bitalloc[abits], sel);
                 }
             }
+            dequantize(subband_samples[l], quant_step_size, rscale[l]);
         }
 
-        s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, subband_samples[0],
-                                               block, rscale, SAMPLES_PER_SUBBAND * s->audio_header.vq_start_subband[k]);
-
         for (l = 0; l < s->audio_header.vq_start_subband[k]; l++) {
             int m;
             /*
@@ -882,25 +894,25 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
                 int n;
                 if (s->predictor_history)
                     subband_samples[l][0] += (ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
-                                                 s->dca_chan[k].subband_samples_hist[l][3] +
-                                                 ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
-                                                 s->dca_chan[k].subband_samples_hist[l][2] +
-                                                 ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
-                                                 s->dca_chan[k].subband_samples_hist[l][1] +
-                                                 ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
-                                                 s->dca_chan[k].subband_samples_hist[l][0]) *
-                                                (1.0f / 8192);
+                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][3] +
+                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][1] *
+                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][2] +
+                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][2] *
+                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][1] +
+                                              ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][3] *
+                                              (int64_t)s->dca_chan[k].subband_samples_hist[l][0]) +
+                                              (1 << 12) >> 13;
                 for (m = 1; m < SAMPLES_PER_SUBBAND; m++) {
-                    float sum = ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
-                                subband_samples[l][m - 1];
+                    int64_t sum = ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][0] *
+                                  (int64_t)subband_samples[l][m - 1];
                     for (n = 2; n <= 4; n++)
                         if (m >= n)
                             sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
-                                   subband_samples[l][m - n];
+                                   (int64_t)subband_samples[l][m - n];
                         else if (s->predictor_history)
                             sum += ff_dca_adpcm_vb[s->dca_chan[k].prediction_vq[l]][n - 1] *
-                                   s->dca_chan[k].subband_samples_hist[l][m - n + 4];
-                    subband_samples[l][m] += sum * 1.0f / 8192;
+                                   (int64_t)s->dca_chan[k].subband_samples_hist[l][m - n + 4];
+                    subband_samples[l][m] += (int)(sum + (1 << 12) >> 13);
                 }
             }
 
@@ -914,17 +926,22 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
          * Decode VQ encoded high frequencies
          */
         if (s->audio_header.subband_activity[k] > s->audio_header.vq_start_subband[k]) {
+            int i, j;
+
             if (!s->debug_flag & 0x01) {
                 av_log(s->avctx, AV_LOG_DEBUG,
                        "Stream with high frequencies VQ coding\n");
                 s->debug_flag |= 0x01;
             }
 
-            s->dcadsp.decode_hf(subband_samples, s->dca_chan[k].high_freq_vq,
-                                ff_dca_high_freq_vq, subsubframe * SAMPLES_PER_SUBBAND,
-                                s->dca_chan[k].scale_factor,
-                                s->audio_header.vq_start_subband[k],
-                                s->audio_header.subband_activity[k]);
+            // this should be SIMDified
+            for (j = s->audio_header.vq_start_subband[k]; j < s->audio_header.subband_activity[k]; j++) {
+                /* 1 vector -> 32 samples but we only need the 8 samples
+                 * for this subsubframe. */
+                const int8_t *ptr = &ff_dca_high_freq_vq[s->dca_chan[k].high_freq_vq[j]][subsubframe * SAMPLES_PER_SUBBAND];
+                for (i = 0; i < 8; i++)
+                    subband_samples[j][i] = ptr[i] * s->dca_chan[k].scale_factor[j][0] + 8 >> 4;
+            }
         }
     }
 
@@ -942,8 +959,14 @@  static int dca_subsubframe(DCAContext *s, int base_channel, int block_index)
 static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
 {
     int k;
+    float param[DCA_SUBBANDS];
+
+    for (k = 0; k < DCA_SUBBANDS; k++)
+        param[k] = 1;
 
     if (upsample) {
+        LOCAL_ALIGNED_16(float, samples, [64], [SAMPLES_PER_SUBBAND]);
+
         if (!s->qmf64_table) {
             s->qmf64_table = qmf64_precompute();
             if (!s->qmf64_table)
@@ -952,21 +975,33 @@  static int dca_filter_channels(DCAContext *s, int block_index, int upsample)
 
         /* 64 subbands QMF */
         for (k = 0; k < s->audio_header.prim_channels; k++) {
-            float (*subband_samples)[SAMPLES_PER_SUBBAND] = s->dca_chan[k].subband_samples[block_index];
+            int (*subband_samples)[SAMPLES_PER_SUBBAND] =
+                s->dca_chan[k].subband_samples[block_index];
+
+            s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, samples[0],
+                                                   subband_samples[0], param,
+                                                   64 * SAMPLES_PER_SUBBAND);
 
             if (s->channel_order_tab[k] >= 0)
-                qmf_64_subbands(s, k, subband_samples,
+                qmf_64_subbands(s, k, samples,
                                 s->samples_chanptr[s->channel_order_tab[k]],
                                 /* Upsampling needs a factor 2 here. */
                                 M_SQRT2 / 32768.0);
         }
     } else {
         /* 32 subbands QMF */
+        LOCAL_ALIGNED_16(float, samples, [32], [SAMPLES_PER_SUBBAND]);
+
         for (k = 0; k < s->audio_header.prim_channels; k++) {
-            float (*subband_samples)[SAMPLES_PER_SUBBAND] = s->dca_chan[k].subband_samples[block_index];
+            int (*subband_samples)[SAMPLES_PER_SUBBAND] =
+                s->dca_chan[k].subband_samples[block_index];
+
+            s->fmt_conv.int32_to_float_fmul_array8(&s->fmt_conv, samples[0],
+                                                   subband_samples[0], param,
+                                                   32 * SAMPLES_PER_SUBBAND);
 
             if (s->channel_order_tab[k] >= 0)
-                qmf_32_subbands(s, k, subband_samples,
+                qmf_32_subbands(s, k, samples,
                                 s->samples_chanptr[s->channel_order_tab[k]],
                                 M_SQRT1_2 / 32768.0);
         }