[02/27] vp3/x86: Use full transpose for all IDCTs

Message ID 1365583975-73297-2-git-send-email-martin@martin.st
State Committed
Commit 015821229f96bf7e677f2a711a58dbea3009f574
Headers show

Commit Message

Martin Storsjö April 10, 2013, 8:52 a.m.
From: "Ronald S. Bultje" <rsbultje@gmail.com>

This way, the special IDCT permutations are no longer needed. This
is similar to how H264 does it, and removes the dsputil dependency
imposed by the scantable code.
---
 libavcodec/arm/vp3dsp_init_arm.c |    1 -
 libavcodec/bfin/vp3_bfin.c       |    1 -
 libavcodec/ppc/vp3dsp_altivec.c  |    1 -
 libavcodec/vp3.c                 |   13 ++--
 libavcodec/vp3dsp.c              |   92 +++++++++++-----------------
 libavcodec/vp3dsp.h              |    2 -
 libavcodec/vp5.c                 |    2 +-
 libavcodec/vp56.c                |    9 ++-
 libavcodec/vp56.h                |    2 +-
 libavcodec/vp6.c                 |    4 +-
 libavcodec/x86/vp3dsp.asm        |  123 +++++++++++++++++++++++++-------------
 libavcodec/x86/vp3dsp_init.c     |    2 -
 12 files changed, 135 insertions(+), 117 deletions(-)

Comments

Kostya Shishkov April 10, 2013, 9:23 a.m. | #1
On Wed, Apr 10, 2013 at 11:52:30AM +0300, Martin Storsjö wrote:
> From: "Ronald S. Bultje" <rsbultje@gmail.com>
> 
> This way, the special IDCT permutations are no longer needed. This
> is similar to how H264 does it, and removes the dsputil dependency
> imposed by the scantable code.
> ---
>  libavcodec/arm/vp3dsp_init_arm.c |    1 -
>  libavcodec/bfin/vp3_bfin.c       |    1 -
>  libavcodec/ppc/vp3dsp_altivec.c  |    1 -
>  libavcodec/vp3.c                 |   13 ++--
>  libavcodec/vp3dsp.c              |   92 +++++++++++-----------------
>  libavcodec/vp3dsp.h              |    2 -
>  libavcodec/vp5.c                 |    2 +-
>  libavcodec/vp56.c                |    9 ++-
>  libavcodec/vp56.h                |    2 +-
>  libavcodec/vp6.c                 |    4 +-
>  libavcodec/x86/vp3dsp.asm        |  123 +++++++++++++++++++++++++-------------
>  libavcodec/x86/vp3dsp_init.c     |    2 -
>  12 files changed, 135 insertions(+), 117 deletions(-)

The transform changes are a bit hard to comprehend, but they are probably OK.
Diego Biurrun April 12, 2013, 7:03 a.m. | #2
On Wed, Apr 10, 2013 at 11:52:30AM +0300, Martin Storsjö wrote:
> From: "Ronald S. Bultje" <rsbultje@gmail.com>
> 
> This way, the special IDCT permutations are no longer needed. This
> is similar to how H264 does it, and removes the dsputil dependency
> imposed by the scantable code.

Just "vp3:" as the log message prefix seems to fit what the commit does better.

Diego

Patch

diff --git a/libavcodec/arm/vp3dsp_init_arm.c b/libavcodec/arm/vp3dsp_init_arm.c
index 5a7950c..dfd6078 100644
--- a/libavcodec/arm/vp3dsp_init_arm.c
+++ b/libavcodec/arm/vp3dsp_init_arm.c
@@ -41,6 +41,5 @@  av_cold void ff_vp3dsp_init_arm(VP3DSPContext *c, int flags)
         c->idct_dc_add   = ff_vp3_idct_dc_add_neon;
         c->v_loop_filter = ff_vp3_v_loop_filter_neon;
         c->h_loop_filter = ff_vp3_h_loop_filter_neon;
-        c->idct_perm     = FF_TRANSPOSE_IDCT_PERM;
     }
 }
diff --git a/libavcodec/bfin/vp3_bfin.c b/libavcodec/bfin/vp3_bfin.c
index 86c4b23..a8cdcb6 100644
--- a/libavcodec/bfin/vp3_bfin.c
+++ b/libavcodec/bfin/vp3_bfin.c
@@ -61,6 +61,5 @@  av_cold void ff_vp3dsp_init_bfin(VP3DSPContext *c, int flags)
     if (!(flags & CODEC_FLAG_BITEXACT)) {
         c->idct_add = bfin_vp3_idct_add;
         c->idct_put = bfin_vp3_idct_put;
-        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
     }
 }
diff --git a/libavcodec/ppc/vp3dsp_altivec.c b/libavcodec/ppc/vp3dsp_altivec.c
index e1e8988..c512047 100644
--- a/libavcodec/ppc/vp3dsp_altivec.c
+++ b/libavcodec/ppc/vp3dsp_altivec.c
@@ -184,7 +184,6 @@  av_cold void ff_vp3dsp_init_ppc(VP3DSPContext *c, int flags)
     if (av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC) {
         c->idct_put  = vp3_idct_put_altivec;
         c->idct_add  = vp3_idct_add_altivec;
-        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
     }
 #endif
 }
diff --git a/libavcodec/vp3.c b/libavcodec/vp3.c
index 18b9cb8..7c26609 100644
--- a/libavcodec/vp3.c
+++ b/libavcodec/vp3.c
@@ -136,6 +136,7 @@  typedef struct Vp3DecodeContext {
     ThreadFrame current_frame;
     int keyframe;
     uint8_t idct_permutation[64];
+    uint8_t idct_scantable[64];
     DSPContext dsp;
     VideoDSPContext vdsp;
     VP3DSPContext vp3dsp;
@@ -173,8 +174,6 @@  typedef struct Vp3DecodeContext {
 
     int8_t (*motion_val[2])[2];
 
-    ScanTable scantable;
-
     /* tables */
     uint16_t coded_dc_scale_factor[64];
     uint32_t coded_ac_scale_factor[64];
@@ -1351,7 +1350,7 @@  static inline int vp3_dequant(Vp3DecodeContext *s, Vp3Fragment *frag,
                               int plane, int inter, int16_t block[64])
 {
     int16_t *dequantizer = s->qmat[frag->qpi][inter][plane];
-    uint8_t *perm = s->scantable.permutated;
+    uint8_t *perm = s->idct_scantable;
     int i = 0;
 
     do {
@@ -1700,8 +1699,12 @@  static av_cold int vp3_decode_init(AVCodecContext *avctx)
     ff_videodsp_init(&s->vdsp, 8);
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
 
-    ff_init_scantable_permutation(s->idct_permutation, s->vp3dsp.idct_perm);
-    ff_init_scantable(s->idct_permutation, &s->scantable, ff_zigzag_direct);
+    for (i = 0; i < 64; i++) {
+#define T(x) (x >> 3) | ((x & 7) << 3)
+        s->idct_permutation[i] = T(i);
+        s->idct_scantable[i] = T(ff_zigzag_direct[i]);
+#undef T
+    }
 
     /* initialize to an impossible value which will force a recalculation
      * in the first frame decode */
diff --git a/libavcodec/vp3dsp.c b/libavcodec/vp3dsp.c
index d1a7db9..94de0e5 100644
--- a/libavcodec/vp3dsp.c
+++ b/libavcodec/vp3dsp.c
@@ -54,11 +54,12 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
     /* Inverse DCT on the rows now */
     for (i = 0; i < 8; i++) {
         /* Check for non-zero values */
-        if ( ip[0] | ip[1] | ip[2] | ip[3] | ip[4] | ip[5] | ip[6] | ip[7] ) {
-            A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]);
-            B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]);
-            C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]);
-            D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]);
+        if ( ip[0 * 8] | ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
+             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+            A = M(xC1S7, ip[1 * 8]) + M(xC7S1, ip[7 * 8]);
+            B = M(xC7S1, ip[1 * 8]) - M(xC1S7, ip[7 * 8]);
+            C = M(xC3S5, ip[3 * 8]) + M(xC5S3, ip[5 * 8]);
+            D = M(xC3S5, ip[5 * 8]) - M(xC5S3, ip[3 * 8]);
 
             Ad = M(xC4S4, (A - C));
             Bd = M(xC4S4, (B - D));
@@ -66,11 +67,11 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
             Cd = A + C;
             Dd = B + D;
 
-            E = M(xC4S4, (ip[0] + ip[4]));
-            F = M(xC4S4, (ip[0] - ip[4]));
+            E = M(xC4S4, (ip[0 * 8] + ip[4 * 8]));
+            F = M(xC4S4, (ip[0 * 8] - ip[4 * 8]));
 
-            G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]);
-            H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]);
+            G = M(xC2S6, ip[2 * 8]) + M(xC6S2, ip[6 * 8]);
+            H = M(xC6S2, ip[2 * 8]) - M(xC2S6, ip[6 * 8]);
 
             Ed = E - G;
             Gd = E + G;
@@ -82,33 +83,33 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
             Hd = Bd + H;
 
             /*  Final sequence of operations over-write original inputs. */
-            ip[0] = Gd + Cd ;
-            ip[7] = Gd - Cd ;
+            ip[0 * 8] = Gd + Cd ;
+            ip[7 * 8] = Gd - Cd ;
 
-            ip[1] = Add + Hd;
-            ip[2] = Add - Hd;
+            ip[1 * 8] = Add + Hd;
+            ip[2 * 8] = Add - Hd;
 
-            ip[3] = Ed + Dd ;
-            ip[4] = Ed - Dd ;
+            ip[3 * 8] = Ed + Dd ;
+            ip[4 * 8] = Ed - Dd ;
 
-            ip[5] = Fd + Bdd;
-            ip[6] = Fd - Bdd;
+            ip[5 * 8] = Fd + Bdd;
+            ip[6 * 8] = Fd - Bdd;
         }
 
-        ip += 8;            /* next row */
+        ip += 1;            /* next row */
     }
 
     ip = input;
 
     for ( i = 0; i < 8; i++) {
         /* Check for non-zero values (bitwise or faster than ||) */
-        if ( ip[1 * 8] | ip[2 * 8] | ip[3 * 8] |
-             ip[4 * 8] | ip[5 * 8] | ip[6 * 8] | ip[7 * 8] ) {
+        if ( ip[1] | ip[2] | ip[3] |
+             ip[4] | ip[5] | ip[6] | ip[7] ) {
 
-            A = M(xC1S7, ip[1*8]) + M(xC7S1, ip[7*8]);
-            B = M(xC7S1, ip[1*8]) - M(xC1S7, ip[7*8]);
-            C = M(xC3S5, ip[3*8]) + M(xC5S3, ip[5*8]);
-            D = M(xC3S5, ip[5*8]) - M(xC5S3, ip[3*8]);
+            A = M(xC1S7, ip[1]) + M(xC7S1, ip[7]);
+            B = M(xC7S1, ip[1]) - M(xC1S7, ip[7]);
+            C = M(xC3S5, ip[3]) + M(xC5S3, ip[5]);
+            D = M(xC3S5, ip[5]) - M(xC5S3, ip[3]);
 
             Ad = M(xC4S4, (A - C));
             Bd = M(xC4S4, (B - D));
@@ -116,16 +117,16 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
             Cd = A + C;
             Dd = B + D;
 
-            E = M(xC4S4, (ip[0*8] + ip[4*8])) + 8;
-            F = M(xC4S4, (ip[0*8] - ip[4*8])) + 8;
+            E = M(xC4S4, (ip[0] + ip[4])) + 8;
+            F = M(xC4S4, (ip[0] - ip[4])) + 8;
 
             if(type==1){  //HACK
                 E += 16*128;
                 F += 16*128;
             }
 
-            G = M(xC2S6, ip[2*8]) + M(xC6S2, ip[6*8]);
-            H = M(xC6S2, ip[2*8]) - M(xC2S6, ip[6*8]);
+            G = M(xC2S6, ip[2]) + M(xC6S2, ip[6]);
+            H = M(xC6S2, ip[2]) - M(xC2S6, ip[6]);
 
             Ed = E - G;
             Gd = E + G;
@@ -137,19 +138,7 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
             Hd = Bd + H;
 
             /* Final sequence of operations over-write original inputs. */
-            if(type==0){
-                ip[0*8] = (Gd + Cd )  >> 4;
-                ip[7*8] = (Gd - Cd )  >> 4;
-
-                ip[1*8] = (Add + Hd ) >> 4;
-                ip[2*8] = (Add - Hd ) >> 4;
-
-                ip[3*8] = (Ed + Dd )  >> 4;
-                ip[4*8] = (Ed - Dd )  >> 4;
-
-                ip[5*8] = (Fd + Bdd ) >> 4;
-                ip[6*8] = (Fd - Bdd ) >> 4;
-            }else if(type==1){
+            if (type == 1) {
                 dst[0*stride] = av_clip_uint8((Gd + Cd )  >> 4);
                 dst[7*stride] = av_clip_uint8((Gd - Cd )  >> 4);
 
@@ -176,16 +165,7 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
             }
 
         } else {
-            if(type==0){
-                ip[0*8] =
-                ip[1*8] =
-                ip[2*8] =
-                ip[3*8] =
-                ip[4*8] =
-                ip[5*8] =
-                ip[6*8] =
-                ip[7*8] = ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20);
-            }else if(type==1){
+            if (type == 1) {
                 dst[0*stride]=
                 dst[1*stride]=
                 dst[2*stride]=
@@ -193,10 +173,10 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
                 dst[4*stride]=
                 dst[5*stride]=
                 dst[6*stride]=
-                dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20));
+                dst[7*stride]= av_clip_uint8(128 + ((xC4S4 * ip[0] + (IdctAdjustBeforeShift<<16))>>20));
             }else{
-                if(ip[0*8]){
-                    int v= ((xC4S4 * ip[0*8] + (IdctAdjustBeforeShift<<16))>>20);
+                if(ip[0]){
+                    int v= ((xC4S4 * ip[0] + (IdctAdjustBeforeShift<<16))>>20);
                     dst[0*stride] = av_clip_uint8(dst[0*stride] + v);
                     dst[1*stride] = av_clip_uint8(dst[1*stride] + v);
                     dst[2*stride] = av_clip_uint8(dst[2*stride] + v);
@@ -209,7 +189,7 @@  static av_always_inline void idct(uint8_t *dst, int stride, int16_t *input, int
             }
         }
 
-        ip++;            /* next column */
+        ip += 8;            /* next column */
         dst++;
     }
 }
@@ -307,8 +287,6 @@  av_cold void ff_vp3dsp_init(VP3DSPContext *c, int flags)
     c->v_loop_filter = vp3_v_loop_filter_c;
     c->h_loop_filter = vp3_h_loop_filter_c;
 
-    c->idct_perm = FF_NO_IDCT_PERM;
-
     if (ARCH_ARM)
         ff_vp3dsp_init_arm(c, flags);
     if (ARCH_BFIN)
diff --git a/libavcodec/vp3dsp.h b/libavcodec/vp3dsp.h
index 755271d..39c4408 100644
--- a/libavcodec/vp3dsp.h
+++ b/libavcodec/vp3dsp.h
@@ -43,8 +43,6 @@  typedef struct VP3DSPContext {
     void (*idct_dc_add)(uint8_t *dest, int line_size, int16_t *block);
     void (*v_loop_filter)(uint8_t *src, int stride, int *bounding_values);
     void (*h_loop_filter)(uint8_t *src, int stride, int *bounding_values);
-
-    int idct_perm;
 } VP3DSPContext;
 
 void ff_vp3dsp_init(VP3DSPContext *c, int flags);
diff --git a/libavcodec/vp5.c b/libavcodec/vp5.c
index 742262b..1415428 100644
--- a/libavcodec/vp5.c
+++ b/libavcodec/vp5.c
@@ -173,7 +173,7 @@  static void vp5_parse_coeff(VP56Context *s)
 {
     VP56RangeCoder *c = &s->c;
     VP56Model *model = s->modelp;
-    uint8_t *permute = s->scantable.permutated;
+    uint8_t *permute = s->idct_scantable;
     uint8_t *model1, *model2;
     int coeff, sign, coeff_idx;
     int b, i, cg, idx, ctx, ctx_last;
diff --git a/libavcodec/vp56.c b/libavcodec/vp56.c
index b93f75d..1a83f00 100644
--- a/libavcodec/vp56.c
+++ b/libavcodec/vp56.c
@@ -263,7 +263,7 @@  static VP56mb vp56_decode_mv(VP56Context *s, int row, int col)
 
 static void vp56_add_predictors_dc(VP56Context *s, VP56Frame ref_frame)
 {
-    int idx = s->scantable.permutated[0];
+    int idx = s->idct_scantable[0];
     int b;
 
     for (b=0; b<6; b++) {
@@ -661,8 +661,11 @@  av_cold int ff_vp56_init(AVCodecContext *avctx, int flip, int has_alpha)
     ff_videodsp_init(&s->vdsp, 8);
     ff_vp3dsp_init(&s->vp3dsp, avctx->flags);
     ff_vp56dsp_init(&s->vp56dsp, avctx->codec->id);
-    ff_init_scantable_permutation(s->dsp.idct_permutation, s->vp3dsp.idct_perm);
-    ff_init_scantable(s->dsp.idct_permutation, &s->scantable,ff_zigzag_direct);
+    for (i = 0; i < 64; i++) {
+#define T(x) (x >> 3) | ((x & 7) << 3)
+        s->idct_scantable[i] = T(ff_zigzag_direct[i]);
+#undef T
+    }
 
     for (i = 0; i < FF_ARRAY_ELEMS(s->frames); i++) {
         s->frames[i] = av_frame_alloc();
diff --git a/libavcodec/vp56.h b/libavcodec/vp56.h
index 80ede6a..7fe6cf3 100644
--- a/libavcodec/vp56.h
+++ b/libavcodec/vp56.h
@@ -100,7 +100,7 @@  struct vp56_context {
     VideoDSPContext vdsp;
     VP3DSPContext vp3dsp;
     VP56DSPContext vp56dsp;
-    ScanTable scantable;
+    uint8_t idct_scantable[64];
     AVFrame *frames[4];
     uint8_t *edge_emu_buffer_alloc;
     uint8_t *edge_emu_buffer;
diff --git a/libavcodec/vp6.c b/libavcodec/vp6.c
index 2e25a55..54dc378 100644
--- a/libavcodec/vp6.c
+++ b/libavcodec/vp6.c
@@ -368,7 +368,7 @@  static unsigned vp6_get_nb_null(VP56Context *s)
 static void vp6_parse_coeff_huffman(VP56Context *s)
 {
     VP56Model *model = s->modelp;
-    uint8_t *permute = s->scantable.permutated;
+    uint8_t *permute = s->idct_scantable;
     VLC *vlc_coeff;
     int coeff, sign, coeff_idx;
     int b, cg, idx;
@@ -428,7 +428,7 @@  static void vp6_parse_coeff(VP56Context *s)
 {
     VP56RangeCoder *c = s->ccp;
     VP56Model *model = s->modelp;
-    uint8_t *permute = s->scantable.permutated;
+    uint8_t *permute = s->idct_scantable;
     uint8_t *model1, *model2, *model3;
     int coeff, sign, coeff_idx;
     int b, i, cg, idx, ctx;
diff --git a/libavcodec/x86/vp3dsp.asm b/libavcodec/x86/vp3dsp.asm
index d2c464c..9ea9cca 100644
--- a/libavcodec/x86/vp3dsp.asm
+++ b/libavcodec/x86/vp3dsp.asm
@@ -500,22 +500,22 @@  cglobal vp3_h_loop_filter, 3, 4
 
     ; at this point, function has completed dequantization + dezigzag +
     ; partial transposition; now do the idct itself
-%define I(x) [%1+16* x     ]
-%define J(x) [%1+16*(x-4)+8]
+%define I(x) [%1+16*x]
+%define J(x) [%1+16*x]
     RowIDCT
     Transpose
 
-%define I(x) [%1+16* x   +64]
-%define J(x) [%1+16*(x-4)+72]
+%define I(x) [%1+16*x+8]
+%define J(x) [%1+16*x+8]
     RowIDCT
     Transpose
 
-%define I(x) [%1+16*x]
-%define J(x) [%1+16*x]
+%define I(x) [%1+16* x]
+%define J(x) [%1+16*(x-4)+8]
     ColumnIDCT
 
-%define I(x) [%1+16*x+8]
-%define J(x) [%1+16*x+8]
+%define I(x) [%1+16* x   +64]
+%define J(x) [%1+16*(x-4)+72]
     ColumnIDCT
 %endif ; mmsize == 16/8
 %endmacro
@@ -533,10 +533,17 @@  cglobal vp3_idct_put, 3, 4, 9
     mova          m1, [r2+mmsize*2+%%i]
     mova          m2, [r2+mmsize*4+%%i]
     mova          m3, [r2+mmsize*6+%%i]
+%if mmsize == 8
+    packsswb      m0, [r2+mmsize*8+%%i]
+    packsswb      m1, [r2+mmsize*10+%%i]
+    packsswb      m2, [r2+mmsize*12+%%i]
+    packsswb      m3, [r2+mmsize*14+%%i]
+%else
     packsswb      m0, [r2+mmsize*1+%%i]
     packsswb      m1, [r2+mmsize*3+%%i]
     packsswb      m2, [r2+mmsize*5+%%i]
     packsswb      m3, [r2+mmsize*7+%%i]
+%endif
     paddb         m0, m4
     paddb         m1, m4
     paddb         m2, m4
@@ -560,7 +567,7 @@  cglobal vp3_idct_put, 3, 4, 9
     movq   [r0+r1*2], m3
     movhps [r0+r3  ], m3
 %endif
-%assign %%i %%i+64
+%assign %%i %%i+8
 %endrep
 
     pxor          m0, m0
@@ -574,47 +581,81 @@  cglobal vp3_idct_put, 3, 4, 9
 cglobal vp3_idct_add, 3, 4, 9
     VP3_IDCT      r2
 
-    mov           r3, 4
-    pxor          m4, m4
     movsxdifnidn  r1, r1d
-.loop:
+    lea           r3, [r1*3]
+    pxor          m4, m4
+%if mmsize == 16
+%assign %%i 0
+%rep 2
     movq          m0, [r0]
     movq          m1, [r0+r1]
-%if mmsize == 8
-    mova          m2, m0
-    mova          m3, m1
-%endif
+    movq          m2, [r0+r1*2]
+    movq          m3, [r0+r3]
     punpcklbw     m0, m4
     punpcklbw     m1, m4
-%if mmsize == 8
-    punpckhbw     m2, m4
-    punpckhbw     m3, m4
-%endif
-    paddsw        m0, [r2+ 0]
-    paddsw        m1, [r2+16]
-%if mmsize == 8
-    paddsw        m2, [r2+ 8]
-    paddsw        m3, [r2+24]
-    packuswb      m0, m2
-    packuswb      m1, m3
-%else ; mmsize == 16
+    punpcklbw     m2, m4
+    punpcklbw     m3, m4
+    paddsw        m0, [r2+ 0+%%i]
+    paddsw        m1, [r2+16+%%i]
+    paddsw        m2, [r2+32+%%i]
+    paddsw        m3, [r2+48+%%i]
     packuswb      m0, m1
+    packuswb      m2, m3
+    movq   [r0     ], m0
+    movhps [r0+r1  ], m0
+    movq   [r0+r1*2], m2
+    movhps [r0+r3  ], m2
+%if %%i == 0
+    lea           r0, [r0+r1*4]
 %endif
-    movq     [r0   ], m0
-%if mmsize == 8
-    movq     [r0+r1], m1
-%else ; mmsize == 16
-    movhps   [r0+r1], m0
+%assign %%i %%i+64
+%endrep
+%else
+%assign %%i 0
+%rep 2
+    movq          m0, [r0]
+    movq          m1, [r0+r1]
+    movq          m2, [r0+r1*2]
+    movq          m3, [r0+r3]
+    movq          m5, m0
+    movq          m6, m1
+    movq          m7, m2
+    punpcklbw     m0, m4
+    punpcklbw     m1, m4
+    punpcklbw     m2, m4
+    punpckhbw     m5, m4
+    punpckhbw     m6, m4
+    punpckhbw     m7, m4
+    paddsw        m0, [r2+ 0+%%i]
+    paddsw        m1, [r2+16+%%i]
+    paddsw        m2, [r2+32+%%i]
+    paddsw        m5, [r2+64+%%i]
+    paddsw        m6, [r2+80+%%i]
+    paddsw        m7, [r2+96+%%i]
+    packuswb      m0, m5
+    movq          m5, m3
+    punpcklbw     m3, m4
+    punpckhbw     m5, m4
+    packuswb      m1, m6
+    paddsw        m3, [r2+48+%%i]
+    paddsw        m5, [r2+112+%%i]
+    packuswb      m2, m7
+    packuswb      m3, m5
+    movq   [r0     ], m0
+    movq   [r0+r1  ], m1
+    movq   [r0+r1*2], m2
+    movq   [r0+r3  ], m3
+%if %%i == 0
+    lea           r0, [r0+r1*4]
 %endif
-    lea           r0, [r0+r1*2]
-%assign %%offset 0
-%rep 32/mmsize
-    mova [r2+%%offset], m4
-%assign %%offset %%offset+mmsize
+%assign %%i %%i+8
+%endrep
+%endif
+%assign %%i 0
+%rep 128/mmsize
+    mova    [r2+%%i], m4
+%assign %%i %%i+mmsize
 %endrep
-    add           r2, 32
-    dec           r3
-    jg .loop
     RET
 %endmacro
 
diff --git a/libavcodec/x86/vp3dsp_init.c b/libavcodec/x86/vp3dsp_init.c
index 2668bcf..cc52fbc 100644
--- a/libavcodec/x86/vp3dsp_init.c
+++ b/libavcodec/x86/vp3dsp_init.c
@@ -48,7 +48,6 @@  av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
     if (EXTERNAL_MMX(cpuflags)) {
         c->idct_put  = ff_vp3_idct_put_mmx;
         c->idct_add  = ff_vp3_idct_add_mmx;
-        c->idct_perm = FF_PARTTRANS_IDCT_PERM;
     }
 #endif
 
@@ -64,6 +63,5 @@  av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
     if (EXTERNAL_SSE2(cpuflags)) {
         c->idct_put  = ff_vp3_idct_put_sse2;
         c->idct_add  = ff_vp3_idct_add_sse2;
-        c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
     }
 }