x86: Skip compiling/assembling unused SIMD optimizations on x86_64

Message ID 1381080485-30627-1-git-send-email-diego@biurrun.de
State New

Commit Message

Diego Biurrun Oct. 6, 2013, 5:28 p.m.
x86_64 always has SSE2, so skip compiling in SIMD-optimized functions
that will always be overridden by variants of the same function
implemented with more advanced SIMD instruction sets.
---

Updated to now actually disable the yasm code as well, as noted by Justin.
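
For reference, the guard pattern is the same throughout: in the C init
files the function pointer assignments are wrapped in an ARCH_X86_32
preprocessor check, and in the yasm files the corresponding function
instantiations are wrapped in the matching %if ARCH_X86_32 ... %endif.
A minimal sketch of the C side, where ExampleContext, do_stuff and the
ff_do_stuff_* names are placeholders rather than code from the patch:

#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"

av_cold void ff_example_init_x86(ExampleContext *c)
{
    int cpu_flags = av_get_cpu_flags();

#if ARCH_X86_32
    /* Dead code on x86_64: the SSE2 assignment below always overrides
     * this one, so the MMX version need not be compiled there at all. */
    if (EXTERNAL_MMX(cpu_flags))
        c->do_stuff = ff_do_stuff_mmx;
#endif /* ARCH_X86_32 */
    if (EXTERNAL_SSE2(cpu_flags))
        c->do_stuff = ff_do_stuff_sse2;
}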

The total saving is about 109 kB:

before

$ ls -l libavcodec/libavcodec.a
-rw-rw-r-- 1 biurrun 10006 84458588 Oct  6 19:22 libavcodec/libavcodec.a

after

$ ls -l libavcodec/libavcodec.a
-rw-rw-r-- 1 biurrun 10006 84349546 Oct  6 18:57 libavcodec/libavcodec.a

  84458588
- 84349546
----------
    109042

 libavcodec/x86/ac3dsp_init.c         |    4 ++
 libavcodec/x86/dct32.asm             |    2 +
 libavcodec/x86/dct_init.c            |    2 +
 libavcodec/x86/dsputil.asm           |   10 ++++-
 libavcodec/x86/dsputil_init.c        |   12 +++++-
 libavcodec/x86/dsputilenc.asm        |    6 +++
 libavcodec/x86/dsputilenc_mmx.c      |   26 +++++++++---
 libavcodec/x86/fmtconvert.asm        |   14 +++++++
 libavcodec/x86/fmtconvert_init.c     |   10 +++++
 libavcodec/x86/h264_chromamc.asm     |    2 +
 libavcodec/x86/h264_idct.asm         |   76 ++++++++--------------------------
 libavcodec/x86/h264_intrapred.asm    |   22 ++++++++++
 libavcodec/x86/h264_intrapred_init.c |   20 ++++++++-
 libavcodec/x86/h264chroma_init.c     |    2 +
 libavcodec/x86/h264dsp_init.c        |   16 +++++--
 libavcodec/x86/hpeldsp.asm           |   22 ++++++++++
 libavcodec/x86/hpeldsp_init.c        |    6 +++
 libavcodec/x86/imdct36.asm           |    2 +
 libavcodec/x86/motion_est.c          |   35 +++++++++++-----
 libavcodec/x86/mpegaudiodsp.c        |    4 ++
 libavcodec/x86/mpegvideo.c           |    4 ++
 libavcodec/x86/mpegvideoenc.c        |    6 ++-
 libavcodec/x86/rv34dsp.asm           |    2 +
 libavcodec/x86/rv34dsp_init.c        |    2 +
 libavcodec/x86/rv40dsp.asm           |    2 +
 libavcodec/x86/rv40dsp_init.c        |    8 ++--
 libavcodec/x86/vc1dsp_init.c         |    2 +
 libavutil/x86/float_dsp_init.c       |    4 ++
 libswscale/x86/rgb2rgb.c             |   26 +++++++-----
 29 files changed, 250 insertions(+), 99 deletions(-)

Comments

Luca Barbato Oct. 6, 2013, 6:40 p.m. | #1
On 06/10/13 19:28, Diego Biurrun wrote:
> x86_64 always has SSE2, so skip compiling in SIMD-optimized functions
> that will always be overridden by variants of the same function
> implemented with more advanced SIMD instruction sets.
> ---

Again, why didn't you change the logic of EXTERNAL_AMD3DNOW etc. so you
could save a lot of lines there?

lu
Diego Elio Pettenò Oct. 6, 2013, 7:05 p.m. | #2
On Sun, Oct 6, 2013 at 6:28 PM, Diego Biurrun <diego@biurrun.de> wrote:

>
> before
>
> $ ls -l libavcodec/libavcodec.a
> -rw-rw-r-- 1 biurrun 10006 84458588 Oct  6 19:22 libavcodec/libavcodec.a
>
> after
>
> $ ls -l libavcodec/libavcodec.a
> -rw-rw-r-- 1 biurrun 10006 84349546 Oct  6 18:57 libavcodec/libavcodec.a
>


Diego, can you please use rbelf-diff for this kind of change? I'd be more
interested in the difference on either avconv or libavcodec.so, rather than
in the final size of the .a, which makes little sense.


Diego Elio Pettenò — Flameeyes
flameeyes@flameeyes.eu — http://blog.flameeyes.eu/
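
The general point: a static archive contains every object file whether or
not a given link would actually pull it in, so the size of the .a only
loosely tracks what gets shipped. A rough alternative comparison of the
mapped sections, assuming a shared build and standard binutils (rbelf-diff,
as suggested above, would give a finer-grained view):

$ size libavcodec/libavcodec.so > before.txt
$ # ... apply the patch and rebuild ...
$ size libavcodec/libavcodec.so > after.txt
$ diff before.txt after.txt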
Diego Biurrun Oct. 6, 2013, 8:15 p.m. | #3
On Sun, Oct 06, 2013 at 08:40:21PM +0200, Luca Barbato wrote:
> On 06/10/13 19:28, Diego Biurrun wrote:
> > x86_64 always has SSE2, so skip compiling in SIMD-optimized functions
> > that will always be overridden by variants of the same function
> > implemented with more advanced SIMD instruction sets.
> 
> Again, why didn't you change the logic of EXTERNAL_AMD3DNOW etc. so you
> could save a lot of lines there?

I don't see how that could work at all for the assembly parts (which I
overlooked in the first iteration of this patch).

I also don't see how such a macro could figure out which of the parts
it covers have already been set in another optimization block.

Diego
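
For illustration, the suggested EXTERNAL_AMD3DNOW change would presumably
look something like the following hypothetical sketch (not the actual
libavutil/x86/cpu.h definition), making the test a compile-time zero on
x86_64 so that the compiler discards the guarded blocks as dead code:

/* Hypothetical sketch, not actual cpu.h code: constant-false on x86_64,
 * so if (EXTERNAL_AMD3DNOW(cpu_flags)) blocks compile away without an
 * explicit #if ARCH_X86_32 around them. */
#define EXTERNAL_AMD3DNOW(flags) \
    (ARCH_X86_32 && HAVE_AMD3DNOW_EXTERNAL && ((flags) & AV_CPU_FLAG_3DNOW))

This would spare the #if lines in the C init files, but the unused yasm
functions would still be assembled and linked, and a blanket constant-false
test cannot express the blocks above in which only some of the assignments
are overridden at a later optimization level, which is the objection raised
here.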

Patch

diff --git a/libavcodec/x86/ac3dsp_init.c b/libavcodec/x86/ac3dsp_init.c
index ca10864..9d0a221 100644
--- a/libavcodec/x86/ac3dsp_init.c
+++ b/libavcodec/x86/ac3dsp_init.c
@@ -183,11 +183,14 @@  av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
     int cpu_flags = av_get_cpu_flags();
 
     if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
         c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
+#endif /* ARCH_X86_32 */
         c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
         c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
     }
+#if ARCH_X86_32
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
         if (!bit_exact) {
             c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
@@ -200,6 +203,7 @@  av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
     if (EXTERNAL_SSE(cpu_flags)) {
         c->float_to_fixed24 = ff_float_to_fixed24_sse;
     }
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
         c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
diff --git a/libavcodec/x86/dct32.asm b/libavcodec/x86/dct32.asm
index 9c147b9..42abb91 100644
--- a/libavcodec/x86/dct32.asm
+++ b/libavcodec/x86/dct32.asm
@@ -482,7 +482,9 @@  cglobal dct32_float, 2, 3, 16, out, in, tmp
 %endif
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DCT32_FUNC
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 DCT32_FUNC
diff --git a/libavcodec/x86/dct_init.c b/libavcodec/x86/dct_init.c
index 7bda5e8..16050cd 100644
--- a/libavcodec/x86/dct_init.c
+++ b/libavcodec/x86/dct_init.c
@@ -30,8 +30,10 @@  av_cold void ff_dct_init_x86(DCTContext *s)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_SSE(cpu_flags))
         s->dct32 = ff_dct32_float_sse;
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_SSE2(cpu_flags))
         s->dct32 = ff_dct32_float_sse2;
     if (EXTERNAL_AVX(cpu_flags))
diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 5d73ff8..4601954 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -107,8 +107,10 @@  cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 SCALARPRODUCT
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 SCALARPRODUCT
 
@@ -316,13 +318,17 @@  cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 APPLY_WINDOW_INT16 0
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 APPLY_WINDOW_INT16 0
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 APPLY_WINDOW_INT16 1
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 APPLY_WINDOW_INT16 1
 INIT_XMM ssse3
@@ -526,9 +532,11 @@  cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
     REP_RET
 %endmacro
 
-INIT_MMX mmx
 %define CLIPD CLIPD_MMX
+%if ARCH_X86_32
+INIT_MMX mmx
 VECTOR_CLIP_INT32 0, 1, 0, 0
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 VECTOR_CLIP_INT32 6, 1, 0, 0, _int
 %define CLIPD CLIPD_SSE2
diff --git a/libavcodec/x86/dsputil_init.c b/libavcodec/x86/dsputil_init.c
index a38cf24..d0a7cc4 100644
--- a/libavcodec/x86/dsputil_init.c
+++ b/libavcodec/x86/dsputil_init.c
@@ -540,8 +540,10 @@  static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
 
     if (!high_bit_depth) {
+#if ARCH_X86_32
         c->clear_block  = ff_clear_block_mmx;
         c->clear_blocks = ff_clear_blocks_mmx;
+#endif /* ARCH_X86_32 */
         c->draw_edges   = ff_draw_edges_mmx;
 
         switch (avctx->idct_algo) {
@@ -552,11 +554,13 @@  static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
             c->idct                  = ff_simple_idct_mmx;
             c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
             break;
+#if ARCH_X86_32
         case FF_IDCT_XVIDMMX:
             c->idct_put              = ff_idct_xvid_mmx_put;
             c->idct_add              = ff_idct_xvid_mmx_add;
             c->idct                  = ff_idct_xvid_mmx;
             break;
+#endif /* ARCH_X86_32 */
         }
     }
 
@@ -571,14 +575,16 @@  static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
         c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
     }
 
+#if ARCH_X86_32
     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
+#endif /* ARCH_X86_32 */
 #endif /* HAVE_MMX_EXTERNAL */
 }
 
 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
                                         int cpu_flags)
 {
-#if HAVE_MMXEXT_INLINE
+#if HAVE_MMXEXT_INLINE && ARCH_X86_32
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
@@ -586,7 +592,7 @@  static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
         c->idct_add = ff_idct_xvid_mmxext_add;
         c->idct     = ff_idct_xvid_mmxext;
     }
-#endif /* HAVE_MMXEXT_INLINE */
+#endif /* HAVE_MMXEXT_INLINE && ARCH_X86_32 */
 
 #if HAVE_MMXEXT_EXTERNAL
     SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
@@ -601,6 +607,7 @@  static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
         c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
 
+#if ARCH_X86_32
     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 
@@ -609,6 +616,7 @@  static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     } else {
         c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
     }
+#endif /* ARCH_X86_32 */
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
diff --git a/libavcodec/x86/dsputilenc.asm b/libavcodec/x86/dsputilenc.asm
index 7e4fd81..469c157 100644
--- a/libavcodec/x86/dsputilenc.asm
+++ b/libavcodec/x86/dsputilenc.asm
@@ -256,12 +256,15 @@  hadamard8_16_wrapper 0, 14
 %endif
 %endmacro
 
+%if ARCH_X86_32 || HAVE_ALIGNED_STACK == 0
 INIT_MMX mmx
 HADAMARD8_DIFF
 
 INIT_MMX mmxext
 HADAMARD8_DIFF
+%endif ; ARCH_X86_32 || HAVE_ALIGNED_STACK == 0
 
+%if HAVE_ALIGNED_STACK
 INIT_XMM sse2
 %if ARCH_X86_64
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
@@ -273,6 +276,7 @@  HADAMARD8_DIFF 10
 INIT_XMM ssse3
 %define ABS_SUM_8x8 ABS_SUM_8x8_64
 HADAMARD8_DIFF 9
+%endif ; HAVE_ALIGNED_STACK
 
 INIT_XMM sse2
 ; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
@@ -334,6 +338,7 @@  cglobal sse16, 5, 5, 8
     movd     eax, m7         ; return value
     RET
 
+%if ARCH_X86_32
 INIT_MMX mmx
 ; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
 cglobal get_pixels, 3,4
@@ -358,6 +363,7 @@  cglobal get_pixels, 3,4
     add          r3, 32
     js .loop
     REP_RET
+%endif ; ARCH_X86_32
 
 INIT_XMM sse2
 cglobal get_pixels, 3, 4
diff --git a/libavcodec/x86/dsputilenc_mmx.c b/libavcodec/x86/dsputilenc_mmx.c
index a1f80af..36575a3 100644
--- a/libavcodec/x86/dsputilenc_mmx.c
+++ b/libavcodec/x86/dsputilenc_mmx.c
@@ -421,6 +421,7 @@  static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int
     else  return score1 + FFABS(score2)*8;
 }
 
+#if ARCH_X86_32
 static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
     int tmp;
 
@@ -481,6 +482,7 @@  static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_si
     return tmp & 0xFFFF;
 }
 #undef SUM
+#endif /* ARCH_X86_32 */
 
 static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
                                int line_size, int h)
@@ -524,6 +526,7 @@  static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
 }
 #undef SUM
 
+#if ARCH_X86_32
 static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
     int tmp;
 
@@ -601,6 +604,7 @@  static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, in
     return tmp & 0x7FFF;
 }
 #undef SUM
+#endif /* ARCH_X86_32 */
 
 static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
                          int line_size, int h)
@@ -809,6 +813,7 @@  static int sum_abs_dctelem_##cpu(int16_t *block){\
     return sum&0xFFFF;\
 }
 
+#if ARCH_X86_32
 #define DCT_SAD       DCT_SAD_MMX
 #define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
 #define MMABS(a,z)    MMABS_MMX(a,z)
@@ -821,9 +826,11 @@  DCT_SAD_FUNC(mmx)
 DCT_SAD_FUNC(mmxext)
 #undef HSUM
 #undef DCT_SAD
+#endif /* ARCH_X86_32 */
 
 #define DCT_SAD       DCT_SAD_SSE2
 #define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
+#define MMABS(a,z)    MMABS_MMXEXT(a,z)
 DCT_SAD_FUNC(sse2)
 #undef MMABS
 
@@ -952,8 +959,10 @@  av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
     int bit_depth = avctx->bits_per_raw_sample;
 
     if (EXTERNAL_MMX(cpu_flags)) {
+#if ARCH_X86_32
         if (bit_depth <= 8)
             c->get_pixels = ff_get_pixels_mmx;
+#endif /* ARCH_X86_32 */
         c->diff_pixels = ff_diff_pixels_mmx;
         c->pix_sum = ff_pix_sum16_mmx;
 
@@ -966,22 +975,25 @@  av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
 
 #if HAVE_INLINE_ASM
     if (INLINE_MMX(cpu_flags)) {
+#if ARCH_X86_32
         if (avctx->bits_per_raw_sample <= 8 &&
             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
             c->fdct = ff_fdct_mmx;
 
-        c->diff_bytes= diff_bytes_mmx;
         c->sum_abs_dctelem= sum_abs_dctelem_mmx;
 
+        c->vsad[4] = vsad_intra16_mmx;
+        if (!(avctx->flags & CODEC_FLAG_BITEXACT))
+            c->vsad[0] = vsad16_mmx;
+#endif /* ARCH_X86_32 */
+
+        c->diff_bytes = diff_bytes_mmx;
+
         c->sse[0] = sse16_mmx;
         c->sse[1] = sse8_mmx;
-        c->vsad[4]= vsad_intra16_mmx;
 
         c->nsse[0] = nsse16_mmx;
         c->nsse[1] = nsse8_mmx;
-        if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
-            c->vsad[0] = vsad16_mmx;
-        }
 
         if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
             c->try_8x8basis= try_8x8basis_mmx;
@@ -999,11 +1011,13 @@  av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
     }
 
     if (INLINE_MMXEXT(cpu_flags)) {
+#if ARCH_X86_32
         if (avctx->bits_per_raw_sample <= 8 &&
             (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
             c->fdct = ff_fdct_mmxext;
 
         c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
+#endif /* ARCH_X86_32 */
         c->vsad[4]         = vsad_intra16_mmxext;
 
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
@@ -1032,6 +1046,7 @@  av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
 #endif
 #endif /* HAVE_INLINE_ASM */
 
+#if ARCH_X86_32 || !HAVE_ALIGNED_STACK
     if (EXTERNAL_MMX(cpu_flags)) {
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
@@ -1041,6 +1056,7 @@  av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
     }
+#endif /* ARCH_X86_32 || !HAVE_ALIGNED_STACK */
 
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->sse[0] = ff_sse16_sse2;
diff --git a/libavcodec/x86/fmtconvert.asm b/libavcodec/x86/fmtconvert.asm
index e7803df..0123390 100644
--- a/libavcodec/x86/fmtconvert.asm
+++ b/libavcodec/x86/fmtconvert.asm
@@ -71,8 +71,10 @@  cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 INT32_TO_FLOAT_FMUL_SCALAR 5
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 INT32_TO_FLOAT_FMUL_SCALAR 3
 
@@ -112,10 +114,12 @@  cglobal float_to_int16, 3, 3, %1, dst, src, len
 
 INIT_XMM sse2
 FLOAT_TO_INT16 2
+%if ARCH_X86_32
 INIT_MMX sse
 FLOAT_TO_INT16 0
 INIT_MMX 3dnow
 FLOAT_TO_INT16 0
+%endif ; ARCH_X86_32
 
 ;------------------------------------------------------------------------------
 ; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
@@ -190,10 +194,12 @@  cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
 
 INIT_XMM sse2
 FLOAT_TO_INT16_STEP 2
+%if ARCH_X86_32
 INIT_MMX sse
 FLOAT_TO_INT16_STEP 0
 INIT_MMX 3dnow
 FLOAT_TO_INT16_STEP 0
+%endif ; ARCH_X86_32
 
 ;-------------------------------------------------------------------------------
 ; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
@@ -236,10 +242,12 @@  cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX 3dnow
 FLOAT_TO_INT16_INTERLEAVE2
 INIT_MMX sse
 FLOAT_TO_INT16_INTERLEAVE2
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 FLOAT_TO_INT16_INTERLEAVE2
 
@@ -293,10 +301,12 @@  cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, s
 
 INIT_MMX sse
 FLOAT_TO_INT16_INTERLEAVE6
+%if ARCH_X86_32
 INIT_MMX 3dnow
 FLOAT_TO_INT16_INTERLEAVE6
 INIT_MMX 3dnowext
 FLOAT_TO_INT16_INTERLEAVE6
+%endif ; ARCH_X86_32
 
 ;-----------------------------------------------------------------------------
 ; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
@@ -376,8 +386,10 @@  cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 FLOAT_INTERLEAVE6 0
+%endif ; ARCH_X86_32
 INIT_XMM sse
 FLOAT_INTERLEAVE6 7
 
@@ -419,10 +431,12 @@  cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 %define PUNPCKLDQ punpckldq
 %define PUNPCKHDQ punpckhdq
 FLOAT_INTERLEAVE2 0
+%endif ; ARCH_X86_32
 INIT_XMM sse
 %define PUNPCKLDQ unpcklps
 %define PUNPCKHDQ unpckhps
diff --git a/libavcodec/x86/fmtconvert_init.c b/libavcodec/x86/fmtconvert_init.c
index 3d75df9..8230c29 100644
--- a/libavcodec/x86/fmtconvert_init.c
+++ b/libavcodec/x86/fmtconvert_init.c
@@ -71,10 +71,13 @@  static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, lon
         float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
 }
 
+#if ARCH_X86_32
 FLOAT_TO_INT16_INTERLEAVE(3dnow)
 FLOAT_TO_INT16_INTERLEAVE(sse)
+#endif /* ARCH_X86_32 */
 FLOAT_TO_INT16_INTERLEAVE(sse2)
 
+#if ARCH_X86_32
 static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
                                                long len, int channels)
 {
@@ -83,6 +86,7 @@  static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
     else
         float_to_int16_interleave_3dnow(dst, src, len, channels);
 }
+#endif /* ARCH_X86_32 */
 
 void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
 void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
@@ -90,6 +94,7 @@  void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
 void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
 void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
 
+#if ARCH_X86_32
 static void float_interleave_mmx(float *dst, const float **src,
                                  unsigned int len, int channels)
 {
@@ -100,6 +105,7 @@  static void float_interleave_mmx(float *dst, const float **src,
     else
         ff_float_interleave_c(dst, src, len, channels);
 }
+#endif /* ARCH_X86_32 */
 
 static void float_interleave_sse(float *dst, const float **src,
                                  unsigned int len, int channels)
@@ -118,6 +124,7 @@  av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
 #if HAVE_YASM
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags)) {
         c->float_interleave = float_interleave_mmx;
     }
@@ -132,10 +139,13 @@  av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx
             c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
         }
     }
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_SSE(cpu_flags)) {
+#if ARCH_X86_32
         c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
         c->float_to_int16             = ff_float_to_int16_sse;
         c->float_to_int16_interleave  = float_to_int16_interleave_sse;
+#endif /* ARCH_X86_32 */
         c->float_interleave           = float_interleave_sse;
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
diff --git a/libavcodec/x86/h264_chromamc.asm b/libavcodec/x86/h264_chromamc.asm
index b7b18e0..f7fa1c5 100644
--- a/libavcodec/x86/h264_chromamc.asm
+++ b/libavcodec/x86/h264_chromamc.asm
@@ -455,12 +455,14 @@  chroma_mc4_mmx_func avg, h264
 chroma_mc4_mmx_func avg, rv40
 chroma_mc2_mmx_func avg, h264
 
+%if ARCH_X86_32
 INIT_MMX 3dnow
 chroma_mc8_mmx_func avg, h264, _rnd
 chroma_mc8_mmx_func avg, vc1,  _nornd
 chroma_mc8_mmx_func avg, rv40
 chroma_mc4_mmx_func avg, h264
 chroma_mc4_mmx_func avg, rv40
+%endif ; ARCH_X86_32
 
 %macro chroma_mc8_ssse3_func 2-3
 cglobal %1_%2_chroma_mc8%3, 6, 7, 8
diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 2771291..34b840f 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -347,6 +347,7 @@  cglobal h264_idct8_dc_add_8, 2, 3, 0
     RET
 %endif
 
+%if ARCH_X86_32
 INIT_MMX mmx
 ; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
 ;                          int16_t *block, int stride,
@@ -425,16 +426,10 @@  cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
     jz .no_dc
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
-    mov       dst2d, dword [r1+r5*4]
-    lea       dst2q, [r0+dst2q]
-    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
+    mov         r1d, dword [r1+r5*4]
+    lea          r1, [r0+r1]
+    DC_ADD_MMXEXT_OP movh, r1, r3, r6
     mov          r1, r1m
-%endif
     inc          r5
     add          r2, 32
     cmp          r5, 16
@@ -504,16 +499,10 @@  cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
     jz .skipblock
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
-    mov       dst2d, dword [r1+r5*4]
-    add       dst2q, r0
-    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
-%if ARCH_X86_64 == 0
+    mov         r1d, dword [r1+r5*4]
+    add          r1, r0
+    DC_ADD_MMXEXT_OP movh, r1, r3, r6
     mov          r1, r1m
-%endif
 .skipblock:
     inc          r5
     add          r2, 32
@@ -544,18 +533,12 @@  cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
     jz .no_dc
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64 == 0
-%define dst2q r1
-%define dst2d r1d
-%endif
-    mov       dst2d, dword [r1+r5*4]
-    lea       dst2q, [r0+dst2q]
-    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-    lea       dst2q, [dst2q+r3*4]
-    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
-%if ARCH_X86_64 == 0
+    mov         r1d, dword [r1+r5*4]
+    lea          r1, [r0+r1]
+    DC_ADD_MMXEXT_OP mova, r1, r3, r6
+    lea          r1, [r1+r3*4]
+    DC_ADD_MMXEXT_OP mova, r1, r3, r6
     mov          r1, r1m
-%endif
     add          r5, 4
     add          r2, 128
     cmp          r5, 16
@@ -581,6 +564,7 @@  cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride
 
     ADD         rsp, pad
     RET
+%endif ; ARCH_X86_32
 
 INIT_XMM sse2
 ; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
@@ -636,6 +620,7 @@  INIT_XMM cpuname
     jl .nextblock
     REP_RET
 
+%if ARCH_X86_32
 INIT_MMX mmx
 h264_idct_add8_mmx_plane:
 .nextblock:
@@ -644,14 +629,9 @@  h264_idct_add8_mmx_plane:
     or          r6w, word [r2]
     test         r6, r6
     jz .skipblock
-%if ARCH_X86_64
-    mov         r0d, dword [r1+r5*4]
-    add          r0, [dst2q]
-%else
     mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
     mov          r0, [r0]
     add          r0, dword [r1+r5*4]
-%endif
     IDCT4_ADD    r0, r2, r3
 .skipblock:
     inc          r5
@@ -668,17 +648,10 @@  cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
 %ifdef PIC
     lea     picregq, [scan8_mem]
 %endif
-%if ARCH_X86_64
-    mov       dst2q, r0
-%endif
     call         h264_idct_add8_mmx_plane
     mov          r5, 32
     add          r2, 384
-%if ARCH_X86_64
-    add       dst2q, gprsize
-%else
     add        r0mp, gprsize
-%endif
     call         h264_idct_add8_mmx_plane
     RET
 
@@ -688,14 +661,9 @@  h264_idct_add8_mmxext_plane:
     movzx        r6, byte [r4+r6]
     test         r6, r6
     jz .try_dc
-%if ARCH_X86_64
-    mov         r0d, dword [r1+r5*4]
-    add          r0, [dst2q]
-%else
     mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
     mov          r0, [r0]
     add          r0, dword [r1+r5*4]
-%endif
     IDCT4_ADD    r0, r2, r3
     inc          r5
     add          r2, 32
@@ -708,14 +676,9 @@  h264_idct_add8_mmxext_plane:
     jz .skipblock
     mov   word [r2], 0
     DC_ADD_MMXEXT_INIT r6, r3
-%if ARCH_X86_64
-    mov         r0d, dword [r1+r5*4]
-    add          r0, [dst2q]
-%else
     mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
     mov          r0, [r0]
     add          r0, dword [r1+r5*4]
-%endif
     DC_ADD_MMXEXT_OP movh, r0, r3, r6
 .skipblock:
     inc          r5
@@ -731,23 +694,18 @@  INIT_MMX mmxext
 cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
     mov          r5, 16
     add          r2, 512
-%if ARCH_X86_64
-    mov       dst2q, r0
-%endif
 %ifdef PIC
     lea     picregq, [scan8_mem]
 %endif
     call h264_idct_add8_mmxext_plane
     mov          r5, 32
     add          r2, 384
-%if ARCH_X86_64
-    add       dst2q, gprsize
-%else
     add        r0mp, gprsize
-%endif
     call h264_idct_add8_mmxext_plane
     RET
+%endif ; ARCH_X86_32
 
+INIT_MMX mmxext
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
     movd         m0, [r2   ]          ;  0 0 X D
@@ -1076,7 +1034,9 @@  cglobal h264_luma_dc_dequant_idct, 3, 4, %1
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 IDCT_DC_DEQUANT 0
+%endif ; ARCH_X86_32
 INIT_MMX sse2
 IDCT_DC_DEQUANT 7
diff --git a/libavcodec/x86/h264_intrapred.asm b/libavcodec/x86/h264_intrapred.asm
index b9db3f4..feb481a 100644
--- a/libavcodec/x86/h264_intrapred.asm
+++ b/libavcodec/x86/h264_intrapred.asm
@@ -118,8 +118,10 @@  cglobal pred16x16_horizontal_8, 2,3
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 PRED16x16_H
+%endif ; ARCH_X86_32
 INIT_MMX mmxext
 PRED16x16_H
 INIT_XMM ssse3
@@ -180,8 +182,10 @@  cglobal pred16x16_dc_8, 2,7
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 PRED16x16_DC
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 PRED16x16_DC
 INIT_XMM ssse3
@@ -227,10 +231,12 @@  cglobal pred16x16_tm_vp8_8, 2,5
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 PRED16x16_TM
 INIT_MMX mmxext
 PRED16x16_TM
+%endif ; ARCH_X86_32
 
 INIT_XMM sse2
 cglobal pred16x16_tm_vp8_8, 2,6,6
@@ -532,6 +538,7 @@  cglobal pred16x16_plane_%1_8, 2,9,7
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 H264_PRED16x16_PLANE h264
 H264_PRED16x16_PLANE rv40
@@ -540,6 +547,7 @@  INIT_MMX mmxext
 H264_PRED16x16_PLANE h264
 H264_PRED16x16_PLANE rv40
 H264_PRED16x16_PLANE svq3
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 H264_PRED16x16_PLANE h264
 H264_PRED16x16_PLANE rv40
@@ -714,10 +722,12 @@  ALIGN 16
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 H264_PRED8x8_PLANE
 INIT_MMX mmxext
 H264_PRED8x8_PLANE
+%endif ; ARCH_X86_32
 INIT_XMM sse2
 H264_PRED8x8_PLANE
 INIT_XMM ssse3
@@ -761,8 +771,10 @@  cglobal pred8x8_horizontal_8, 2,3
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 PRED8x8_H
+%endif ; ARCH_X86_32
 INIT_MMX mmxext
 PRED8x8_H
 INIT_MMX ssse3
@@ -939,10 +951,12 @@  cglobal pred8x8_tm_vp8_8, 2,6
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 PRED8x8_TM
 INIT_MMX mmxext
 PRED8x8_TM
+%endif ; ARCH_X86_32
 
 INIT_XMM sse2
 cglobal pred8x8_tm_vp8_8, 2,6,4
@@ -1495,6 +1509,7 @@  PRED8x8L_DOWN_LEFT
 ;void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
 ;-----------------------------------------------------------------------------
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 cglobal pred8x8l_down_right_8, 4,5
     sub          r0, r3
@@ -1626,6 +1641,7 @@  cglobal pred8x8l_down_right_8, 4,5
     por        mm0, mm1
     movq [r0+r3*1], mm0
     RET
+%endif ; ARCH_X86_32
 
 %macro PRED8x8L_DOWN_RIGHT 0
 cglobal pred8x8l_down_right_8, 4,5
@@ -1746,6 +1762,7 @@  PRED8x8L_DOWN_RIGHT
 ; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride)
 ;-----------------------------------------------------------------------------
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 cglobal pred8x8l_vertical_right_8, 4,5
     sub          r0, r3
@@ -1852,6 +1869,7 @@  cglobal pred8x8l_vertical_right_8, 4,5
     PALIGNR    mm5, mm0, 7, mm1
     movq [r4+r3*2], mm5
     RET
+%endif ; ARCH_X86_32
 
 %macro PRED8x8L_VERTICAL_RIGHT 0
 cglobal pred8x8l_vertical_right_8, 4,5,7
@@ -2149,6 +2167,7 @@  PRED8x8L_HORIZONTAL_UP
 ;void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride)
 ;-----------------------------------------------------------------------------
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 cglobal pred8x8l_horizontal_down_8, 4,5
     sub          r0, r3
@@ -2263,6 +2282,7 @@  cglobal pred8x8l_horizontal_down_8, 4,5
     PALIGNR    mm3, mm4, 6, mm4
     movq [r0+r3*1], mm3
     RET
+%endif ; ARCH_X86_32
 
 %macro PRED8x8L_HORIZONTAL_DOWN 0
 cglobal pred8x8l_horizontal_down_8, 4,5
@@ -2463,8 +2483,10 @@  cglobal pred4x4_tm_vp8_8, 3,6
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmx
 PRED4x4_TM
+%endif ; ARCH_X86_32
 INIT_MMX mmxext
 PRED4x4_TM
 
diff --git a/libavcodec/x86/h264_intrapred_init.c b/libavcodec/x86/h264_intrapred_init.c
index 6dd98aa..35dfb59 100644
--- a/libavcodec/x86/h264_intrapred_init.c
+++ b/libavcodec/x86/h264_intrapred_init.c
@@ -190,11 +190,16 @@  av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
     if (bit_depth == 8) {
         if (EXTERNAL_MMX(cpu_flags)) {
             h->pred16x16[VERT_PRED8x8         ] = ff_pred16x16_vertical_8_mmx;
+#if ARCH_X86_32
             h->pred16x16[HOR_PRED8x8          ] = ff_pred16x16_horizontal_8_mmx;
+#endif /* ARCH_X86_32 */
             if (chroma_format_idc == 1) {
                 h->pred8x8  [VERT_PRED8x8     ] = ff_pred8x8_vertical_8_mmx;
+#if ARCH_X86_32
                 h->pred8x8  [HOR_PRED8x8      ] = ff_pred8x8_horizontal_8_mmx;
+#endif /* ARCH_X86_32 */
             }
+#if ARCH_X86_32
             if (codec_id == AV_CODEC_ID_VP8) {
                 h->pred16x16[PLANE_PRED8x8    ] = ff_pred16x16_tm_vp8_8_mmx;
                 h->pred8x8  [PLANE_PRED8x8    ] = ff_pred8x8_tm_vp8_8_mmx;
@@ -211,22 +216,27 @@  av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                     h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
                 }
             }
+#endif /* ARCH_X86_32 */
         }
 
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             h->pred16x16[HOR_PRED8x8            ] = ff_pred16x16_horizontal_8_mmxext;
+#if ARCH_X86_32
             h->pred16x16[DC_PRED8x8             ] = ff_pred16x16_dc_8_mmxext;
+#endif /* ARCH_X86_32 */
             if (chroma_format_idc == 1)
                 h->pred8x8[HOR_PRED8x8          ] = ff_pred8x8_horizontal_8_mmxext;
             h->pred8x8l [TOP_DC_PRED            ] = ff_pred8x8l_top_dc_8_mmxext;
             h->pred8x8l [DC_PRED                ] = ff_pred8x8l_dc_8_mmxext;
             h->pred8x8l [HOR_PRED               ] = ff_pred8x8l_horizontal_8_mmxext;
             h->pred8x8l [VERT_PRED              ] = ff_pred8x8l_vertical_8_mmxext;
+#if ARCH_X86_32
             h->pred8x8l [DIAG_DOWN_RIGHT_PRED   ] = ff_pred8x8l_down_right_8_mmxext;
             h->pred8x8l [VERT_RIGHT_PRED        ] = ff_pred8x8l_vertical_right_8_mmxext;
-            h->pred8x8l [HOR_UP_PRED            ] = ff_pred8x8l_horizontal_up_8_mmxext;
             h->pred8x8l [DIAG_DOWN_LEFT_PRED    ] = ff_pred8x8l_down_left_8_mmxext;
             h->pred8x8l [HOR_DOWN_PRED          ] = ff_pred8x8l_horizontal_down_8_mmxext;
+#endif /* ARCH_X86_32 */
+            h->pred8x8l [HOR_UP_PRED            ] = ff_pred8x8l_horizontal_up_8_mmxext;
             h->pred4x4  [DIAG_DOWN_RIGHT_PRED   ] = ff_pred4x4_down_right_8_mmxext;
             h->pred4x4  [VERT_RIGHT_PRED        ] = ff_pred4x4_vertical_right_8_mmxext;
             h->pred4x4  [HOR_DOWN_PRED          ] = ff_pred4x4_horizontal_down_8_mmxext;
@@ -247,12 +257,15 @@  av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                 }
             }
             if (codec_id == AV_CODEC_ID_VP8) {
+#if ARCH_X86_32
                 h->pred16x16[PLANE_PRED8x8      ] = ff_pred16x16_tm_vp8_8_mmxext;
-                h->pred8x8  [DC_PRED8x8         ] = ff_pred8x8_dc_rv40_8_mmxext;
                 h->pred8x8  [PLANE_PRED8x8      ] = ff_pred8x8_tm_vp8_8_mmxext;
+#endif /* ARCH_X86_32 */
+                h->pred8x8  [DC_PRED8x8         ] = ff_pred8x8_dc_rv40_8_mmxext;
                 h->pred4x4  [TM_VP8_PRED        ] = ff_pred4x4_tm_vp8_8_mmxext;
                 h->pred4x4  [VERT_PRED          ] = ff_pred4x4_vertical_vp8_8_mmxext;
             } else {
+#if ARCH_X86_32
                 if (chroma_format_idc == 1)
                     h->pred8x8  [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
                 if (codec_id == AV_CODEC_ID_SVQ3) {
@@ -262,6 +275,7 @@  av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
                 } else {
                     h->pred16x16[PLANE_PRED8x8  ] = ff_pred16x16_plane_h264_8_mmxext;
                 }
+#endif /* ARCH_X86_32 */
             }
         }
 
@@ -327,6 +341,7 @@  av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
             h->pred4x4[DC_PRED             ] = ff_pred4x4_dc_10_mmxext;
             h->pred4x4[HOR_UP_PRED         ] = ff_pred4x4_horizontal_up_10_mmxext;
 
+#if ARCH_X86_32
             if (chroma_format_idc == 1)
                 h->pred8x8[DC_PRED8x8      ] = ff_pred8x8_dc_10_mmxext;
 
@@ -338,6 +353,7 @@  av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
             h->pred16x16[LEFT_DC_PRED8x8   ] = ff_pred16x16_left_dc_10_mmxext;
             h->pred16x16[VERT_PRED8x8      ] = ff_pred16x16_vertical_10_mmxext;
             h->pred16x16[HOR_PRED8x8       ] = ff_pred16x16_horizontal_10_mmxext;
+#endif /* ARCH_X86_32 */
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
             h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
diff --git a/libavcodec/x86/h264chroma_init.c b/libavcodec/x86/h264chroma_init.c
index eec1653..504be95 100644
--- a/libavcodec/x86/h264chroma_init.c
+++ b/libavcodec/x86/h264chroma_init.c
@@ -73,6 +73,7 @@  av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
     int high_bit_depth = bit_depth > 8;
     int cpu_flags      = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
@@ -82,6 +83,7 @@  av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
     }
+#endif /* ARCH_X86_32 */
 
     if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
diff --git a/libavcodec/x86/h264dsp_init.c b/libavcodec/x86/h264dsp_init.c
index 4164b83..824872f 100644
--- a/libavcodec/x86/h264dsp_init.c
+++ b/libavcodec/x86/h264dsp_init.c
@@ -223,6 +223,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct8_dc_add  =
             c->h264_idct8_add     = ff_h264_idct8_add_8_mmx;
 
+#if ARCH_X86_32
             c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
             c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
             if (chroma_format_idc == 1)
@@ -230,15 +231,18 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
             c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
             if (cpu_flags & AV_CPU_FLAG_CMOV)
                 c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
+#endif /* ARCH_X86_32 */
         }
         if (EXTERNAL_MMXEXT(cpu_flags)) {
             c->h264_idct_dc_add  = ff_h264_idct_dc_add_8_mmxext;
             c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
+#if ARCH_X86_32
             c->h264_idct_add16   = ff_h264_idct_add16_8_mmxext;
             c->h264_idct8_add4   = ff_h264_idct8_add4_8_mmxext;
             if (chroma_format_idc == 1)
                 c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
             c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
+#endif /* ARCH_X86_32 */
 
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_8_mmxext;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
@@ -246,18 +250,22 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
                 c->h264_h_loop_filter_chroma       = ff_deblock_h_chroma_8_mmxext;
                 c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
             }
-#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
+#if ARCH_X86_32
+#if HAVE_MMXEXT_EXTERNAL
             c->h264_v_loop_filter_luma       = deblock_v_luma_8_mmxext;
-            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
             c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
+#endif /* HAVE_MMXEXT_EXTERNAL */
+            c->h264_h_loop_filter_luma       = ff_deblock_h_luma_8_mmxext;
             c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
-#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
             c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
             c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
+#endif /* ARCH_X86_32 */
             c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
 
+#if ARCH_X86_32
             c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
             c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
+#endif /* ARCH_X86_32 */
             c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
         }
         if (EXTERNAL_SSE2(cpu_flags)) {
@@ -293,7 +301,7 @@  av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
         }
     } else if (bit_depth == 10) {
         if (EXTERNAL_MMXEXT(cpu_flags)) {
-#if ARCH_X86_32
+#if ARCH_X86_32 || !HAVE_ALIGNED_STACK
             c->h264_v_loop_filter_chroma       = ff_deblock_v_chroma_10_mmxext;
             c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
             c->h264_v_loop_filter_luma         = ff_deblock_v_luma_10_mmxext;
diff --git a/libavcodec/x86/hpeldsp.asm b/libavcodec/x86/hpeldsp.asm
index ec04d99..2cd89e6 100644
--- a/libavcodec/x86/hpeldsp.asm
+++ b/libavcodec/x86/hpeldsp.asm
@@ -53,8 +53,10 @@  cglobal put_pixels8_x2, 4,5
 
 INIT_MMX mmxext
 PUT_PIXELS8_X2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_PIXELS8_X2
+%endif ; ARCH_X86_32
 
 
 ; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -97,8 +99,10 @@  cglobal put_pixels16_x2, 4,5
 
 INIT_MMX mmxext
 PUT_PIXELS_16
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_PIXELS_16
+%endif ; ARCH_X86_32
 
 
 ; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -138,8 +142,10 @@  cglobal put_no_rnd_pixels8_x2, 4,5
 
 INIT_MMX mmxext
 PUT_NO_RND_PIXELS8_X2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_NO_RND_PIXELS8_X2
+%endif ; ARCH_X86_32
 
 
 ; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -185,8 +191,10 @@  cglobal put_no_rnd_pixels8_x2_exact, 4,5
 
 INIT_MMX mmxext
 PUT_NO_RND_PIXELS8_X2_EXACT
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_NO_RND_PIXELS8_X2_EXACT
+%endif ; ARCH_X86_32
 
 
 ; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -219,8 +227,10 @@  cglobal put_pixels8_y2, 4,5
 
 INIT_MMX mmxext
 PUT_PIXELS8_Y2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_PIXELS8_Y2
+%endif ; ARCH_X86_32
 
 
 ; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -256,8 +266,10 @@  cglobal put_no_rnd_pixels8_y2, 4,5
 
 INIT_MMX mmxext
 PUT_NO_RND_PIXELS8_Y2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_NO_RND_PIXELS8_Y2
+%endif ; ARCH_X86_32
 
 
 ; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -298,8 +310,10 @@  cglobal put_no_rnd_pixels8_y2_exact, 4,5
 
 INIT_MMX mmxext
 PUT_NO_RND_PIXELS8_Y2_EXACT
+%if ARCH_X86_32
 INIT_MMX 3dnow
 PUT_NO_RND_PIXELS8_Y2_EXACT
+%endif ; ARCH_X86_32
 
 
 ; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -328,8 +342,10 @@  cglobal avg_pixels8, 4,5
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX 3dnow
 AVG_PIXELS8
+%endif ; ARCH_X86_32
 
 
 ; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -364,8 +380,10 @@  cglobal avg_pixels8_x2, 4,5
 
 INIT_MMX mmxext
 AVG_PIXELS8_X2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 AVG_PIXELS8_X2
+%endif ; ARCH_X86_32
 
 
 ; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -406,8 +424,10 @@  cglobal avg_pixels8_y2, 4,5
 
 INIT_MMX mmxext
 AVG_PIXELS8_Y2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 AVG_PIXELS8_Y2
+%endif ; ARCH_X86_32
 
 
 ; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
@@ -450,5 +470,7 @@  cglobal avg_pixels8_xy2, 4,5
 
 INIT_MMX mmxext
 AVG_PIXELS8_XY2
+%if ARCH_X86_32
 INIT_MMX 3dnow
 AVG_PIXELS8_XY2
+%endif ; ARCH_X86_32
diff --git a/libavcodec/x86/hpeldsp_init.c b/libavcodec/x86/hpeldsp_init.c
index 3bc5601..1d88a21 100644
--- a/libavcodec/x86/hpeldsp_init.c
+++ b/libavcodec/x86/hpeldsp_init.c
@@ -145,7 +145,9 @@  PIXELS16(static, put, , _y2, _mmx)
     PIXELS16(static, avg,        ff_,  _y2, CPUEXT) \
     PIXELS16(static, avg,        ff_, _xy2, CPUEXT)
 
+#if ARCH_X86_32
 HPELDSP_AVG_PIXELS16(_3dnow)
+#endif /* ARCH_X86_32 */
 HPELDSP_AVG_PIXELS16(_mmxext)
 
 #endif /* HAVE_YASM */
@@ -205,6 +207,7 @@  static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
 #endif /* HAVE_MMXEXT_EXTERNAL */
 }
 
+#if ARCH_X86_32
 static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
 {
 #if HAVE_AMD3DNOW_EXTERNAL
@@ -238,6 +241,7 @@  static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
     }
 #endif /* HAVE_AMD3DNOW_EXTERNAL */
 }
+#endif /* ARCH_X86_32 */
 
 static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
 {
@@ -258,8 +262,10 @@  void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
     if (INLINE_MMX(cpu_flags))
         hpeldsp_init_mmx(c, flags, cpu_flags);
 
+#if ARCH_X86_32
     if (EXTERNAL_AMD3DNOW(cpu_flags))
         hpeldsp_init_3dnow(c, flags, cpu_flags);
+#endif /* ARCH_X86_32 */
 
     if (EXTERNAL_MMXEXT(cpu_flags))
         hpeldsp_init_mmxext(c, flags, cpu_flags);
diff --git a/libavcodec/x86/imdct36.asm b/libavcodec/x86/imdct36.asm
index 633fcd9..71dfc25 100644
--- a/libavcodec/x86/imdct36.asm
+++ b/libavcodec/x86/imdct36.asm
@@ -358,8 +358,10 @@  cglobal imdct36_float, 4,4,9, out, buf, in, win
     RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_XMM sse
 DEFINE_IMDCT
+%endif ; ARCH_X86_32
 
 INIT_XMM sse2
 DEFINE_IMDCT
diff --git a/libavcodec/x86/motion_est.c b/libavcodec/x86/motion_est.c
index 41b9c5c..12fd953 100644
--- a/libavcodec/x86/motion_est.c
+++ b/libavcodec/x86/motion_est.c
@@ -329,7 +329,7 @@  static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
 }
 
 
-#define PIX_SAD(suf)\
+#define SAD8_16(suf)\
 static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 {\
     assert(h==8);\
@@ -340,6 +340,23 @@  static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h
 \
     return sum_ ## suf();\
 }\
+static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
+{\
+    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
+                 "pxor %%mm6, %%mm6     \n\t":);\
+\
+    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
+    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
+\
+    return sum_ ## suf();\
+}
+
+#if ARCH_X86_32
+SAD8_16(mmx)
+#endif /* ARCH_X86_32 */
+SAD8_16(mmxext)
+
+#define PIX_SAD(suf)\
 static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 {\
     assert(h==8);\
@@ -380,16 +397,6 @@  static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, i
     return sum_ ## suf();\
 }\
 \
-static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
-{\
-    __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
-                 "pxor %%mm6, %%mm6     \n\t":);\
-\
-    sad8_1_ ## suf(blk1  , blk2  , stride, h);\
-    sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
-\
-    return sum_ ## suf();\
-}\
 static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
 {\
     __asm__ volatile("pxor %%mm7, %%mm7     \n\t"\
@@ -439,17 +446,23 @@  av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
     int cpu_flags = av_get_cpu_flags();
 
     if (INLINE_MMX(cpu_flags)) {
+#if ARCH_X86_32
         c->pix_abs[0][0] = sad16_mmx;
+#endif /* ARCH_X86_32 */
         c->pix_abs[0][1] = sad16_x2_mmx;
         c->pix_abs[0][2] = sad16_y2_mmx;
         c->pix_abs[0][3] = sad16_xy2_mmx;
+#if ARCH_X86_32
         c->pix_abs[1][0] = sad8_mmx;
+#endif /* ARCH_X86_32 */
         c->pix_abs[1][1] = sad8_x2_mmx;
         c->pix_abs[1][2] = sad8_y2_mmx;
         c->pix_abs[1][3] = sad8_xy2_mmx;
 
+#if ARCH_X86_32
         c->sad[0]= sad16_mmx;
         c->sad[1]= sad8_mmx;
+#endif /* ARCH_X86_32 */
     }
     if (INLINE_MMXEXT(cpu_flags)) {
         c->pix_abs[0][0] = sad16_mmxext;
diff --git a/libavcodec/x86/mpegaudiodsp.c b/libavcodec/x86/mpegaudiodsp.c
index 3f0943c..5b143b7 100644
--- a/libavcodec/x86/mpegaudiodsp.c
+++ b/libavcodec/x86/mpegaudiodsp.c
@@ -217,7 +217,9 @@  static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
     }                                                                   \
 }
 
+#if ARCH_X86_32
 DECL_IMDCT_BLOCKS(sse,sse)
+#endif /* ARCH_X86_32 */
 DECL_IMDCT_BLOCKS(sse2,sse)
 DECL_IMDCT_BLOCKS(sse3,sse)
 DECL_IMDCT_BLOCKS(ssse3,sse)
@@ -249,9 +251,11 @@  av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
 #endif /* HAVE_SSE2_INLINE */
 
 #if HAVE_YASM
+#if ARCH_X86_32
     if (EXTERNAL_SSE(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse;
     }
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_SSE2(cpu_flags)) {
         s->imdct36_blocks_float = imdct36_blocks_sse2;
     }
diff --git a/libavcodec/x86/mpegvideo.c b/libavcodec/x86/mpegvideo.c
index 07fd1e5..4271d99 100644
--- a/libavcodec/x86/mpegvideo.c
+++ b/libavcodec/x86/mpegvideo.c
@@ -443,6 +443,7 @@  __asm__ volatile(
         );
 }
 
+#if ARCH_X86_32
 static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
     int *sum= s->dct_error_sum[intra];
@@ -496,6 +497,7 @@  static void  denoise_dct_mmx(MpegEncContext *s, int16_t *block){
         : "r"(block+64)
     );
 }
+#endif /* ARCH_X86_32 */
 
 static void  denoise_dct_sse2(MpegEncContext *s, int16_t *block){
     const int intra= s->mb_intra;
@@ -568,7 +570,9 @@  av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
         if(!(s->flags & CODEC_FLAG_BITEXACT))
             s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
         s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
+#if ARCH_X86_32
         s->denoise_dct = denoise_dct_mmx;
+#endif /* ARCH_X86_32 */
     }
     if (INLINE_SSE2(cpu_flags)) {
         s->denoise_dct = denoise_dct_sse2;
diff --git a/libavcodec/x86/mpegvideoenc.c b/libavcodec/x86/mpegvideoenc.c
index ca39a3b..6904eff 100644
--- a/libavcodec/x86/mpegvideoenc.c
+++ b/libavcodec/x86/mpegvideoenc.c
@@ -30,6 +30,7 @@ 
 
 extern uint16_t ff_inv_zigzag_direct16[64];
 
+#if ARCH_X86_32
 #if HAVE_MMX_INLINE
 #define COMPILE_TEMPLATE_MMXEXT 0
 #define COMPILE_TEMPLATE_SSE2   0
@@ -52,6 +53,7 @@  extern uint16_t ff_inv_zigzag_direct16[64];
 #define RENAMEl(a) a ## _mmxext
 #include "mpegvideoenc_template.c"
 #endif /* HAVE_MMXEXT_INLINE */
+#endif /* ARCH_X86_32 */
 
 #if HAVE_SSE2_INLINE
 #undef COMPILE_TEMPLATE_MMXEXT
@@ -86,8 +88,9 @@  av_cold void ff_MPV_encode_init_x86(MpegEncContext *s)
     const int dct_algo = s->avctx->dct_algo;
 
     if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
-#if HAVE_MMX_INLINE
         int cpu_flags = av_get_cpu_flags();
+#if ARCH_X86_32
+#if HAVE_MMX_INLINE
         if (INLINE_MMX(cpu_flags))
             s->dct_quantize = dct_quantize_MMX;
 #endif
@@ -95,6 +98,7 @@  av_cold void ff_MPV_encode_init_x86(MpegEncContext *s)
         if (INLINE_MMXEXT(cpu_flags))
             s->dct_quantize = dct_quantize_MMXEXT;
 #endif
+#endif /* ARCH_X86_32 */
 #if HAVE_SSE2_INLINE
         if (INLINE_SSE2(cpu_flags))
             s->dct_quantize = dct_quantize_SSE2;
diff --git a/libavcodec/x86/rv34dsp.asm b/libavcodec/x86/rv34dsp.asm
index 4d9c35b..f1b84f6 100644
--- a/libavcodec/x86/rv34dsp.asm
+++ b/libavcodec/x86/rv34dsp.asm
@@ -63,6 +63,7 @@  rv34_idct dc
 %define IDCT_DC IDCT_DC_NOROUND
 rv34_idct dc_noround
 
+%if ARCH_X86_32
 ; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
 INIT_MMX mmx
 cglobal rv34_idct_dc_add, 3, 3
@@ -97,6 +98,7 @@  cglobal rv34_idct_dc_add, 3, 3
     movh       [r2], m4
     movh       [r2+r1], m5
     RET
+%endif ; ARCH_X86_32
 
 ; Load coeffs and perform row transform
 ; Output: coeffs in mm[0467], rounder in mm5
diff --git a/libavcodec/x86/rv34dsp_init.c b/libavcodec/x86/rv34dsp_init.c
index 5f284b8..91d0280 100644
--- a/libavcodec/x86/rv34dsp_init.c
+++ b/libavcodec/x86/rv34dsp_init.c
@@ -34,8 +34,10 @@  av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
 {
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (EXTERNAL_MMX(cpu_flags))
         c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
         c->rv34_idct_add         = ff_rv34_idct_add_mmxext;
diff --git a/libavcodec/x86/rv40dsp.asm b/libavcodec/x86/rv40dsp.asm
index 7ec72be..694d07d 100644
--- a/libavcodec/x86/rv40dsp.asm
+++ b/libavcodec/x86/rv40dsp.asm
@@ -486,11 +486,13 @@  cglobal rv40_weight_func_%1_%2, 6, 7, 8
     REP_RET
 %endmacro
 
+%if ARCH_X86_32
 INIT_MMX mmxext
 RV40_WEIGHT   rnd,    8, 3
 RV40_WEIGHT   rnd,   16, 4
 RV40_WEIGHT   nornd,  8, 3
 RV40_WEIGHT   nornd, 16, 4
+%endif ; ARCH_X86_32
 
 INIT_XMM sse2
 RV40_WEIGHT   rnd,    8, 3
diff --git a/libavcodec/x86/rv40dsp_init.c b/libavcodec/x86/rv40dsp_init.c
index 781f467..0d75a00 100644
--- a/libavcodec/x86/rv40dsp_init.c
+++ b/libavcodec/x86/rv40dsp_init.c
@@ -232,23 +232,23 @@  av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
         QPEL_MC_SET(put_, _mmx)
 #endif
     }
+#if ARCH_X86_32
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
         c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
         c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
-#if ARCH_X86_32
         QPEL_MC_SET(avg_, _3dnow)
-#endif
     }
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         c->avg_chroma_pixels_tab[0]     = ff_avg_rv40_chroma_mc8_mmxext;
         c->avg_chroma_pixels_tab[1]     = ff_avg_rv40_chroma_mc4_mmxext;
+#if ARCH_X86_32
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
         c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
         c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
         c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
-#if ARCH_X86_32
         QPEL_MC_SET(avg_, _mmxext)
-#endif
+#endif /* ARCH_X86_32 */
     }
     if (EXTERNAL_SSE2(cpu_flags)) {
         c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 9f18131..5f16659 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -103,9 +103,11 @@  av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
     if (EXTERNAL_MMX(cpu_flags)) {
         dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
     }
+#if ARCH_X86_32
     if (EXTERNAL_AMD3DNOW(cpu_flags)) {
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
     }
+#endif /* ARCH_X86_32 */
     if (EXTERNAL_MMXEXT(cpu_flags)) {
         ASSIGN_LF(mmxext);
         dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
diff --git a/libavutil/x86/float_dsp_init.c b/libavutil/x86/float_dsp_init.c
index a04d91c..82d8e4b 100644
--- a/libavutil/x86/float_dsp_init.c
+++ b/libavutil/x86/float_dsp_init.c
@@ -57,6 +57,7 @@  float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
 void ff_butterflies_float_sse(float *src0, float *src1, int len);
 
 #if HAVE_6REGS && HAVE_INLINE_ASM
+#if ARCH_X86_32
 static void vector_fmul_window_3dnowext(float *dst, const float *src0,
                                         const float *src1, const float *win,
                                         int len)
@@ -88,6 +89,7 @@  static void vector_fmul_window_3dnowext(float *dst, const float *src0,
         : "r"(dst + len), "r"(src0 + len), "r"(src1), "r"(win + len)
     );
 }
+#endif /* ARCH_X86_32 */
 
 static void vector_fmul_window_sse(float *dst, const float *src0,
                                    const float *src1, const float *win, int len)
@@ -127,9 +129,11 @@  av_cold void ff_float_dsp_init_x86(AVFloatDSPContext *fdsp)
     int cpu_flags = av_get_cpu_flags();
 
 #if HAVE_6REGS && HAVE_INLINE_ASM
+#if ARCH_X86_32
     if (INLINE_AMD3DNOWEXT(cpu_flags)) {
         fdsp->vector_fmul_window  = vector_fmul_window_3dnowext;
     }
+#endif /* ARCH_X86_32 */
     if (INLINE_SSE(cpu_flags)) {
         fdsp->vector_fmul_window = vector_fmul_window_sse;
     }
diff --git a/libswscale/x86/rgb2rgb.c b/libswscale/x86/rgb2rgb.c
index d4f2580..5b4f073 100644
--- a/libswscale/x86/rgb2rgb.c
+++ b/libswscale/x86/rgb2rgb.c
@@ -90,11 +90,24 @@  DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 #define COMPILE_TEMPLATE_AMD3DNOW 0
 #define COMPILE_TEMPLATE_SSE2 0
 
+#if ARCH_X86_32
 //MMX versions
 #undef RENAME
 #define RENAME(a) a ## _MMX
 #include "rgb2rgb_template.c"
 
+//3DNOW versions
+#undef RENAME
+#define COMPILE_TEMPLATE_MMXEXT 0
+#define COMPILE_TEMPLATE_SSE2 0
+#define COMPILE_TEMPLATE_AMD3DNOW 1
+#define RENAME(a) a ## _3DNOW
+#include "rgb2rgb_template.c"
+#undef COMPILE_TEMPLATE_MMXEXT
+#undef COMPILE_TEMPLATE_SSE2
+#undef COMPILE_TEMPLATE_AMD3DNOW
+#endif /* ARCH_X86_32 */
+
 // MMXEXT versions
 #undef RENAME
 #undef COMPILE_TEMPLATE_MMXEXT
@@ -109,17 +122,6 @@  DECLARE_ASM_CONST(8, uint64_t, blue_15mask)  = 0x0000001f0000001fULL;
 #define RENAME(a) a ## _SSE2
 #include "rgb2rgb_template.c"
 
-//3DNOW versions
-#undef RENAME
-#undef COMPILE_TEMPLATE_MMXEXT
-#undef COMPILE_TEMPLATE_SSE2
-#undef COMPILE_TEMPLATE_AMD3DNOW
-#define COMPILE_TEMPLATE_MMXEXT 0
-#define COMPILE_TEMPLATE_SSE2 0
-#define COMPILE_TEMPLATE_AMD3DNOW 1
-#define RENAME(a) a ## _3DNOW
-#include "rgb2rgb_template.c"
-
 /*
  RGB15->RGB16 original by Strepto/Astral
  ported to gcc & bugfixed : A'rpi
@@ -134,10 +136,12 @@  av_cold void rgb2rgb_init_x86(void)
 #if HAVE_INLINE_ASM
     int cpu_flags = av_get_cpu_flags();
 
+#if ARCH_X86_32
     if (INLINE_MMX(cpu_flags))
         rgb2rgb_init_MMX();
     if (INLINE_AMD3DNOW(cpu_flags))
         rgb2rgb_init_3DNOW();
+#endif /* ARCH_X86_32 */
     if (INLINE_MMXEXT(cpu_flags))
         rgb2rgb_init_MMXEXT();
     if (INLINE_SSE2(cpu_flags))