[v2] dsputil: x86: Convert mpeg4 qpel and dsputil avg to yasm

Message ID 1359178336-9924-1-git-send-email-daniel.d.kang@gmail.com
State Superseded

Commit Message

Daniel Kang Jan. 26, 2013, 5:32 a.m.
---
Rebased on head
---
 libavcodec/x86/dsputil.asm            |  986 +++++++++++++++++++++++++++++++++
 libavcodec/x86/dsputil_avg_template.c |  791 +-------------------------
 libavcodec/x86/dsputil_mmx.c          |  923 ++++++++++++------------------
 libavcodec/x86/vc1dsp_mmx.c           |    4 +
 4 files changed, 1351 insertions(+), 1353 deletions(-)

Comments

Diego Biurrun Jan. 26, 2013, 7:49 a.m. | #1
On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
> ---
> Rebased on head
> ---
>  libavcodec/x86/dsputil.asm            |  986 +++++++++++++++++++++++++++++++++
>  libavcodec/x86/dsputil_avg_template.c |  791 +-------------------------
>  libavcodec/x86/dsputil_mmx.c          |  923 ++++++++++++------------------
>  libavcodec/x86/vc1dsp_mmx.c           |    4 +
>  4 files changed, 1351 insertions(+), 1353 deletions(-)

Can qpel and avg be split?  This patch is once again huge ...

Diego
Diego Biurrun Jan. 26, 2013, 8:23 a.m. | #2
On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
> --- a/libavcodec/x86/dsputil.asm
> +++ b/libavcodec/x86/dsputil.asm
> @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4
> +
> +; HPEL mmxext
> +%macro PAVGB_OP 2
> +%if cpuflag(3dnow)
> +    pavgusb %1, %2
> +%else
> +    pavgb   %1, %2
> +%endif
> +%endmacro

We have a macro for this in x86util.asm and it works the other way around.
I'm very suspicious of this doing the right thing on CPUs with mmxext and
3dnow ...
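
For comparison, the x86util.asm macro checks the newer instruction set
first.  A minimal sketch of that ordering applied to this macro (not the
verbatim x86util code):

    %macro PAVGB_OP 2
    %if cpuflag(mmxext)      ; prefer the MMXEXT instruction when present
        pavgb   %1, %2
    %elif cpuflag(3dnow)     ; fall back to the 3DNow! equivalent
        pavgusb %1, %2
    %endif
    %endmacro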

> +; mpeg4 qpel
> +
> +%macro MPEG4_QPEL16_H_LOWPASS 1
> +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8

So it seems like dsputil.asm is becoming the new dumping ground for
functions of all kinds.  It doubles in size after your patch, and at
around 2k lines it starts to work against our current efforts to split
dsputil into sensibly-sized pieces.  If you continue your porting
efforts, it will probably end up around 5k lines or so.

Whenever an opportunity to make dsputil less monolithic comes up, we
should exploit it.  That seems to be the case here.

> +%macro QPEL_V_LOW 5
> +    paddw      m0, m1
> +    mova       m4, [pw_20]
> +    pmullw     m4, m0
> +    mova       m0, %4
> +    mova       m5, %1
> +    paddw      m5, m0
> +    psubw      m4, m5
> +    mova       m5, %2
> +    mova       m6, %3
> +    paddw      m5, m3
> +    paddw      m6, m2
> +    paddw      m6, m6
> +    psubw      m5, m6
> +    pmullw     m5, [pw_3]
> +    paddw      m4, [PW_ROUND]
> +    paddw      m5, m4
> +    psraw      m5, 5
> +    packuswb   m5, m5
> +    OP_MOV     %5, m5, m7
> +    SWAP 0,1,2,3
> +%endmacro

nit: SWAP is not special, format its arguments like the rest of the
macro instructions.
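
Formatted like the surrounding instructions, that last line would read:

    SWAP       0, 1, 2, 3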

> --- a/libavcodec/x86/dsputil_avg_template.c
> +++ b/libavcodec/x86/dsputil_avg_template.c
> @@ -24,781 +24,32 @@
>  //FIXME the following could be optimized too ...
> +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
> +    DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
>  }
> +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
> +    DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
>  }
> +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
> +    DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
>  }
> +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
> +    DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
>  }
> +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
> +    DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
>  }
> +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
> +    DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
>  }
> +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> +    DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
> +    DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
>  }

If you feel motivated, you could fix the formatting as you are changing
all lines anyway.

> --- a/libavcodec/x86/dsputil_mmx.c
> +++ b/libavcodec/x86/dsputil_mmx.c
> @@ -80,6 +80,143 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
> +
> +#if HAVE_YASM
> +/* VC-1-specific */
> +#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
> +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
> +                               int stride, int rnd)
> +{
> +    ff_put_pixels8_mmx(dst, src, stride, 8);
> +}
> +
> +void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
> +                                  int stride, int rnd)
> +{
> +    ff_avg_pixels8_mmxext(dst, src, stride, 8);
> +}
> +
> +
> +/***********************************/
> +/* 3Dnow specific */
> +
> +#define DEF(x) x ## _3dnow
> +
> +#include "dsputil_avg_template.c"
> +
> +#undef DEF
> +
> +/***********************************/
> +/* MMXEXT specific */
> +
> +#define DEF(x) x ## _mmxext
> +
> +#include "dsputil_avg_template.c"
> +
> +#undef DEF
> +
> +#endif /* HAVE_YASM */

Please keep these blocks where they are for now to make the patch
more readable.  We can move them around later.

Diego
Daniel Kang Jan. 26, 2013, 6:01 p.m. | #3
On Sat, Jan 26, 2013 at 3:23 AM, Diego Biurrun <diego@biurrun.de> wrote:
> On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
>> --- a/libavcodec/x86/dsputil.asm
>> +++ b/libavcodec/x86/dsputil.asm
>> @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4
>> +
>> +; HPEL mmxext
>> +%macro PAVGB_OP 2
>> +%if cpuflag(3dnow)
>> +    pavgusb %1, %2
>> +%else
>> +    pavgb   %1, %2
>> +%endif
>> +%endmacro
>
> We have a macro for this in x86util.asm and it works the other way around.
> I'm very suspicious of this doing the right thing on CPUs with mmxext and
> 3dnow ...

You're probably right. Fixed.

>> +; mpeg4 qpel
>> +
>> +%macro MPEG4_QPEL16_H_LOWPASS 1
>> +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
>
> So it seems like dsputil.asm is becoming the new dumping ground for
> functions of all kinds.  It doubles in size after your patch, and at
> around 2k lines it starts to work against our current efforts to split
> dsputil into sensibly-sized pieces.  If you continue your porting
> efforts, it will probably end up around 5k lines or so.
>
> Whenever an opportunity to make dsputil less monolithic comes up, we
> should exploit it.  That seems to be the case here.

I was trying to avoid drama and bikeshedding re: file names and save
that for another patch. I guess I could split it in this patch if you
want.

>> +%macro QPEL_V_LOW 5
>> +    paddw      m0, m1
>> +    mova       m4, [pw_20]
>> +    pmullw     m4, m0
>> +    mova       m0, %4
>> +    mova       m5, %1
>> +    paddw      m5, m0
>> +    psubw      m4, m5
>> +    mova       m5, %2
>> +    mova       m6, %3
>> +    paddw      m5, m3
>> +    paddw      m6, m2
>> +    paddw      m6, m6
>> +    psubw      m5, m6
>> +    pmullw     m5, [pw_3]
>> +    paddw      m4, [PW_ROUND]
>> +    paddw      m5, m4
>> +    psraw      m5, 5
>> +    packuswb   m5, m5
>> +    OP_MOV     %5, m5, m7
>> +    SWAP 0,1,2,3
>> +%endmacro
>
> nit: SWAP is not special, format its arguments like the rest of the
> macro instructions.

I disagree on this one, I think SWAP is special.

>> --- a/libavcodec/x86/dsputil_avg_template.c
>> +++ b/libavcodec/x86/dsputil_avg_template.c
>> @@ -24,781 +24,32 @@
>>  //FIXME the following could be optimized too ...
>> +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
>> +    DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
>>  }
>> +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
>> +    DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
>>  }
>> +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
>> +    DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
>>  }
>> +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
>> +    DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
>>  }
>> +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
>> +    DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
>>  }
>> +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
>> +    DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
>>  }
>> +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> +    DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
>> +    DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
>>  }
>
> If you feel motivated, you could fix the formatting as you are changing
> all lines anyway.

Fixed.

>> --- a/libavcodec/x86/dsputil_mmx.c
>> +++ b/libavcodec/x86/dsputil_mmx.c
>> @@ -80,6 +80,143 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
>> +
>> +#if HAVE_YASM
>> +/* VC-1-specific */
>> +#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
>> +void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
>> +                               int stride, int rnd)
>> +{
>> +    ff_put_pixels8_mmx(dst, src, stride, 8);
>> +}
>> +
>> +void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
>> +                                  int stride, int rnd)
>> +{
>> +    ff_avg_pixels8_mmxext(dst, src, stride, 8);
>> +}
>> +
>> +
>> +/***********************************/
>> +/* 3Dnow specific */
>> +
>> +#define DEF(x) x ## _3dnow
>> +
>> +#include "dsputil_avg_template.c"
>> +
>> +#undef DEF
>> +
>> +/***********************************/
>> +/* MMXEXT specific */
>> +
>> +#define DEF(x) x ## _mmxext
>> +
>> +#include "dsputil_avg_template.c"
>> +
>> +#undef DEF
>> +
>> +#endif /* HAVE_YASM */
>
> Please keep these blocks where they are for now to make the patch
> more readable.  We can move them around later.

Fixed.
Diego Biurrun Jan. 26, 2013, 6:25 p.m. | #4
On Sat, Jan 26, 2013 at 01:01:09PM -0500, Daniel Kang wrote:
> On Sat, Jan 26, 2013 at 3:23 AM, Diego Biurrun <diego@biurrun.de> wrote:
> > On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
> >> --- a/libavcodec/x86/dsputil.asm
> >> +++ b/libavcodec/x86/dsputil.asm
> >> @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4
> >> +; mpeg4 qpel
> >> +
> >> +%macro MPEG4_QPEL16_H_LOWPASS 1
> >> +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
> >
> > So it seems like dsputil.asm is becoming the new dumping ground for
> > functions of all kinds.  It doubles in size after your patch, and at
> > around 2k lines it starts to work against our current efforts to split
> > dsputil into sensibly-sized pieces.  If you continue your porting
> > efforts, it will probably end up around 5k lines or so.
> >
> > Whenever an opportunity to make dsputil less monolithic comes up, we
> > should exploit it.  That seems to be the case here.
> 
> I was trying to avoid drama and bikeshedding re: file names and save
> that for another patch. I guess I could split it in this patch if you
> want.

Come on, don't blow the issue out of proportion.  Just come up with a
suitable name, maybe ask one or two other people who know the code for
suggestions.  My suggestion would be mpeg4qpel.asm, maybe
h263qpel.asm, but the former is probably more fitting, not sure.

Even if you get three different suggestions and switch from one to the
other, it's easy enough with git and will not hinder your workflow at
all.

However, going back and forth after your patch has been pushed just
creates unnecessary churn and annoyance.

> >> +%macro QPEL_V_LOW 5
> >> +    paddw      m0, m1
> >> +    mova       m4, [pw_20]
> >> +    pmullw     m4, m0
> >> +    mova       m0, %4
> >> +    mova       m5, %1
> >> +    paddw      m5, m0
> >> +    psubw      m4, m5
> >> +    mova       m5, %2
> >> +    mova       m6, %3
> >> +    paddw      m5, m3
> >> +    paddw      m6, m2
> >> +    paddw      m6, m6
> >> +    psubw      m5, m6
> >> +    pmullw     m5, [pw_3]
> >> +    paddw      m4, [PW_ROUND]
> >> +    paddw      m5, m4
> >> +    psraw      m5, 5
> >> +    packuswb   m5, m5
> >> +    OP_MOV     %5, m5, m7
> >> +    SWAP 0,1,2,3
> >> +%endmacro
> >
> > nit: SWAP is not special, format its arguments like the rest of the
> > macro instructions.
> 
> I disagree on this one, I think SWAP is special.

The rest of the codebase disagrees with you then.  In the rest of the
files, SWAP has spaces after the commas and its arguments aligned with
the other instructions.

> >> --- a/libavcodec/x86/dsputil_avg_template.c
> >> +++ b/libavcodec/x86/dsputil_avg_template.c
> >> @@ -24,781 +24,32 @@
> >>  //FIXME the following could be optimized too ...
> >> +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
> >> +    DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
> >>  }
> >> +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
> >> +    DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
> >>  }
> >> +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
> >> +    DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
> >>  }
> >> +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
> >> +    DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
> >>  }
> >> +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
> >> +    DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
> >>  }
> >> +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
> >> +    DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
> >>  }
> >> +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
> >> +    DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
> >> +    DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
> >>  }
> >
> > If you feel motivated, you could fix the formatting as you are changing
> > all lines anyway.
> 
> Fixed.

Hehe, sort of :)

Try running the following (GNU) sed command on your tree:

  sed -i -e 's/+/ + /g' -e 's/  ,/,    /g' libavcodec/x86/dsputil_avg_template.c

That should prettyprint it nicely.

> >> --- a/libavcodec/x86/dsputil_mmx.c
> >> +++ b/libavcodec/x86/dsputil_mmx.c
> >> @@ -80,6 +80,143 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
> >> +
> >> +/***********************************/
> >> +/* 3Dnow specific */
> >> +
> >> +#define DEF(x) x ## _3dnow
> >> +
> >> +#include "dsputil_avg_template.c"
> >> +
> >> +#undef DEF
> >> +
> >> +/***********************************/
> >> +/* MMXEXT specific */
> >> +
> >> +#define DEF(x) x ## _mmxext
> >> +
> >> +#include "dsputil_avg_template.c"
> >> +
> >> +#undef DEF
> >> +
> >> +#endif /* HAVE_YASM */
> >
> > Please keep these blocks where they are for now to make the patch
> > more readable.  We can move them around later.
> 
> Fixed.

The template inclusion appears to get moved around still...

Diego
Daniel Kang Jan. 26, 2013, 7:06 p.m. | #5
On Sat, Jan 26, 2013 at 1:25 PM, Diego Biurrun <diego@biurrun.de> wrote:
> On Sat, Jan 26, 2013 at 01:01:09PM -0500, Daniel Kang wrote:
>> On Sat, Jan 26, 2013 at 3:23 AM, Diego Biurrun <diego@biurrun.de> wrote:
>> > On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
>> >> --- a/libavcodec/x86/dsputil.asm
>> >> +++ b/libavcodec/x86/dsputil.asm
>> >> @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4
>> >> +; mpeg4 qpel
>> >> +
>> >> +%macro MPEG4_QPEL16_H_LOWPASS 1
>> >> +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
>> >
>> > So it seems like dsputil.asm is becoming the new dumping ground for
>> > functions of all kinds.  It doubles in size after your patch, and at
>> > around 2k lines it starts to work against our current efforts to split
>> > dsputil into sensibly-sized pieces.  If you continue your porting
>> > efforts, it will probably end up around 5k lines or so.
>> >
>> > Whenever an opportunity to make dsputil less monolithic comes up, we
>> > should exploit it.  That seems to be the case here.
>>
>> I was trying to avoid drama and bikeshedding re: file names and save
>> that for another patch. I guess I could split it in this patch if you
>> want.
>
> Come on, don't blow the issue out of proportion.  Just come up with a
> suitable name, maybe ask one or two other people who know the code for
> suggestions.  My suggestion would be mpeg4qpel.asm, maybe
> h263qpel.asm, but the former is probably more fitting, not sure.
>
> Even if you get three different suggestions and switch from one to the
> other, it's easy enough with git and will not hinder your workflow at
> all.
>
> However, going back and forth after your patch has been pushed just
> creates unnecessary churn and annoyance.

Very well, moved to mpeg4qpel.asm

>> >> +%macro QPEL_V_LOW 5
>> >> +    paddw      m0, m1
>> >> +    mova       m4, [pw_20]
>> >> +    pmullw     m4, m0
>> >> +    mova       m0, %4
>> >> +    mova       m5, %1
>> >> +    paddw      m5, m0
>> >> +    psubw      m4, m5
>> >> +    mova       m5, %2
>> >> +    mova       m6, %3
>> >> +    paddw      m5, m3
>> >> +    paddw      m6, m2
>> >> +    paddw      m6, m6
>> >> +    psubw      m5, m6
>> >> +    pmullw     m5, [pw_3]
>> >> +    paddw      m4, [PW_ROUND]
>> >> +    paddw      m5, m4
>> >> +    psraw      m5, 5
>> >> +    packuswb   m5, m5
>> >> +    OP_MOV     %5, m5, m7
>> >> +    SWAP 0,1,2,3
>> >> +%endmacro
>> >
>> > nit: SWAP is not special, format its arguments like the rest of the
>> > macro instructions.
>>
>> I disagree on this one, I think SWAP is special.
>
> The rest of the codebase disagrees with you then.  In the rest of the
> files, SWAP has spaces after the commas and its arguments aligned with
> the other instructions.

Only some of it does, but changed.

>> >> --- a/libavcodec/x86/dsputil_avg_template.c
>> >> +++ b/libavcodec/x86/dsputil_avg_template.c
>> >> @@ -24,781 +24,32 @@
>> >>  //FIXME the following could be optimized too ...
>> >> +static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
>> >>  }
>> >> +static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
>> >>  }
>> >> +static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
>> >>  }
>> >> +static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
>> >>  }
>> >> +static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
>> >>  }
>> >> +static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
>> >>  }
>> >> +static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
>> >> +    DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
>> >> +    DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
>> >>  }
>> >
>> > If you feel motivated, you could fix the formatting as you are changing
>> > all lines anyway.
>>
>> Fixed.
>
> Hehe, sort of :)
>
> Try running the following (GNU) sed command on your tree:
>
>   sed -i -e 's/+/ + /g' -e 's/  ,/,    /g' libavcodec/x86/dsputil_avg_template.c
>
> That should prettyprint it nicely.

Fixed.

>> >> --- a/libavcodec/x86/dsputil_mmx.c
>> >> +++ b/libavcodec/x86/dsputil_mmx.c
>> >> @@ -80,6 +80,143 @@ DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
>> >> +
>> >> +/***********************************/
>> >> +/* 3Dnow specific */
>> >> +
>> >> +#define DEF(x) x ## _3dnow
>> >> +
>> >> +#include "dsputil_avg_template.c"
>> >> +
>> >> +#undef DEF
>> >> +
>> >> +/***********************************/
>> >> +/* MMXEXT specific */
>> >> +
>> >> +#define DEF(x) x ## _mmxext
>> >> +
>> >> +#include "dsputil_avg_template.c"
>> >> +
>> >> +#undef DEF
>> >> +
>> >> +#endif /* HAVE_YASM */
>> >
>> > Please keep these blocks where they are for now to make the patch
>> > more readable.  We can move them around later.
>>
>> Fixed.
>
> The template inclusion appears to get moved around still...

Oops, fixed.
Ronald Bultje Jan. 27, 2013, 12:06 a.m. | #6
Hi,

On Sat, Jan 26, 2013 at 10:01 AM, Daniel Kang <daniel.d.kang@gmail.com> wrote:
> On Sat, Jan 26, 2013 at 3:23 AM, Diego Biurrun <diego@biurrun.de> wrote:
>> On Sat, Jan 26, 2013 at 12:32:16AM -0500, Daniel Kang wrote:
>>> --- a/libavcodec/x86/dsputil.asm
>>> +++ b/libavcodec/x86/dsputil.asm
>>> @@ -879,3 +884,984 @@ cglobal avg_pixels16, 4,5,4
>>> +
>>> +; HPEL mmxext
>>> +%macro PAVGB_OP 2
>>> +%if cpuflag(3dnow)
>>> +    pavgusb %1, %2
>>> +%else
>>> +    pavgb   %1, %2
>>> +%endif
>>> +%endmacro
>>
>> We have a macro for this in x86util.asm and it works the other way around.
>> I'm very suspicious of this doing the right thing on CPUs with mmxext and
>> 3dnow ...
>
> You're probably right. Fixed.
>
>>> +; mpeg4 qpel
>>> +
>>> +%macro MPEG4_QPEL16_H_LOWPASS 1
>>> +cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
>>
>> So it seems like dsputil.asm is becoming the new dumping ground for
>> functions of all kinds.  It doubles in size after your patch, and at
>> around 2k lines it starts to work against our current efforts to split
>> dsputil into sensibly-sized pieces.  If you continue your porting
>> efforts, it will probably end up around 5k lines or so.
>>
>> Whenever an opportunity to make dsputil less monolithic comes up, we
>> should exploit it.  That seems to be the case here.
>
> I was trying to avoid drama and bikeshedding re: file names and save
> that for another patch. I guess I could split it in this patch if you
> want.

While you're at it, please split the hpel functions out into a new file
called hpeldsp.asm.  This will make my life slightly easier later on.

Ronald

Patch

diff --git a/libavcodec/x86/dsputil.asm b/libavcodec/x86/dsputil.asm
index 65f4b37..6ba080f 100644
--- a/libavcodec/x86/dsputil.asm
+++ b/libavcodec/x86/dsputil.asm
@@ -22,6 +22,11 @@ 
 %include "libavutil/x86/x86util.asm"
 
 SECTION_RODATA
+cextern pb_1
+cextern pw_3
+cextern pw_15
+cextern pw_16
+cextern pw_20
 pb_f: times 16 db 15
 pb_zzzzzzzz77777777: times 8 db -1
 pb_7: times 8 db 7
@@ -879,3 +884,984 @@  cglobal avg_pixels16, 4,5,4
     lea          r0, [r0+r2*4]
     jnz       .loop
     REP_RET
+
+
+; HPEL mmxext
+%macro PAVGB_OP 2
+%if cpuflag(3dnow)
+    pavgusb %1, %2
+%else
+    pavgb   %1, %2
+%endif
+%endmacro
+
+; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
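+; Half-pel interpolation in x: each output byte is the (rounded-up)
+; average of a source byte and its right-hand neighbour; the loop
+; processes four rows per iteration.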
+%macro PUT_PIXELS8_X2 0
+cglobal put_pixels8_x2, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r1]
+    mova         m1, [r1+r2]
+    PAVGB_OP     m0, [r1+1]
+    PAVGB_OP     m1, [r1+r2+1]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r1, r4
+    add          r0, r4
+    mova         m0, [r1]
+    mova         m1, [r1+r2]
+    PAVGB_OP     m0, [r1+1]
+    PAVGB_OP     m1, [r1+r2+1]
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_PIXELS8_X2
+
+
+; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
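+; The pxor/PAVGB_OP/pxor sequence below computes a truncating average:
+; PAVGB_OP rounds up ((a + b + 1) >> 1), so averaging the complemented
+; inputs and complementing the result gives the no-rnd (a + b) >> 1.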
+%macro PUT_NO_RND_PIXELS8_L2 0
+cglobal put_no_rnd_pixels8_l2, 6,6
+    movsxdifnidn r4, r4d
+    movsxdifnidn r3, r3d
+    pcmpeqb      m6, m6
+    test        r5d, 1
+    je .loop
+    mova         m0, [r1]
+    mova         m1, [r2]
+    add          r1, r4
+    add          r2, 8
+    pxor         m0, m6
+    pxor         m1, m6
+    PAVGB_OP     m0, m1
+    pxor         m0, m6
+    mova       [r0], m0
+    add          r0, r3
+    dec r5d
+.loop:
+    mova         m0, [r1]
+    add          r1, r4
+    mova         m1, [r1]
+    add          r1, r4
+    mova         m2, [r2]
+    mova         m3, [r2+8]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m2
+    PAVGB_OP     m1, m3
+    pxor         m0, m6
+    pxor         m1, m6
+    mova       [r0], m0
+    add          r0, r3
+    mova       [r0], m1
+    add          r0, r3
+    mova         m0, [r1]
+    add          r1, r4
+    mova         m1, [r1]
+    add          r1, r4
+    mova         m2, [r2+16]
+    mova         m3, [r2+24]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m2
+    PAVGB_OP     m1, m3
+    pxor         m0, m6
+    pxor         m1, m6
+    mova       [r0], m0
+    add          r0, r3
+    mova       [r0], m1
+    add          r0, r3
+    add          r2, 32
+    sub         r5d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_L2
+
+
+; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro PUT_PIXELS16_X2 0
+cglobal put_pixels16_x2, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r1]
+    mova         m1, [r1+r2]
+    mova         m2, [r1+8]
+    mova         m3, [r1+r2+8]
+    PAVGB_OP     m0, [r1+1]
+    PAVGB_OP     m1, [r1+r2+1]
+    PAVGB_OP     m2, [r1+9]
+    PAVGB_OP     m3, [r1+r2+9]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova     [r0+8], m2
+    mova  [r0+r2+8], m3
+    add          r1, r4
+    add          r0, r4
+    mova         m0, [r1]
+    mova         m1, [r1+r2]
+    mova         m2, [r1+8]
+    mova         m3, [r1+r2+8]
+    PAVGB_OP     m0, [r1+1]
+    PAVGB_OP     m1, [r1+r2+1]
+    PAVGB_OP     m2, [r1+9]
+    PAVGB_OP     m3, [r1+r2+9]
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova     [r0+8], m2
+    mova  [r0+r2+8], m3
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS16_X2
+INIT_MMX 3dnow
+PUT_PIXELS16_X2
+
+
+; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
+%macro PUT_NO_RND_PIXELS16_L2 0
+cglobal put_no_rnd_pixels16_l2, 6,6
+    movsxdifnidn r3, r3d
+    movsxdifnidn r4, r4d
+    pcmpeqb      m6, m6
+    test        r5d, 1
+    je .loop
+    mova         m0, [r1]
+    mova         m1, [r1+8]
+    mova         m2, [r2]
+    mova         m3, [r2+8]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m2
+    PAVGB_OP     m1, m3
+    pxor         m0, m6
+    pxor         m1, m6
+    add          r1, r4
+    add          r2, 16
+    mova       [r0], m0
+    mova     [r0+8], m1
+    add          r0, r3
+    dec r5d
+.loop:
+    mova         m0, [r1]
+    mova         m1, [r1+8]
+    add          r1, r4
+    mova         m2, [r2]
+    mova         m3, [r2+8]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m2
+    PAVGB_OP     m1, m3
+    pxor         m0, m6
+    pxor         m1, m6
+    mova       [r0], m0
+    mova     [r0+8], m1
+    add          r0, r3
+    mova         m0, [r1]
+    mova         m1, [r1+8]
+    add          r1, r4
+    mova         m2, [r2+16]
+    mova         m3, [r2+24]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m2
+    PAVGB_OP     m1, m3
+    pxor         m0, m6
+    pxor         m1, m6
+    mova       [r0], m0
+    mova     [r0+8], m1
+    add          r0, r3
+    add          r2, 32
+    sub         r5d, 2
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS16_L2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS16_L2
+
+
+; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro PUT_NO_RND_PIXELS8_X2 0
+cglobal put_no_rnd_pixels8_x2, 4,5
+    mova         m6, [pb_1]
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    mova         m1, [r1+1]
+    mova         m3, [r1+r2+1]
+    add          r1, r4
+    psubusb      m0, m6
+    psubusb      m2, m6
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m2, m3
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    mova         m0, [r1]
+    mova         m1, [r1+1]
+    mova         m2, [r1+r2]
+    mova         m3, [r1+r2+1]
+    add          r0, r4
+    add          r1, r4
+    psubusb      m0, m6
+    psubusb      m2, m6
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m2, m3
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2
+
+
+; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
+cglobal put_no_rnd_pixels8_x2_exact, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*3]
+    pcmpeqb      m6, m6
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    mova         m1, [r1+1]
+    mova         m3, [r1+r2+1]
+    pxor         m0, m6
+    pxor         m2, m6
+    pxor         m1, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m2, m3
+    pxor         m0, m6
+    pxor         m2, m6
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    mova         m0, [r1+r2*2]
+    mova         m1, [r1+r2*2+1]
+    mova         m2, [r1+r4]
+    mova         m3, [r1+r4+1]
+    pxor         m0, m6
+    pxor         m1, m6
+    pxor         m2, m6
+    pxor         m3, m6
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m2, m3
+    pxor         m0, m6
+    pxor         m2, m6
+    mova  [r0+r2*2], m0
+    mova    [r0+r4], m2
+    lea          r1, [r1+r2*4]
+    lea          r0, [r0+r2*4]
+    sub         r3d, 4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_X2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_X2_EXACT
+
+
+; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
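+; Half-pel interpolation in y: each output row is the average of a
+; source row and the row below it; the bottom row is carried over in a
+; register between iterations.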
+%macro PUT_PIXELS8_Y2 0
+cglobal put_pixels8_y2, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+    mova         m0, [r1]
+    sub          r0, r2
+.loop:
+    mova         m1, [r1+r2]
+    mova         m2, [r1+r4]
+    add          r1, r4
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m1, m2
+    mova    [r0+r2], m0
+    mova    [r0+r4], m1
+    mova         m1, [r1+r2]
+    mova         m0, [r1+r4]
+    add          r0, r4
+    add          r1, r4
+    PAVGB_OP     m2, m1
+    PAVGB_OP     m1, m0
+    mova    [r0+r2], m2
+    mova    [r0+r4], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_PIXELS8_Y2
+
+
+; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro PUT_NO_RND_PIXELS8_Y2 0
+cglobal put_no_rnd_pixels8_y2, 4,5
+    mova         m6, [pb_1]
+    movsxdifnidn r2, r2d
+    lea          r4, [r2+r2]
+    mova         m0, [r1]
+    sub          r0, r2
+.loop:
+    mova         m1, [r1+r2]
+    mova         m2, [r1+r4]
+    add          r1, r4
+    psubusb      m1, m6
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m1, m2
+    mova    [r0+r2], m0
+    mova    [r0+r4], m1
+    mova         m1, [r1+r2]
+    mova         m0, [r1+r4]
+    add          r0, r4
+    add          r1, r4
+    psubusb      m1, m6
+    PAVGB_OP     m2, m1
+    PAVGB_OP     m1, m0
+    mova    [r0+r2], m2
+    mova    [r0+r4], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2
+
+
+; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
+cglobal put_no_rnd_pixels8_y2_exact, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*3]
+    mova         m0, [r1]
+    pcmpeqb      m6, m6
+    add          r1, r2
+    pxor         m0, m6
+.loop:
+    mova         m1, [r1]
+    mova         m2, [r1+r2]
+    pxor         m1, m6
+    pxor         m2, m6
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m1, m2
+    pxor         m0, m6
+    pxor         m1, m6
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova         m1, [r1+r2*2]
+    mova         m0, [r1+r4]
+    pxor         m1, m6
+    pxor         m0, m6
+    PAVGB_OP     m2, m1
+    PAVGB_OP     m1, m0
+    pxor         m2, m6
+    pxor         m1, m6
+    mova  [r0+r2*2], m2
+    mova    [r0+r4], m1
+    lea          r1, [r1+r2*4]
+    lea          r0, [r0+r2*4]
+    sub         r3d, 4
+    jg .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+PUT_NO_RND_PIXELS8_Y2_EXACT
+INIT_MMX 3dnow
+PUT_NO_RND_PIXELS8_Y2_EXACT
+
+
+; avg_pixels8(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro AVG_PIXELS8 0
+cglobal avg_pixels8, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2+r2]
+.loop:
+    mova         m0, [r0]
+    mova         m1, [r0+r2]
+    PAVGB_OP     m0, [r1]
+    PAVGB_OP     m1, [r1+r2]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r1, r4
+    add          r0, r4
+    mova         m0, [r0]
+    mova         m1, [r0+r2]
+    PAVGB_OP     m0, [r1]
+    PAVGB_OP     m1, [r1+r2]
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX 3dnow
+AVG_PIXELS8
+
+
+; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro AVG_PIXELS8_X2 0
+cglobal avg_pixels8_x2, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+.loop:
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    PAVGB_OP     m0, [r1+1]
+    PAVGB_OP     m2, [r1+r2+1]
+    PAVGB_OP     m0, [r0]
+    PAVGB_OP     m2, [r0+r2]
+    add          r1, r4
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    mova         m0, [r1]
+    mova         m2, [r1+r2]
+    PAVGB_OP     m0, [r1+1]
+    PAVGB_OP     m2, [r1+r2+1]
+    add          r0, r4
+    add          r1, r4
+    PAVGB_OP     m0, [r0]
+    PAVGB_OP     m2, [r0+r2]
+    mova       [r0], m0
+    mova    [r0+r2], m2
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_X2
+INIT_MMX 3dnow
+AVG_PIXELS8_X2
+
+
+; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro AVG_PIXELS8_Y2 0
+cglobal avg_pixels8_y2, 4,5
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+    mova         m0, [r1]
+    sub          r0, r2
+.loop:
+    mova         m1, [r1+r2]
+    mova         m2, [r1+r4]
+    add          r1, r4
+    PAVGB_OP     m0, m1
+    PAVGB_OP     m1, m2
+    mova         m3, [r0+r2]
+    mova         m4, [r0+r4]
+    PAVGB_OP     m0, m3
+    PAVGB_OP     m1, m4
+    mova    [r0+r2], m0
+    mova    [r0+r4], m1
+    mova         m1, [r1+r2]
+    mova         m0, [r1+r4]
+    PAVGB_OP     m2, m1
+    PAVGB_OP     m1, m0
+    add          r0, r4
+    add          r1, r4
+    mova         m3, [r0+r2]
+    mova         m4, [r0+r4]
+    PAVGB_OP     m2, m3
+    PAVGB_OP     m1, m4
+    mova    [r0+r2], m2
+    mova    [r0+r4], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_Y2
+INIT_MMX 3dnow
+AVG_PIXELS8_Y2
+
+
+; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
+%macro AVG_PIXELS8_XY2 0
+cglobal avg_pixels8_xy2, 4,5
+    mova         m6, [pb_1]
+    movsxdifnidn r2, r2d
+    lea          r4, [r2*2]
+    mova         m0, [r1]
+    pavgb        m0, [r1+1]
+.loop:
+    mova         m2, [r1+r4]
+    mova         m1, [r1+r2]
+    psubusb      m2, m6
+    pavgb        m1, [r1+r2+1]
+    pavgb        m2, [r1+r4+1]
+    add          r1, r4
+    pavgb        m0, m1
+    pavgb        m1, m2
+    pavgb        m0, [r0]
+    pavgb        m1, [r0+r2]
+    mova       [r0], m0
+    mova    [r0+r2], m1
+    mova         m1, [r1+r2]
+    mova         m0, [r1+r4]
+    pavgb        m1, [r1+r2+1]
+    pavgb        m0, [r1+r4+1]
+    add          r0, r4
+    add          r1, r4
+    pavgb        m2, m1
+    pavgb        m1, m0
+    pavgb        m2, [r0]
+    pavgb        m1, [r0+r2]
+    mova       [r0], m2
+    mova    [r0+r2], m1
+    add          r0, r4
+    sub         r3d, 4
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+AVG_PIXELS8_XY2
+INIT_MMX 3dnow
+AVG_PIXELS8_XY2
+
+
+
+
+; mpeg4 qpel
+
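+; The qpel lowpass kernels below implement the 8-tap MPEG-4 half-sample
+; filter (-1, 3, -6, 20, 20, -6, 3, -1) via the pw_20/pw_3 multiplies:
+; the sum is rounded with PW_ROUND (pw_16, or pw_15 for the no_rnd
+; variants) and shifted right by 5.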
+%macro MPEG4_QPEL16_H_LOWPASS 1
+cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 8
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    pxor         m7, m7
+.loop:
+    mova         m0, [r1]
+    mova         m1, m0
+    mova         m2, m0
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    pshufw       m5, m0, 0x90
+    pshufw       m6, m0, 0x41
+    mova         m3, m2
+    mova         m4, m2
+    psllq        m2, 8
+    psllq        m3, 16
+    psllq        m4, 24
+    punpckhbw    m2, m7
+    punpckhbw    m3, m7
+    punpckhbw    m4, m7
+    paddw        m5, m3
+    paddw        m6, m2
+    paddw        m5, m5
+    psubw        m6, m5
+    pshufw       m5, m0, 6
+    pmullw       m6, [pw_3]
+    paddw        m0, m4
+    paddw        m5, m1
+    pmullw       m0, [pw_20]
+    psubw        m0, m5
+    paddw        m6, [PW_ROUND]
+    paddw        m0, m6
+    psraw        m0, 5
+    mova    [rsp-8], m0
+    mova         m0, [r1+5]
+    mova         m5, m0
+    mova         m6, m0
+    psrlq        m0, 8
+    psrlq        m5, 16
+    punpcklbw    m0, m7
+    punpcklbw    m5, m7
+    paddw        m2, m0
+    paddw        m3, m5
+    paddw        m2, m2
+    psubw        m3, m2
+    mova         m2, m6
+    psrlq        m6, 24
+    punpcklbw    m2, m7
+    punpcklbw    m6, m7
+    pmullw       m3, [pw_3]
+    paddw        m1, m2
+    paddw        m4, m6
+    pmullw       m1, [pw_20]
+    psubw        m3, m4
+    paddw        m1, [PW_ROUND]
+    paddw        m3, m1
+    psraw        m3, 5
+    mova         m1, [rsp-8]
+    packuswb     m1, m3
+    OP_MOV     [r0], m1, m4
+    mova         m1, [r1+9]
+    mova         m4, m1
+    mova         m3, m1
+    psrlq        m1, 8
+    psrlq        m4, 16
+    punpcklbw    m1, m7
+    punpcklbw    m4, m7
+    paddw        m5, m1
+    paddw        m0, m4
+    paddw        m5, m5
+    psubw        m0, m5
+    mova         m5, m3
+    psrlq        m3, 24
+    pmullw       m0, [pw_3]
+    punpcklbw    m3, m7
+    paddw        m2, m3
+    psubw        m0, m2
+    mova         m2, m5
+    punpcklbw    m2, m7
+    punpckhbw    m5, m7
+    paddw        m6, m2
+    pmullw       m6, [pw_20]
+    paddw        m0, [PW_ROUND]
+    paddw        m0, m6
+    psraw        m0, 5
+    paddw        m3, m5
+    pshufw       m6, m5, 0xf9
+    paddw        m6, m4
+    pshufw       m4, m5, 0xbe
+    pshufw       m5, m5, 0x6f
+    paddw        m4, m1
+    paddw        m5, m2
+    paddw        m6, m6
+    psubw        m4, m6
+    pmullw       m3, [pw_20]
+    pmullw       m4, [pw_3]
+    psubw        m3, m5
+    paddw        m4, [PW_ROUND]
+    paddw        m4, m3
+    psraw        m4, 5
+    packuswb     m0, m4
+    OP_MOV   [r0+8], m0, m4
+    add          r1, r3
+    add          r0, r2
+    dec r4d
+    jne .loop
+    REP_RET
+%endmacro
+
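+; OP_MOV is bound to one of the following before each instantiation:
+; PUT_OP stores the filtered result directly, while AVG_OP first
+; averages it with the existing destination pixels (%3 is a scratch
+; register).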
+%macro PUT_OP 2-3
+    mova %1, %2
+%endmacro
+
+%macro AVG_OP 2-3
+    mova  %3, %1
+    pavgb %2, %3
+    mova  %1, %2
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OP
+MPEG4_QPEL16_H_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OP
+MPEG4_QPEL16_H_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OP
+MPEG4_QPEL16_H_LOWPASS put_no_rnd
+
+
+
+%macro MPEG4_QPEL8_H_LOWPASS 1
+cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+    pxor         m7, m7
+.loop:
+    mova         m0, [r1]
+    mova         m1, m0
+    mova         m2, m0
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    pshufw       m5, m0, 0x90
+    pshufw       m6, m0, 0x41
+    mova         m3, m2
+    mova         m4, m2
+    psllq        m2, 8
+    psllq        m3, 16
+    psllq        m4, 24
+    punpckhbw    m2, m7
+    punpckhbw    m3, m7
+    punpckhbw    m4, m7
+    paddw        m5, m3
+    paddw        m6, m2
+    paddw        m5, m5
+    psubw        m6, m5
+    pshufw       m5, m0, 0x6
+    pmullw       m6, [pw_3]
+    paddw        m0, m4
+    paddw        m5, m1
+    pmullw       m0, [pw_20]
+    psubw        m0, m5
+    paddw        m6, [PW_ROUND]
+    paddw        m0, m6
+    psraw        m0, 5
+    movh         m5, [r1+5]
+    punpcklbw    m5, m7
+    pshufw       m6, m5, 0xf9
+    paddw        m1, m5
+    paddw        m2, m6
+    pshufw       m6, m5, 0xbe
+    pshufw       m5, m5, 0x6f
+    paddw        m3, m6
+    paddw        m4, m5
+    paddw        m2, m2
+    psubw        m3, m2
+    pmullw       m1, [pw_20]
+    pmullw       m3, [pw_3]
+    psubw        m3, m4
+    paddw        m1, [PW_ROUND]
+    paddw        m3, m1
+    psraw        m3, 5
+    packuswb     m0, m3
+    OP_MOV     [r0], m0, m4
+    add          r1, r3
+    add          r0, r2
+    dec r4d
+    jne .loop
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OP
+MPEG4_QPEL8_H_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OP
+MPEG4_QPEL8_H_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OP
+MPEG4_QPEL8_H_LOWPASS put_no_rnd
+
+
+
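+; QPEL_V_LOW computes one vertically filtered output row as
+; 20*(m0 + m1) - (%1 + %4) + 3*(%2 + m3) - 6*(%3 + m2), rounds it with
+; PW_ROUND, shifts right by 5 and stores via OP_MOV; the final SWAP
+; rotates the row registers for the next output line.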
+%macro QPEL_V_LOW 5
+    paddw      m0, m1
+    mova       m4, [pw_20]
+    pmullw     m4, m0
+    mova       m0, %4
+    mova       m5, %1
+    paddw      m5, m0
+    psubw      m4, m5
+    mova       m5, %2
+    mova       m6, %3
+    paddw      m5, m3
+    paddw      m6, m2
+    paddw      m6, m6
+    psubw      m5, m6
+    pmullw     m5, [pw_3]
+    paddw      m4, [PW_ROUND]
+    paddw      m5, m4
+    psraw      m5, 5
+    packuswb   m5, m5
+    OP_MOV     %5, m5, m7
+    SWAP 0,1,2,3
+%endmacro
+
+%macro MPEG4_QPEL16_V_LOWPASS 1
+cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+
+    mov         r4d, 17
+    mov          r5, rsp
+    pxor         m7, m7
+.looph:
+    mova         m0, [r1]
+    mova         m1, [r1]
+    mova         m2, [r1+8]
+    mova         m3, [r1+8]
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    punpcklbw    m2, m7
+    punpckhbw    m3, m7
+    mova       [r5], m0
+    mova  [r5+0x88], m1
+    mova [r5+0x110], m2
+    mova [r5+0x198], m3
+    add          r5, 8
+    add          r1, r3
+    dec r4d
+    jne .looph
+
+
+    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
+    mov         r4d, 4
+    mov          r1, 4
+    neg          r2
+    lea          r1, [r1+r2*8]
+    lea          r1, [r1+r2*4]
+    lea          r1, [r1+r2*2]
+    neg          r2
+    mov          r5, rsp
+.loopv:
+    pxor         m7, m7
+    mova         m0, [r5+ 0x0]
+    mova         m1, [r5+ 0x8]
+    mova         m2, [r5+0x10]
+    mova         m3, [r5+0x18]
+    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
+    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
+    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
+    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
+    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
+    QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
+    QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
+    QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
+    QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
+
+    add    r5, 0x88
+    add    r0, r1
+    dec r4d
+    jne .loopv
+    REP_RET
+%endmacro
+
+%macro PUT_OPH 2-3
+    movh %1, %2
+%endmacro
+
+%macro AVG_OPH 2-3
+    movh  %3, %1
+    pavgb %2, %3
+    movh  %1, %2
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL16_V_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL16_V_LOWPASS put_no_rnd
+
+
+
+%macro MPEG4_QPEL8_V_LOWPASS 1
+cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
+    movsxdifnidn r2, r2d
+    movsxdifnidn r3, r3d
+
+    mov         r4d, 9
+    mov          r5, rsp
+    pxor         m7, m7
+.looph:
+    mova         m0, [r1]
+    mova         m1, [r1]
+    punpcklbw    m0, m7
+    punpckhbw    m1, m7
+    mova       [r5], m0
+    mova  [r5+0x48], m1
+    add          r5, 8
+    add          r1, r3
+    dec r4d
+    jne .looph
+
+
+    ; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
+    mov         r4d, 2
+    mov          r1, 4
+    neg          r2
+    lea          r1, [r1+r2*4]
+    lea          r1, [r1+r2*2]
+    neg          r2
+    mov          r5, rsp
+.loopv:
+    pxor         m7, m7
+    mova         m0, [r5+ 0x0]
+    mova         m1, [r5+ 0x8]
+    mova         m2, [r5+0x10]
+    mova         m3, [r5+0x18]
+    QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
+    QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
+    QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
+    QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
+    lea    r0, [r0+r2*2]
+    QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
+    QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
+
+    add    r5, 0x48
+    add    r0, r1
+    dec r4d
+    jne .loopv
+    REP_RET
+%endmacro
+
+INIT_MMX mmxext
+%define PW_ROUND pw_16
+%define OP_MOV PUT_OPH
+MPEG4_QPEL8_V_LOWPASS put
+%define PW_ROUND pw_16
+%define OP_MOV AVG_OPH
+MPEG4_QPEL8_V_LOWPASS avg
+%define PW_ROUND pw_15
+%define OP_MOV PUT_OPH
+MPEG4_QPEL8_V_LOWPASS put_no_rnd
diff --git a/libavcodec/x86/dsputil_avg_template.c b/libavcodec/x86/dsputil_avg_template.c
index 4fc188c..171a9cb 100644
--- a/libavcodec/x86/dsputil_avg_template.c
+++ b/libavcodec/x86/dsputil_avg_template.c
@@ -24,781 +24,32 @@ 
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 
-/* XXX: we use explicit registers to avoid a gcc 2.95.2 register asm
-   clobber bug - now it will work with 2.95.2 and also with -fPIC
- */
-static void DEF(put_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-        PAVGB" 1(%1, %3), %%mm1         \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-        PAVGB" 1(%1, %3), %%mm1         \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
-
-#ifndef SKIP_FOR_3DNOW
-static void DEF(put_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(put_no_rnd_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "pcmpeqb %%mm6, %%mm6           \n\t"
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%2), %%mm2             \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "pxor %%mm6, %%mm2              \n\t"
-        "pxor %%mm6, %%mm3              \n\t"
-        PAVGB" %%mm2, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm1             \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   16(%2), %%mm2           \n\t"
-        "movq   24(%2), %%mm3           \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "pxor %%mm6, %%mm2              \n\t"
-        "pxor %%mm6, %%mm3              \n\t"
-        PAVGB" %%mm2, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm1             \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(avg_pixels8_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   (%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $8, %2                  \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        PAVGB" (%3), %%mm1              \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%1), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        PAVGB" (%3), %%mm1              \n\t"
-        "movq   %%mm1, (%3)             \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $4, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-#endif /* SKIP_FOR_3DNOW */
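
All the *_l2 functions share the same control flow: one row is peeled off when h is odd, then the loop handles rows in pairs or quads; h is pinned either in memory or in %ebx because, per the comments, gcc 3.2.2 miscompiled the looser "+g"/"+bm" constraints. The shape in C (a sketch only, assuming h >= 2 after the peel, which the qpel callers satisfy):

    static void l2_loop_shape(int h)
    {
        if (h & 1) {     /* "testl $1, %0 ; jz 1f" */
            /* ... process one row ... */
            h--;
        }
        do {             /* "1:" */
            /* ... two rows per pass (the 8-wide variants do four) ... */
            h -= 2;      /* "subl $2, %0" (or "$4") ", jnz 1b" */
        } while (h);
    }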
-
-static void DEF(put_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq 8(%1), %%mm2              \n\t"
-        "movq 8(%1, %3), %%mm3          \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-        PAVGB" 1(%1, %3), %%mm1         \n\t"
-        PAVGB" 9(%1), %%mm2             \n\t"
-        PAVGB" 9(%1, %3), %%mm3         \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "movq %%mm2, 8(%2)              \n\t"
-        "movq %%mm3, 8(%2, %3)          \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq 8(%1), %%mm2              \n\t"
-        "movq 8(%1, %3), %%mm3          \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-        PAVGB" 1(%1, %3), %%mm1         \n\t"
-        PAVGB" 9(%1), %%mm2             \n\t"
-        PAVGB" 9(%1, %3), %%mm3         \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "movq %%mm2, 8(%2)              \n\t"
-        "movq %%mm3, 8(%2, %3)          \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
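
put_pixels16_x2 is the horizontal half-pel case: the unaligned loads at offset 1 fetch each pixel's right-hand neighbour, and pavgb produces the round-up average. Reference behaviour in C (a sketch, not the patch's code):

    static void put_pixels16_x2_ref(uint8_t *block, const uint8_t *pixels,
                                    int line_size, int h)
    {
        int i, j;
        for (i = 0; i < h; i++) {
            for (j = 0; j < 16; j++)   /* pavgb of (%1) with 1(%1) */
                block[j] = (pixels[j] + pixels[j + 1] + 1) >> 1;
            pixels += line_size;
            block  += line_size;
        }
    }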
-
-#ifndef SKIP_FOR_3DNOW
-static void DEF(put_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(avg_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        PAVGB" 8(%3), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" 8(%2), %%mm1             \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        PAVGB" 8(%3), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        PAVGB" 16(%2), %%mm0            \n\t"
-        PAVGB" 24(%2), %%mm1            \n\t"
-        PAVGB" (%3), %%mm0              \n\t"
-        PAVGB" 8(%3), %%mm1             \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE  //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-
-static void DEF(put_no_rnd_pixels16_l2)(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
-{
-    __asm__ volatile(
-        "pcmpeqb %%mm6, %%mm6           \n\t"
-        "testl $1, %0                   \n\t"
-            " jz 1f                     \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "movq   (%2), %%mm2             \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "pxor %%mm6, %%mm2              \n\t"
-        "pxor %%mm6, %%mm3              \n\t"
-        PAVGB" %%mm2, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm1             \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "add    %4, %1                  \n\t"
-        "add    $16, %2                 \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "decl   %0                      \n\t"
-        "1:                             \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   (%2), %%mm2             \n\t"
-        "movq   8(%2), %%mm3            \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "pxor %%mm6, %%mm2              \n\t"
-        "pxor %%mm6, %%mm3              \n\t"
-        PAVGB" %%mm2, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm1             \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "movq   (%1), %%mm0             \n\t"
-        "movq   8(%1), %%mm1            \n\t"
-        "add    %4, %1                  \n\t"
-        "movq   16(%2), %%mm2           \n\t"
-        "movq   24(%2), %%mm3           \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "pxor %%mm6, %%mm2              \n\t"
-        "pxor %%mm6, %%mm3              \n\t"
-        PAVGB" %%mm2, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm1             \n\t"
-        "pxor %%mm6, %%mm0              \n\t"
-        "pxor %%mm6, %%mm1              \n\t"
-        "movq   %%mm0, (%3)             \n\t"
-        "movq   %%mm1, 8(%3)            \n\t"
-        "add    %5, %3                  \n\t"
-        "add    $32, %2                 \n\t"
-        "subl   $2, %0                  \n\t"
-        "jnz    1b                      \n\t"
-#if !HAVE_EBX_AVAILABLE //Note "+bm" and "+mb" are buggy too (with gcc 3.2.2 at least) and cannot be used
-        :"+m"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#else
-        :"+b"(h), "+a"(src1), "+c"(src2), "+d"(dst)
-#endif
-        :"S"((x86_reg)src1Stride), "D"((x86_reg)dstStride)
-        :"memory");
-//the following should be used, though better not with gcc ...
-/*        :"+g"(h), "+r"(src1), "+r"(src2), "+r"(dst)
-        :"r"(src1Stride), "r"(dstStride)
-        :"memory");*/
-}
-#endif /* SKIP_FOR_3DNOW */
-
-/* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BONE(mm6);
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "movq 1(%1), %%mm1              \n\t"
-        "movq 1(%1, %3), %%mm3          \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "psubusb %%mm6, %%mm0           \n\t"
-        "psubusb %%mm6, %%mm2           \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm2             \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq 1(%1), %%mm1              \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        "movq 1(%1, %3), %%mm3          \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "psubusb %%mm6, %%mm0           \n\t"
-        "psubusb %%mm6, %%mm2           \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm2             \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
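
The MOVQ_BONE/psubusb pair above is the cheap, inexact no-rounding average flagged by the GL comment: subtracting 1 with unsigned saturation from one operand turns pavgb's round-up into round-down, except when that operand is already 0 and the sum is odd. In scalar form (illustrative):

    static inline uint8_t avg_no_rnd_approx(uint8_t a, uint8_t b)
    {
        uint8_t a1 = a ? a - 1 : 0;   /* psubusb with 0x01 bytes */
        return (a1 + b + 1) >> 1;     /* pavgb */
    }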
-
-static void DEF(put_no_rnd_pixels8_x2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile (
-        "pcmpeqb %%mm6, %%mm6           \n\t"
-        "1:                             \n\t"
-        "movq  (%1),     %%mm0          \n\t"
-        "movq  (%1, %3), %%mm2          \n\t"
-        "movq 1(%1),     %%mm1          \n\t"
-        "movq 1(%1, %3), %%mm3          \n\t"
-        "pxor  %%mm6, %%mm0             \n\t"
-        "pxor  %%mm6, %%mm2             \n\t"
-        "pxor  %%mm6, %%mm1             \n\t"
-        "pxor  %%mm6, %%mm3             \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm2             \n\t"
-        "pxor  %%mm6, %%mm0             \n\t"
-        "pxor  %%mm6, %%mm2             \n\t"
-        "movq  %%mm0, (%2)              \n\t"
-        "movq  %%mm2, (%2, %3)          \n\t"
-        "movq  (%1, %3,2), %%mm0        \n\t"
-        "movq 1(%1, %3,2), %%mm1        \n\t"
-        "movq  (%1, %4),   %%mm2        \n\t"
-        "movq 1(%1, %4),   %%mm3        \n\t"
-        "pxor  %%mm6, %%mm0             \n\t"
-        "pxor  %%mm6, %%mm1             \n\t"
-        "pxor  %%mm6, %%mm2             \n\t"
-        "pxor  %%mm6, %%mm3             \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm3, %%mm2             \n\t"
-        "pxor  %%mm6, %%mm0             \n\t"
-        "pxor  %%mm6, %%mm2             \n\t"
-        "movq  %%mm0, (%2, %3,2)        \n\t"
-        "movq  %%mm2, (%2, %4)          \n\t"
-        "lea   (%1, %3,4), %1           \n\t"
-        "lea   (%2, %3,4), %2           \n\t"
-        "subl  $4, %0                   \n\t"
-        "jg 1b                          \n\t"
-        : "+g"(h), "+r"(pixels), "+r"(block)
-        : "r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
-        : "memory"
-    );
-}
-
-static void DEF(put_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "sub %3, %2                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
-        "add %%"REG_a", %1              \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm2, %%mm1             \n\t"
-        "movq %%mm0, (%2, %3)           \n\t"
-        "movq %%mm1, (%2, %%"REG_a")    \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "add %%"REG_a", %1              \n\t"
-        PAVGB" %%mm1, %%mm2             \n\t"
-        PAVGB" %%mm0, %%mm1             \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "movq %%mm1, (%2, %%"REG_a")    \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D" (block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
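
put_pixels8_y2 is the vertical half-pel case. The asm keeps the previous row live in a register and rotates registers each pass, so every source row is loaded exactly once; the initial "sub %3, %2" lets the stores use the same (%2, %3) addressing as the loads. Reference behaviour (a sketch):

    static void put_pixels8_y2_ref(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h)
    {
        int i, j;
        for (i = 0; i < h; i++) {
            for (j = 0; j < 8; j++)   /* average row i with row i+1 */
                block[j] = (pixels[j] + pixels[j + line_size] + 1) >> 1;
            pixels += line_size;
            block  += line_size;
        }
    }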
-
-/* GL: this function does incorrect rounding if overflow */
-static void DEF(put_no_rnd_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BONE(mm6);
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "sub %3, %2                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "psubusb %%mm6, %%mm1           \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm2, %%mm1             \n\t"
-        "movq %%mm0, (%2, %3)           \n\t"
-        "movq %%mm1, (%2, %%"REG_a")    \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "psubusb %%mm6, %%mm1           \n\t"
-        PAVGB" %%mm1, %%mm2             \n\t"
-        PAVGB" %%mm0, %%mm1             \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "movq %%mm1, (%2, %%"REG_a")    \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D" (block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
-
-static void DEF(put_no_rnd_pixels8_y2_exact)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile (
-        "movq     (%1), %%mm0           \n\t"
-        "pcmpeqb %%mm6, %%mm6           \n\t"
-        "add        %3, %1              \n\t"
-        "pxor    %%mm6, %%mm0           \n\t"
-        "1:                             \n\t"
-        "movq  (%1),     %%mm1          \n\t"
-        "movq  (%1, %3), %%mm2          \n\t"
-        "pxor  %%mm6, %%mm1             \n\t"
-        "pxor  %%mm6, %%mm2             \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm2, %%mm1             \n\t"
-        "pxor  %%mm6, %%mm0             \n\t"
-        "pxor  %%mm6, %%mm1             \n\t"
-        "movq  %%mm0, (%2)              \n\t"
-        "movq  %%mm1, (%2, %3)          \n\t"
-        "movq  (%1, %3,2), %%mm1        \n\t"
-        "movq  (%1, %4),   %%mm0        \n\t"
-        "pxor  %%mm6, %%mm1             \n\t"
-        "pxor  %%mm6, %%mm0             \n\t"
-        PAVGB" %%mm1, %%mm2             \n\t"
-        PAVGB" %%mm0, %%mm1             \n\t"
-        "pxor  %%mm6, %%mm2             \n\t"
-        "pxor  %%mm6, %%mm1             \n\t"
-        "movq %%mm2, (%2, %3,2)         \n\t"
-        "movq %%mm1, (%2, %4)           \n\t"
-        "lea   (%1, %3,4), %1           \n\t"
-        "lea   (%2, %3,4), %2           \n\t"
-        "subl $4, %0                    \n\t"
-        "jg 1b                          \n\t"
-        :"+g"(h), "+r"(pixels), "+r" (block)
-        :"r" ((x86_reg)line_size), "r"((x86_reg)3*line_size)
-        :"memory"
-    );
-}
-
-static void DEF(avg_pixels8)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "1:                             \n\t"
-        "movq (%2), %%mm0               \n\t"
-        "movq (%2, %3), %%mm1           \n\t"
-        PAVGB" (%1), %%mm0              \n\t"
-        PAVGB" (%1, %3), %%mm1          \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "movq (%2), %%mm0               \n\t"
-        "movq (%2, %3), %%mm1           \n\t"
-        PAVGB" (%1), %%mm0              \n\t"
-        PAVGB" (%1, %3), %%mm1          \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
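
The avg_* ops read-modify-write the destination: the freshly computed prediction is averaged (round-up, via pavgb) into what is already stored there, which is how bidirectional prediction combines its two references. In scalar form (illustrative):

    static void avg_pixels8_ref(uint8_t *block, const uint8_t *pixels,
                                int line_size, int h)
    {
        int i, j;
        for (i = 0; i < h; i++) {
            for (j = 0; j < 8; j++)
                block[j] = (block[j] + pixels[j] + 1) >> 1;
            pixels += line_size;
            block  += line_size;
        }
    }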
-
-static void DEF(avg_pixels8_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "1:                             \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-        PAVGB" 1(%1, %3), %%mm2         \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" (%2, %3), %%mm2          \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "movq (%1, %3), %%mm2           \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-        PAVGB" 1(%1, %3), %%mm2         \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "add %%"REG_a", %1              \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" (%2, %3), %%mm2          \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
-
-static void DEF(avg_pixels8_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "movq (%1), %%mm0               \n\t"
-        "sub %3, %2                     \n\t"
-        "1:                             \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
-        "add %%"REG_a", %1              \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm2, %%mm1             \n\t"
-        "movq (%2, %3), %%mm3           \n\t"
-        "movq (%2, %%"REG_a"), %%mm4    \n\t"
-        PAVGB" %%mm3, %%mm0             \n\t"
-        PAVGB" %%mm4, %%mm1             \n\t"
-        "movq %%mm0, (%2, %3)           \n\t"
-        "movq %%mm1, (%2, %%"REG_a")    \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        PAVGB" %%mm1, %%mm2             \n\t"
-        PAVGB" %%mm0, %%mm1             \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "add %%"REG_a", %1              \n\t"
-        "movq (%2, %3), %%mm3           \n\t"
-        "movq (%2, %%"REG_a"), %%mm4    \n\t"
-        PAVGB" %%mm3, %%mm2             \n\t"
-        PAVGB" %%mm4, %%mm1             \n\t"
-        "movq %%mm2, (%2, %3)           \n\t"
-        "movq %%mm1, (%2, %%"REG_a")    \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a, "memory");
-}
-
-/* Note this is not correctly rounded, but this function is only
- * used for B-frames so it does not matter. */
-static void DEF(avg_pixels8_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h)
-{
-    MOVQ_BONE(mm6);
-    __asm__ volatile(
-        "lea (%3, %3), %%"REG_a"        \n\t"
-        "movq (%1), %%mm0               \n\t"
-        PAVGB" 1(%1), %%mm0             \n\t"
-         ".p2align 3                    \n\t"
-        "1:                             \n\t"
-        "movq (%1, %%"REG_a"), %%mm2    \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "psubusb %%mm6, %%mm2           \n\t"
-        PAVGB" 1(%1, %3), %%mm1         \n\t"
-        PAVGB" 1(%1, %%"REG_a"), %%mm2  \n\t"
-        "add %%"REG_a", %1              \n\t"
-        PAVGB" %%mm1, %%mm0             \n\t"
-        PAVGB" %%mm2, %%mm1             \n\t"
-        PAVGB" (%2), %%mm0              \n\t"
-        PAVGB" (%2, %3), %%mm1          \n\t"
-        "movq %%mm0, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "movq (%1, %3), %%mm1           \n\t"
-        "movq (%1, %%"REG_a"), %%mm0    \n\t"
-        PAVGB" 1(%1, %3), %%mm1         \n\t"
-        PAVGB" 1(%1, %%"REG_a"), %%mm0  \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "add %%"REG_a", %1              \n\t"
-        PAVGB" %%mm1, %%mm2             \n\t"
-        PAVGB" %%mm0, %%mm1             \n\t"
-        PAVGB" (%2), %%mm2              \n\t"
-        PAVGB" (%2, %3), %%mm1          \n\t"
-        "movq %%mm2, (%2)               \n\t"
-        "movq %%mm1, (%2, %3)           \n\t"
-        "add %%"REG_a", %2              \n\t"
-        "subl $4, %0                    \n\t"
-        "jnz 1b                         \n\t"
-        :"+g"(h), "+S"(pixels), "+D"(block)
-        :"r" ((x86_reg)line_size)
-        :"%"REG_a,  "memory");
-}
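
The exact 2-D half-pel value is (a + b + c + d + 2) >> 2 over the 2x2 neighbourhood; avg_pixels8_xy2 instead cascades pavgb ops (with a MOVQ_BONE bias), which can be off by one. As the comment notes, that is acceptable because B-frames are not used as references, so the error cannot accumulate. The two per-pixel formulas (illustrative):

    static inline uint8_t hpel_xy2_exact(int a, int b, int c, int d)
    {
        return (a + b + c + d + 2) >> 2;
    }

    static inline uint8_t hpel_xy2_approx(int a, int b, int c, int d)
    {
        /* three round-up averages; may differ by 1 from the exact value */
        return (((a + b + 1) >> 1) + ((c + d + 1) >> 1) + 1) >> 1;
    }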
-
 //FIXME the following could be optimized too ...
-static void DEF(put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
-    DEF(put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
+static void DEF(ff_put_no_rnd_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_put_no_rnd_pixels8_x2)(block  , pixels  , line_size, h);
+    DEF(ff_put_no_rnd_pixels8_x2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(put_pixels8_y2)(block  , pixels  , line_size, h);
-    DEF(put_pixels8_y2)(block+8, pixels+8, line_size, h);
+static void DEF(ff_put_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_put_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(ff_put_pixels8_y2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
-    DEF(put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
+static void DEF(ff_put_no_rnd_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_put_no_rnd_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(ff_put_no_rnd_pixels8_y2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(avg_pixels8)(block  , pixels  , line_size, h);
-    DEF(avg_pixels8)(block+8, pixels+8, line_size, h);
+static void DEF(ff_avg_pixels16)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_avg_pixels8)(block  , pixels  , line_size, h);
+    DEF(ff_avg_pixels8)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(avg_pixels8_x2)(block  , pixels  , line_size, h);
-    DEF(avg_pixels8_x2)(block+8, pixels+8, line_size, h);
+static void DEF(ff_avg_pixels16_x2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_avg_pixels8_x2)(block  , pixels  , line_size, h);
+    DEF(ff_avg_pixels8_x2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(avg_pixels8_y2)(block  , pixels  , line_size, h);
-    DEF(avg_pixels8_y2)(block+8, pixels+8, line_size, h);
+static void DEF(ff_avg_pixels16_y2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_avg_pixels8_y2)(block  , pixels  , line_size, h);
+    DEF(ff_avg_pixels8_y2)(block+8, pixels+8, line_size, h);
 }
-static void DEF(avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
-    DEF(avg_pixels8_xy2)(block  , pixels  , line_size, h);
-    DEF(avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
+static void DEF(ff_avg_pixels16_xy2)(uint8_t *block, const uint8_t *pixels, int line_size, int h){
+    DEF(ff_avg_pixels8_xy2)(block  , pixels  , line_size, h);
+    DEF(ff_avg_pixels8_xy2)(block+8, pixels+8, line_size, h);
 }
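
With 64-bit MMX registers a 16-pixel-wide operation is just two independent 8-pixel operations on adjacent halves, which is all the surviving template wrappers express. The ff_ prefix is new so that DEF() resolves to the external yasm symbol names; with "#define DEF(x) x ## _mmxext" the first avg wrapper expands to (expansion shown for illustration):

    static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                       int line_size, int h)
    {
        ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
        ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
    }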
diff --git a/libavcodec/x86/dsputil_mmx.c b/libavcodec/x86/dsputil_mmx.c
index f9da04f..a793658 100644
--- a/libavcodec/x86/dsputil_mmx.c
+++ b/libavcodec/x86/dsputil_mmx.c
@@ -80,6 +80,143 @@  DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEF
 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 
+
+void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+                              int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
+                                     uint8_t *src2, int dstStride,
+                                     int src1Stride, int h);
+void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+                              int dstStride, int src1Stride, int h);
+void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               int line_size, int h);
+void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+                               int dstStride, int src1Stride, int h);
+void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+                               int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
+                                      int dstStride, int src1Stride, int h);
+void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                     int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           int line_size, int h);
+void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
+                                          const uint8_t *pixels,
+                                          int line_size, int h);
+void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                                     int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                                    int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
+                                           const uint8_t *pixels,
+                                           int line_size, int h);
+void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
+                                          const uint8_t *pixels,
+                                          int line_size, int h);
+void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
+                           int line_size, int h);
+void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
+                          int line_size, int h);
+void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
+                             int line_size, int h);
+void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
+                               int line_size, int h);
+void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
+                              int line_size, int h);
+
+void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
+static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
+                                   int line_size, int h)
+{
+    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
+    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
+}
+
+void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                         int dstStride, int srcStride, int h);
+void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                         int dstStride, int srcStride, int h);
+void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                                 int dstStride, int srcStride,
+                                                 int h);
+void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                        int dstStride, int srcStride, int h);
+void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                        int dstStride, int srcStride, int h);
+void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                                int dstStride, int srcStride,
+                                                int h);
+void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                         int dstStride, int srcStride);
+void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                         int dstStride, int srcStride);
+void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                                 int dstStride, int srcStride);
+void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                        int dstStride, int srcStride);
+void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                        int dstStride, int srcStride);
+void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
+                                                int dstStride, int srcStride);
+#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
+#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
+
+#if HAVE_YASM
+/* VC-1-specific */
+#define ff_put_pixels8_mmx ff_put_pixels8_mmxext
+void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
+                               int stride, int rnd)
+{
+    ff_put_pixels8_mmx(dst, src, stride, 8);
+}
+
+void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
+                                  int stride, int rnd)
+{
+    ff_avg_pixels8_mmxext(dst, src, stride, 8);
+}
+
+
+/***********************************/
+/* 3Dnow specific */
+
+#define DEF(x) x ## _3dnow
+
+#include "dsputil_avg_template.c"
+
+#undef DEF
+
+/***********************************/
+/* MMXEXT specific */
+
+#define DEF(x) x ## _mmxext
+
+#include "dsputil_avg_template.c"
+
+#undef DEF
+
+#endif /* HAVE_YASM */
+
+
 #if HAVE_INLINE_ASM
 
 #define JUMPALIGN()     __asm__ volatile (".p2align 3"::)
@@ -190,32 +327,6 @@  DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
 #undef PAVGB
 #undef OP_AVG
 
-/***********************************/
-/* 3Dnow specific */
-
-#define DEF(x) x ## _3dnow
-#define PAVGB "pavgusb"
-#define SKIP_FOR_3DNOW
-
-#include "dsputil_avg_template.c"
-
-#undef DEF
-#undef PAVGB
-#undef SKIP_FOR_3DNOW
-
-/***********************************/
-/* MMXEXT specific */
-
-#define DEF(x) x ## _mmxext
-
-/* Introduced only in MMXEXT set */
-#define PAVGB "pavgb"
-
-#include "dsputil_avg_template.c"
-
-#undef DEF
-#undef PAVGB
-
 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
 #define put_pixels16_mmxext put_pixels16_mmx
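
The removed PAVGB/SKIP_FOR_3DNOW defines existed only to pick the averaging instruction textually per instruction set; both candidates compute the same round-up byte average, so the yasm port can make the identical choice at assembly time from the cpu flags. What the old defines selected (for reference):

    /* 3DNow!:  pavgusb mm0, mm1   -- per byte: (a + b + 1) >> 1 */
    /* MMXEXT:  pavgb   mm0, mm1   -- per byte: (a + b + 1) >> 1 */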
@@ -815,382 +926,15 @@  static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
         }
     }
 }
+#endif /* HAVE_INLINE_ASM */
 
-#define QPEL_V_LOW(m3, m4, m5, m6, pw_20, pw_3, rnd,                      \
-                   in0, in1, in2, in7, out, OP)                           \
-    "paddw               "#m4", "#m3"   \n\t" /* x1 */                    \
-    "movq   "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */                    \
-    "pmullw              "#m3", %%mm4   \n\t" /* 20x1 */                  \
-    "movq               "#in7", "#m3"   \n\t" /* d */                     \
-    "movq               "#in0", %%mm5   \n\t" /* D */                     \
-    "paddw               "#m3", %%mm5   \n\t" /* x4 */                    \
-    "psubw               %%mm5, %%mm4   \n\t" /* 20x1 - x4 */             \
-    "movq               "#in1", %%mm5   \n\t" /* C */                     \
-    "movq               "#in2", %%mm6   \n\t" /* B */                     \
-    "paddw               "#m6", %%mm5   \n\t" /* x3 */                    \
-    "paddw               "#m5", %%mm6   \n\t" /* x2 */                    \
-    "paddw               %%mm6, %%mm6   \n\t" /* 2x2 */                   \
-    "psubw               %%mm6, %%mm5   \n\t" /* -2x2 + x3 */             \
-    "pmullw  "MANGLE(ff_pw_3)", %%mm5   \n\t" /* -6x2 + 3x3 */            \
-    "paddw              "#rnd", %%mm4   \n\t" /* x2 */                    \
-    "paddw               %%mm4, %%mm5   \n\t" /* 20x1 - 6x2 + 3x3 - x4 */ \
-    "psraw                  $5, %%mm5   \n\t"                             \
-    "packuswb            %%mm5, %%mm5   \n\t"                             \
-    OP(%%mm5, out, %%mm7, d)
-
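
QPEL_V_LOW evaluates one output of MPEG-4's 8-tap half-pel lowpass; its inputs are already pairwise sums of samples symmetric about the output position (x1 the two centre samples, x4 the two outermost), combined with the (20, -6, 3, -1) weights and a rounder of 16 (15 for the no_rnd variants). Per output sample, roughly:

    static inline uint8_t qpel_lowpass_ref(int x1, int x2, int x3, int x4,
                                           int rounder /* 16 or 15 */)
    {
        int t = (20 * x1 - 6 * x2 + 3 * x3 - x4 + rounder) >> 5;
        return t < 0 ? 0 : t > 255 ? 255 : t;   /* packuswb saturation */
    }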
-#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMXEXT)                        \
-static void OPNAME ## mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst,         \
-                                                    uint8_t *src,         \
-                                                    int dstStride,        \
-                                                    int srcStride,        \
-                                                    int h)                \
-{                                                                         \
-    uint64_t temp;                                                        \
-                                                                          \
-    __asm__ volatile (                                                    \
-        "pxor      %%mm7, %%mm7             \n\t"                         \
-        "1:                                 \n\t"                         \
-        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
-        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
-        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
-        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
-        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
-        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
-        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
-        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
-        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
-        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
-        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
-        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
-        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
-        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
-        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
-        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
-        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
-        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
-        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
-        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
-        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
-        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
-        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
-        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
-        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
-        "paddw        %6, %%mm6             \n\t"                         \
-        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
-        "psraw        $5, %%mm0             \n\t"                         \
-        "movq      %%mm0, %5                \n\t"                         \
-        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
-                                                                          \
-        "movq      5(%0), %%mm0             \n\t" /* FGHIJKLM */          \
-        "movq      %%mm0, %%mm5             \n\t" /* FGHIJKLM */          \
-        "movq      %%mm0, %%mm6             \n\t" /* FGHIJKLM */          \
-        "psrlq        $8, %%mm0             \n\t" /* GHIJKLM0 */          \
-        "psrlq       $16, %%mm5             \n\t" /* HIJKLM00 */          \
-        "punpcklbw %%mm7, %%mm0             \n\t" /* 0G0H0I0J */          \
-        "punpcklbw %%mm7, %%mm5             \n\t" /* 0H0I0J0K */          \
-        "paddw     %%mm0, %%mm2             \n\t" /* b */                 \
-        "paddw     %%mm5, %%mm3             \n\t" /* c */                 \
-        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
-        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
-        "movq      %%mm6, %%mm2             \n\t" /* FGHIJKLM */          \
-        "psrlq       $24, %%mm6             \n\t" /* IJKLM000 */          \
-        "punpcklbw %%mm7, %%mm2             \n\t" /* 0F0G0H0I */          \
-        "punpcklbw %%mm7, %%mm6             \n\t" /* 0I0J0K0L */          \
-        "pmullw "MANGLE(ff_pw_3)", %%mm3    \n\t" /* 3c - 6b */           \
-        "paddw     %%mm2, %%mm1             \n\t" /* a */                 \
-        "paddw     %%mm6, %%mm4             \n\t" /* d */                 \
-        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
-        "psubw     %%mm4, %%mm3             \n\t" /* - 6b +3c - d */      \
-        "paddw        %6, %%mm1             \n\t"                         \
-        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b +3c - d */  \
-        "psraw        $5, %%mm3             \n\t"                         \
-        "movq         %5, %%mm1             \n\t"                         \
-        "packuswb  %%mm3, %%mm1             \n\t"                         \
-        OP_MMXEXT(%%mm1, (%1), %%mm4, q)                                  \
-        /* mm0 = GHIJ, mm2 = FGHI, mm5 = HIJK, mm6 = IJKL, mm7 = 0 */     \
-                                                                          \
-        "movq      9(%0), %%mm1             \n\t" /* JKLMNOPQ */          \
-        "movq      %%mm1, %%mm4             \n\t" /* JKLMNOPQ */          \
-        "movq      %%mm1, %%mm3             \n\t" /* JKLMNOPQ */          \
-        "psrlq        $8, %%mm1             \n\t" /* KLMNOPQ0 */          \
-        "psrlq       $16, %%mm4             \n\t" /* LMNOPQ00 */          \
-        "punpcklbw %%mm7, %%mm1             \n\t" /* 0K0L0M0N */          \
-        "punpcklbw %%mm7, %%mm4             \n\t" /* 0L0M0N0O */          \
-        "paddw     %%mm1, %%mm5             \n\t" /* b */                 \
-        "paddw     %%mm4, %%mm0             \n\t" /* c */                 \
-        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
-        "psubw     %%mm5, %%mm0             \n\t" /* c - 2b */            \
-        "movq      %%mm3, %%mm5             \n\t" /* JKLMNOPQ */          \
-        "psrlq       $24, %%mm3             \n\t" /* MNOPQ000 */          \
-        "pmullw "MANGLE(ff_pw_3)", %%mm0    \n\t" /* 3c - 6b */           \
-        "punpcklbw %%mm7, %%mm3             \n\t" /* 0M0N0O0P */          \
-        "paddw     %%mm3, %%mm2             \n\t" /* d */                 \
-        "psubw     %%mm2, %%mm0             \n\t" /* -6b + 3c - d */      \
-        "movq      %%mm5, %%mm2             \n\t" /* JKLMNOPQ */          \
-        "punpcklbw %%mm7, %%mm2             \n\t" /* 0J0K0L0M */          \
-        "punpckhbw %%mm7, %%mm5             \n\t" /* 0N0O0P0Q */          \
-        "paddw     %%mm2, %%mm6             \n\t" /* a */                 \
-        "pmullw "MANGLE(ff_pw_20)", %%mm6   \n\t" /* 20a */               \
-        "paddw        %6, %%mm0             \n\t"                         \
-        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
-        "psraw        $5, %%mm0             \n\t"                         \
-        /* mm1 = KLMN, mm2 = JKLM, mm3 = MNOP, */                         \
-        /* mm4 = LMNO, mm5 = NOPQ mm7 = 0 */                              \
-                                                                          \
-        "paddw    %%mm5, %%mm3              \n\t" /* a */                 \
-        "pshufw   $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */          \
-        "paddw    %%mm4, %%mm6              \n\t" /* b */                 \
-        "pshufw   $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */          \
-        "pshufw   $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */          \
-        "paddw    %%mm1, %%mm4              \n\t" /* c */                 \
-        "paddw    %%mm2, %%mm5              \n\t" /* d */                 \
-        "paddw    %%mm6, %%mm6              \n\t" /* 2b */                \
-        "psubw    %%mm6, %%mm4              \n\t" /* c - 2b */            \
-        "pmullw "MANGLE(ff_pw_20)", %%mm3   \n\t" /* 20a */               \
-        "pmullw  "MANGLE(ff_pw_3)", %%mm4   \n\t" /* 3c - 6b */           \
-        "psubw    %%mm5, %%mm3              \n\t" /* -6b + 3c - d */      \
-        "paddw       %6, %%mm4              \n\t"                         \
-        "paddw    %%mm3, %%mm4              \n\t" /* 20a - 6b + 3c - d */ \
-        "psraw       $5, %%mm4              \n\t"                         \
-        "packuswb %%mm4, %%mm0              \n\t"                         \
-        OP_MMXEXT(%%mm0, 8(%1), %%mm4, q)                                 \
-                                                                          \
-        "add         %3, %0                 \n\t"                         \
-        "add         %4, %1                 \n\t"                         \
-        "decl        %2                     \n\t"                         \
-        "jnz         1b                     \n\t"                         \
-        : "+a"(src), "+c"(dst), "+D"(h)                                   \
-        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride),               \
-          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(temp), "m"(ROUNDER)      \
-        : "memory"                                                        \
-        );                                                                \
-}                                                                         \
-                                                                          \
-static void OPNAME ## mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst,          \
-                                                   uint8_t *src,          \
-                                                   int dstStride,         \
-                                                   int srcStride,         \
-                                                   int h)                 \
-{                                                                         \
-    __asm__ volatile (                                                    \
-        "pxor      %%mm7, %%mm7             \n\t"                         \
-        "1:                                 \n\t"                         \
-        "movq       (%0), %%mm0             \n\t" /* ABCDEFGH */          \
-        "movq      %%mm0, %%mm1             \n\t" /* ABCDEFGH */          \
-        "movq      %%mm0, %%mm2             \n\t" /* ABCDEFGH */          \
-        "punpcklbw %%mm7, %%mm0             \n\t" /* 0A0B0C0D */          \
-        "punpckhbw %%mm7, %%mm1             \n\t" /* 0E0F0G0H */          \
-        "pshufw    $0x90, %%mm0, %%mm5      \n\t" /* 0A0A0B0C */          \
-        "pshufw    $0x41, %%mm0, %%mm6      \n\t" /* 0B0A0A0B */          \
-        "movq      %%mm2, %%mm3             \n\t" /* ABCDEFGH */          \
-        "movq      %%mm2, %%mm4             \n\t" /* ABCDEFGH */          \
-        "psllq        $8, %%mm2             \n\t" /* 0ABCDEFG */          \
-        "psllq       $16, %%mm3             \n\t" /* 00ABCDEF */          \
-        "psllq       $24, %%mm4             \n\t" /* 000ABCDE */          \
-        "punpckhbw %%mm7, %%mm2             \n\t" /* 0D0E0F0G */          \
-        "punpckhbw %%mm7, %%mm3             \n\t" /* 0C0D0E0F */          \
-        "punpckhbw %%mm7, %%mm4             \n\t" /* 0B0C0D0E */          \
-        "paddw     %%mm3, %%mm5             \n\t" /* b */                 \
-        "paddw     %%mm2, %%mm6             \n\t" /* c */                 \
-        "paddw     %%mm5, %%mm5             \n\t" /* 2b */                \
-        "psubw     %%mm5, %%mm6             \n\t" /* c - 2b */            \
-        "pshufw    $0x06, %%mm0, %%mm5      \n\t" /* 0C0B0A0A */          \
-        "pmullw "MANGLE(ff_pw_3)", %%mm6    \n\t" /* 3c - 6b */           \
-        "paddw     %%mm4, %%mm0             \n\t" /* a */                 \
-        "paddw     %%mm1, %%mm5             \n\t" /* d */                 \
-        "pmullw "MANGLE(ff_pw_20)", %%mm0   \n\t" /* 20a */               \
-        "psubw     %%mm5, %%mm0             \n\t" /* 20a - d */           \
-        "paddw        %5, %%mm6             \n\t"                         \
-        "paddw     %%mm6, %%mm0             \n\t" /* 20a - 6b + 3c - d */ \
-        "psraw        $5, %%mm0             \n\t"                         \
-        /* mm1 = EFGH, mm2 = DEFG, mm3 = CDEF, mm4 = BCDE, mm7 = 0 */     \
-                                                                          \
-        "movd      5(%0), %%mm5             \n\t" /* FGHI */              \
-        "punpcklbw %%mm7, %%mm5             \n\t" /* 0F0G0H0I */          \
-        "pshufw    $0xF9, %%mm5, %%mm6      \n\t" /* 0G0H0I0I */          \
-        "paddw     %%mm5, %%mm1             \n\t" /* a */                 \
-        "paddw     %%mm6, %%mm2             \n\t" /* b */                 \
-        "pshufw    $0xBE, %%mm5, %%mm6      \n\t" /* 0H0I0I0H */          \
-        "pshufw    $0x6F, %%mm5, %%mm5      \n\t" /* 0I0I0H0G */          \
-        "paddw     %%mm6, %%mm3             \n\t" /* c */                 \
-        "paddw     %%mm5, %%mm4             \n\t" /* d */                 \
-        "paddw     %%mm2, %%mm2             \n\t" /* 2b */                \
-        "psubw     %%mm2, %%mm3             \n\t" /* c - 2b */            \
-        "pmullw "MANGLE(ff_pw_20)", %%mm1   \n\t" /* 20a */               \
-        "pmullw  "MANGLE(ff_pw_3)", %%mm3   \n\t" /* 3c - 6b */           \
-        "psubw     %%mm4, %%mm3             \n\t" /* -6b + 3c - d */      \
-        "paddw        %5, %%mm1             \n\t"                         \
-        "paddw     %%mm1, %%mm3             \n\t" /* 20a - 6b + 3c - d */ \
-        "psraw        $5, %%mm3             \n\t"                         \
-        "packuswb  %%mm3, %%mm0             \n\t"                         \
-        OP_MMXEXT(%%mm0, (%1), %%mm4, q)                                  \
-                                                                          \
-        "add          %3, %0                \n\t"                         \
-        "add          %4, %1                \n\t"                         \
-        "decl         %2                    \n\t"                         \
-        "jnz          1b                    \n\t"                         \
-        : "+a"(src), "+c"(dst), "+d"(h)                                   \
-        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride),               \
-          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER)                 \
-        : "memory"                                                        \
-        );                                                                \
-}
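
In the horizontal version the pshufw shuffles (the 0A0A0B0C, 0B0A0A0B, 0C0B0A0A patterns) mirror taps that fall outside the block back across the edge, src[-k] becoming src[k-1]. For the first output of a row this amounts to (a sketch matching the scalar reference behaviour):

    static inline uint8_t h_lowpass_dst0(const uint8_t *src, int rounder)
    {
        int a = src[0] + src[1];   /* centre pair                */
        int b = src[0] + src[2];   /* src[-1] mirrored to src[0] */
        int c = src[1] + src[3];   /* src[-2] mirrored to src[1] */
        int d = src[2] + src[4];   /* src[-3] mirrored to src[2] */
        int t = (20 * a - 6 * b + 3 * c - d + rounder) >> 5;
        return t < 0 ? 0 : t > 255 ? 255 : t;
    }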
 
+#if HAVE_YASM
 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)                          \
-static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst,      \
-                                                     uint8_t *src,      \
-                                                     int dstStride,     \
-                                                     int srcStride)     \
-{                                                                       \
-    uint64_t temp[17 * 4];                                              \
-    uint64_t *temp_ptr = temp;                                          \
-    int count = 17;                                                     \
-                                                                        \
-    /* FIXME unroll */                                                  \
-    __asm__ volatile (                                                  \
-        "pxor      %%mm7, %%mm7             \n\t"                       \
-        "1:                                 \n\t"                       \
-        "movq       (%0), %%mm0             \n\t"                       \
-        "movq       (%0), %%mm1             \n\t"                       \
-        "movq      8(%0), %%mm2             \n\t"                       \
-        "movq      8(%0), %%mm3             \n\t"                       \
-        "punpcklbw %%mm7, %%mm0             \n\t"                       \
-        "punpckhbw %%mm7, %%mm1             \n\t"                       \
-        "punpcklbw %%mm7, %%mm2             \n\t"                       \
-        "punpckhbw %%mm7, %%mm3             \n\t"                       \
-        "movq      %%mm0, (%1)              \n\t"                       \
-        "movq      %%mm1, 17 * 8(%1)        \n\t"                       \
-        "movq      %%mm2, 2 * 17 * 8(%1)    \n\t"                       \
-        "movq      %%mm3, 3 * 17 * 8(%1)    \n\t"                       \
-        "add          $8, %1                \n\t"                       \
-        "add          %3, %0                \n\t"                       \
-        "decl         %2                    \n\t"                       \
-        "jnz          1b                    \n\t"                       \
-        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
-        : "r"((x86_reg)srcStride)                                       \
-        : "memory"                                                      \
-        );                                                              \
-                                                                        \
-    temp_ptr = temp;                                                    \
-    count    = 4;                                                       \
-                                                                        \
-    /* FIXME reorder for speed */                                       \
-    __asm__ volatile (                                                  \
-        /* "pxor  %%mm7, %%mm7            \n\t" */                      \
-        "1:                             \n\t"                           \
-        "movq    (%0), %%mm0            \n\t"                           \
-        "movq   8(%0), %%mm1            \n\t"                           \
-        "movq  16(%0), %%mm2            \n\t"                           \
-        "movq  24(%0), %%mm3            \n\t"                           \
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),   8(%0),    (%0),  32(%0), (%1),     OP) \
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),    (%0),    (%0),  40(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),    (%0),   8(%0),  48(%0), (%1),     OP) \
-                                                                        \
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),   8(%0),  16(%0),  56(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0),  16(%0),  24(%0),  64(%0), (%1),     OP) \
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0),  24(%0),  32(%0),  72(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0),  32(%0),  40(%0),  80(%0), (%1),     OP) \
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0),  40(%0),  48(%0),  88(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0),  48(%0),  56(%0),  96(%0), (%1),     OP) \
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0),  56(%0),  64(%0), 104(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0),  64(%0),  72(%0), 112(%0), (%1),     OP) \
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0),  72(%0),  80(%0), 120(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0),  80(%0),  88(%0), 128(%0), (%1),     OP) \
-                                                                        \
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0),  88(%0),  96(%0), 128(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0),  96(%0), 104(%0), 120(%0), (%1),     OP) \
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0), 104(%0), 112(%0), 112(%0), (%1, %3), OP) \
-                                                                        \
-        "add     $136, %0               \n\t"                           \
-        "add       %6, %1               \n\t"                           \
-        "decl      %2                   \n\t"                           \
-        "jnz       1b                   \n\t"                           \
-                                                                        \
-        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
-        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
-          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
-          "g"(4 - 14 * (x86_reg)dstStride)                              \
-        : "memory"                                                      \
-        );                                                              \
-}                                                                       \
-                                                                        \
-static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst,       \
-                                                    uint8_t *src,       \
-                                                    int dstStride,      \
-                                                    int srcStride)      \
-{                                                                       \
-    uint64_t temp[9 * 2];                                               \
-    uint64_t *temp_ptr = temp;                                          \
-    int count = 9;                                                      \
-                                                                        \
-    /* FIXME unroll */                                                  \
-    __asm__ volatile (                                                  \
-        "pxor      %%mm7, %%mm7         \n\t"                           \
-        "1:                             \n\t"                           \
-        "movq       (%0), %%mm0         \n\t"                           \
-        "movq       (%0), %%mm1         \n\t"                           \
-        "punpcklbw %%mm7, %%mm0         \n\t"                           \
-        "punpckhbw %%mm7, %%mm1         \n\t"                           \
-        "movq      %%mm0, (%1)          \n\t"                           \
-        "movq      %%mm1, 9*8(%1)       \n\t"                           \
-        "add          $8, %1            \n\t"                           \
-        "add          %3, %0            \n\t"                           \
-        "decl         %2                \n\t"                           \
-        "jnz          1b                \n\t"                           \
-        : "+r"(src), "+r"(temp_ptr), "+r"(count)                        \
-        : "r"((x86_reg)srcStride)                                       \
-        : "memory"                                                      \
-        );                                                              \
-                                                                        \
-    temp_ptr = temp;                                                    \
-    count    = 2;                                                       \
-                                                                        \
-    /* FIXME reorder for speed */                                       \
-    __asm__ volatile (                                                  \
-        /* "pxor  %%mm7, %%mm7            \n\t" */                      \
-        "1:                             \n\t"                           \
-        "movq    (%0), %%mm0            \n\t"                           \
-        "movq   8(%0), %%mm1            \n\t"                           \
-        "movq  16(%0), %%mm2            \n\t"                           \
-        "movq  24(%0), %%mm3            \n\t"                           \
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0),  8(%0),   (%0), 32(%0), (%1), OP)     \
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5,  8(%0),   (%0),   (%0), 40(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5,   (%0),   (%0),  8(%0), 48(%0), (%1), OP)     \
-                                                                        \
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5,   (%0),  8(%0), 16(%0), 56(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5,  8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)     \
-                                                                        \
-        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP) \
-        "add       %4, %1               \n\t"                           \
-        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)     \
-        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP) \
-                                                                        \
-        "add      $72, %0               \n\t"                           \
-        "add       %6, %1               \n\t"                           \
-        "decl      %2                   \n\t"                           \
-        "jnz       1b                   \n\t"                           \
-                                                                        \
-        : "+r"(temp_ptr), "+r"(dst), "+g"(count)                        \
-        : "r"((x86_reg)dstStride), "r"(2 * (x86_reg)dstStride),         \
-          /* "m"(ff_pw_20), "m"(ff_pw_3), */ "m"(ROUNDER),              \
-          "g"(4 - 6 * (x86_reg)dstStride)                               \
-        : "memory"                                                      \
-        );                                                              \
-}                                                                       \
-                                                                        \
 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src,   \
                                           int stride)                   \
 {                                                                       \
-    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);                     \
+    ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);              \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1198,16 +942,17 @@  static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t temp[8];                                                   \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
-                                                stride, 8);             \
-    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
+                                                   stride, 8);          \
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
+                                        stride, stride, 8);             \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                          int stride)                    \
 {                                                                       \
-    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,           \
-                                            stride, 8);                 \
+    ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,    \
+                                                   stride, 8);          \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1215,10 +960,10 @@  static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t temp[8];                                                   \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,           \
-                                                stride, 8);             \
-    OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,            \
-                                 stride, 8);                            \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
+                                                   stride, 8);          \
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,     \
+                                        stride, 8);                     \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1226,14 +971,17 @@  static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t temp[8];                                                   \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
-    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);    \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
+                                                   8, stride);          \
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
+                                        stride, stride, 8);             \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
                                          int stride)                    \
 {                                                                       \
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);  \
+    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src,            \
+                                                   stride, stride);     \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1241,9 +989,10 @@  static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t temp[8];                                                   \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);  \
-    OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,       \
-                                 stride, 8);                            \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
+                                                   8, stride);          \
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
+                                        stride, 8);                     \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1252,11 +1001,13 @@  static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
+                                        stride, 9);                     \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
+                                        stride, 8, 8);                  \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1265,12 +1016,13 @@  static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
-                                     stride, 9);                        \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
+                                        stride, 9);                     \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
+                                        stride, 8, 8);                  \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1279,11 +1031,13 @@  static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
+                                        stride, 9);                     \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
+                                        stride, 8, 8);                  \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1292,12 +1046,13 @@  static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
-                                     stride, 9);                        \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
+                                        stride, 9);                     \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
+                                        stride, 8, 8);                  \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1306,10 +1061,11 @@  static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);     \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
+                                        stride, 8, 8);                  \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1318,10 +1074,11 @@  static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH  = ((uint8_t*)half) + 64;                     \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);   \
-    OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, stride, 8, 8); \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
+    ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
+                                        stride, 8, 8);                  \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1329,10 +1086,12 @@  static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH = ((uint8_t*)half);                           \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);  \
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,              \
+                                        8, stride, 9);                  \
+    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
+                                                   stride, 8);          \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1340,11 +1099,12 @@  static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t half[8 + 9];                                               \
     uint8_t * const halfH = ((uint8_t*)half);                           \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,          \
-                                     stride, 9);                        \
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
+                                        stride, 9);                     \
+    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
+                                                   stride, 8);          \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
@@ -1352,15 +1112,16 @@  static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
 {                                                                       \
     uint64_t half[9];                                                   \
     uint8_t * const halfH = ((uint8_t*)half);                           \
-    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,          \
-                                                stride, 9);             \
-    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);     \
+    ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
+                                                   stride, 9);          \
+    ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
+                                                   stride, 8);          \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src,  \
                                            int stride)                  \
 {                                                                       \
-    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);                   \
+    ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);            \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1368,16 +1129,17 @@  static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t temp[32];                                                  \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
-                                                 stride, 16);           \
-    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
+                                                    stride, 16);        \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
+                                         stride, 16);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                           int stride)                   \
 {                                                                       \
-    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,                  \
-                                             stride, stride, 16);       \
+    ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,           \
+                                                    stride, stride, 16);\
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1385,10 +1147,10 @@  static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t temp[32];                                                  \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,         \
-                                                 stride, 16);           \
-    OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,                   \
-                                  stride, stride, 16);                  \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
+                                                    stride, 16);        \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,            \
+                                         stride, stride, 16);           \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1396,15 +1158,17 @@  static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t temp[32];                                                  \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
-                                                 stride);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);  \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
+                                                    stride);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
+                                         stride, 16);                   \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
                                           int stride)                   \
 {                                                                       \
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride); \
+    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src,           \
+                                                    stride, stride);    \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1412,10 +1176,10 @@  static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t temp[32];                                                  \
     uint8_t * const half = (uint8_t*)temp;                              \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,         \
-                                                 stride);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,                \
-                                  stride, stride, 16);                  \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
+                                                    stride);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,         \
+                                         stride, stride, 16);           \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1424,13 +1188,14 @@  static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
     uint64_t half[16 * 2 + 17 * 2];                                     \
     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
-                                      stride, 17);                      \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
-                                                 16, 16);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
+                                         stride, 17);                   \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
+                                                    16, 16);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
+                                         stride, 16, 16);               \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1439,13 +1204,14 @@  static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
     uint64_t half[16 * 2 + 17 * 2];                                     \
     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
-                                      stride, 17);                      \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
-                                                 16, 16);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
+                                         stride, 17);                   \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
+                                                    16, 16);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
+                                         stride, 16, 16);               \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1454,14 +1220,14 @@  static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
     uint64_t half[16 * 2 + 17 * 2];                                     \
     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
-                                      stride, 17);                      \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
-                                                 16, 16);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
-                                  16, 16);                              \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
+                                         stride, 17);                   \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
+                                                    16, 16);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
+                                         stride, 16, 16);               \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1470,14 +1236,14 @@  static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
     uint64_t half[16 * 2 + 17 * 2];                                     \
     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
-                                      stride, 17);                      \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
-                                                 16, 16);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
-                                  16, 16);                              \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
+                                         stride, 17);                   \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
+                                                    16, 16);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
+                                         stride, 16, 16);               \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1486,11 +1252,12 @@  static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
     uint64_t half[16 * 2 + 17 * 2];                                     \
     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
-                                                 16, 16);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);  \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
+                                                    16, 16);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
+                                         stride, 16, 16);               \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1499,12 +1266,12 @@  static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
     uint64_t half[16 * 2 + 17 * 2];                                     \
     uint8_t * const halfH  = ((uint8_t*)half) + 256;                    \
     uint8_t * const halfHV = ((uint8_t*)half);                          \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,         \
-                                                 16, 16);               \
-    OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, stride,      \
-                                  16, 16);                              \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
+                                                    16, 16);            \
+    ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
+                                         stride, 16, 16);               \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1512,11 +1279,12 @@  static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t half[17 * 2];                                              \
     uint8_t * const halfH = ((uint8_t*)half);                           \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,            \
-                                      stride, 17);                      \
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
+                                         stride, 17);                   \
+    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
+                                                    stride, 16);        \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1524,11 +1292,12 @@  static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t half[17 * 2];                                              \
     uint8_t * const halfH = ((uint8_t*)half);                           \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,        \
-                                      stride, 17);                      \
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
+                                         stride, 17);                   \
+    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
+                                                    stride, 16);        \
 }                                                                       \
                                                                         \
 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
@@ -1536,9 +1305,10 @@  static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
 {                                                                       \
     uint64_t half[17 * 2];                                              \
     uint8_t * const halfH = ((uint8_t*)half);                           \
-    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,        \
-                                                 stride, 17);           \
-    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);   \
+    ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
+                                                    stride, 17);        \
+    ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
+                                                    stride, 16);        \
 }
 
 #define PUT_OP(a, b, temp, size)                \
@@ -1549,13 +1319,13 @@  static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
     "pavgb          "#temp", "#a"       \n\t"   \
     "mov"#size"        "#a", "#b"       \n\t"
 
-QPEL_BASE(put_,        ff_pw_16, _,        PUT_OP)
-QPEL_BASE(avg_,        ff_pw_16, _,        AVG_MMXEXT_OP)
-QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP)
 QPEL_OP(put_,          ff_pw_16, _,        PUT_OP,        mmxext)
 QPEL_OP(avg_,          ff_pw_16, _,        AVG_MMXEXT_OP, mmxext)
 QPEL_OP(put_no_rnd_,   ff_pw_15, _no_rnd_, PUT_OP,        mmxext)
+#endif /* HAVE_YASM */
+
 
+#if HAVE_INLINE_ASM
 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
 {
   put_pixels8_xy2_mmx(dst, src, stride, 8);
@@ -1761,19 +1531,6 @@  void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
     avg_pixels16_mmx(dst, src, stride, 16);
 }
 
-/* VC-1-specific */
-void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
-                               int stride, int rnd)
-{
-    put_pixels8_mmx(dst, src, stride, 8);
-}
-
-void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
-                                  int stride, int rnd)
-{
-    avg_pixels8_mmxext(dst, src, stride, 8);
-}
-
 static void vector_clipf_sse(float *dst, const float *src,
                              float min, float max, int len)
 {
@@ -1950,7 +1707,7 @@  static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     const int bit_depth      = avctx->bits_per_raw_sample;
     const int high_bit_depth = bit_depth > 8;
 
-#if HAVE_INLINE_ASM
+#if HAVE_YASM
     SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
     SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
 
@@ -1960,47 +1717,49 @@  static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
 
     if (!high_bit_depth) {
-        c->put_pixels_tab[0][1] = put_pixels16_x2_mmxext;
-        c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
+        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
+        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
 
-        c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
-        c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
-        c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
+        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
+        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
+        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
 
-        c->put_pixels_tab[1][1] = put_pixels8_x2_mmxext;
-        c->put_pixels_tab[1][2] = put_pixels8_y2_mmxext;
+        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
+        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
 
-        c->avg_pixels_tab[1][0] = avg_pixels8_mmxext;
-        c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmxext;
-        c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmxext;
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
+        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
     }
 
     if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
         if (!high_bit_depth) {
-            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
-            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
-            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmxext;
-            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmxext;
+            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
+            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
+            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
+            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
-            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
-            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmxext;
+            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
+            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
         }
     }
+#endif /* HAVE_YASM */
 
+#if HAVE_INLINE_ASM
     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
         c->idct_put = ff_idct_xvid_mmxext_put;
         c->idct_add = ff_idct_xvid_mmxext_add;
         c->idct     = ff_idct_xvid_mmxext;
     }
+#endif /* HAVE_INLINE_ASM */
 
+#if HAVE_MMXEXT_EXTERNAL
     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
-        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmxext;
-        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmxext;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
     }
-#endif /* HAVE_INLINE_ASM */
 
-#if HAVE_MMXEXT_EXTERNAL
     if (!high_bit_depth && CONFIG_H264CHROMA) {
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
@@ -2034,41 +1793,39 @@  static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
 {
     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
-#if HAVE_INLINE_ASM
+#if HAVE_YASM
     if (!high_bit_depth) {
-        c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
-        c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
+        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
+        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
 
-        c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
-        c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
-        c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
+        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
+        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
+        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
 
-        c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
-        c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
+        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
+        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
 
-        c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
-        c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
-        c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
+        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
+        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
+        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
 
         if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
-            c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
-            c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
-            c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
-            c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
+            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
+            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
+            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
+            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 
-            c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
-            c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
+            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
+            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
         }
     }
 
     if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                                avctx->codec_id == AV_CODEC_ID_THEORA)) {
-        c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
-        c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
+        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
+        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
     }
-#endif /* HAVE_INLINE_ASM */
 
-#if HAVE_YASM
     if (!high_bit_depth && CONFIG_H264CHROMA) {
         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
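
For readers following the token pasting above: each QPEL_OP invocation stamps out the full set of qpel wrappers, which now delegate to the yasm-built ff_* helpers instead of the removed inline-asm versions. A minimal sketch of what the mc10 case of QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext) expands to — the prototypes here are illustrative assumptions; the real declarations come from the x86 headers:

    /* assumed prototypes of the yasm-built helpers, for illustration only */
    void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
                                             int dstStride, int srcStride,
                                             int h);
    void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
                                  int dstStride, int src1Stride, int h);

    static void put_qpel8_mc10_mmxext(uint8_t *dst, uint8_t *src, int stride)
    {
        uint64_t temp[8];
        uint8_t * const half = (uint8_t *) temp;
        /* MPEG-4 horizontal lowpass into an 8x8 scratch block */
        ff_put_mpeg4_qpel8_h_lowpass_mmxext(half, src, 8, stride, 8);
        /* average the filtered block with the unfiltered source
         * to obtain the quarter-pel (x = 1/4) position */
        ff_put_pixels8_l2_mmxext(dst, src, half, stride, stride, 8);
    }
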
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index a64ec41..5037aee 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -697,7 +697,9 @@  static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
 
 av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 {
+#if HAVE_YASM
         dsp->put_vc1_mspel_pixels_tab[ 0] = ff_put_vc1_mspel_mc00_mmx;
+#endif /* HAVE_YASM */
         dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
         dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
         dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
@@ -720,7 +722,9 @@  av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
 
 av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
 {
+#if HAVE_YASM
         dsp->avg_vc1_mspel_pixels_tab[ 0] = ff_avg_vc1_mspel_mc00_mmxext;
+#endif /* HAVE_YASM */
         dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
         dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
         dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
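
Taken together, the init changes follow one pattern: function pointers that used to point at static inline-asm helpers now point at the yasm-built, ff_-prefixed symbols, and each assignment moves under the guard matching where the code now lives (HAVE_YASM / HAVE_MMXEXT_EXTERNAL), while the surviving inline-asm paths stay under HAVE_INLINE_ASM. A condensed, self-contained sketch of that pattern — not the actual Libav code; the table-setup function is hypothetical, though the symbol name and op_pixels_func signature are taken from the hunks above:

    #include <stdint.h>

    typedef void (*op_pixels_func)(uint8_t *block, const uint8_t *pixels,
                                   int line_size, int h);

    #if HAVE_YASM
    /* provided by libavcodec/x86/dsputil.asm when built with yasm */
    void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                int line_size, int h);
    #endif

    /* hypothetical helper illustrating the dispatch change */
    static void init_avg_tab(op_pixels_func *avg_pixels_tab)
    {
    #if HAVE_YASM
        avg_pixels_tab[0] = ff_avg_pixels16_mmxext; /* was inline-asm avg_pixels16_mmxext */
    #endif
    }

On builds configured without yasm these entries are simply left at whatever the generic C init installed earlier, which should keep --disable-yasm builds working at the cost of the optimized paths.
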