[07/14] vc1: Change type of array stride parameters to ptrdiff_t

Message ID 1474396595-14910-7-git-send-email-diego@biurrun.de
State New
Headers show

Commit Message

Diego Biurrun Sept. 20, 2016, 6:36 p.m.
This avoids SIMD-optimized functions having to sign-extend their
stride argument manually to be able to do pointer arithmetic.

Also rename all such parameters to "stride" for consistency.
---
 libavcodec/arm/vc1dsp_init_neon.c |  2 +-
 libavcodec/vc1.c                  | 11 ++++++-----
 libavcodec/vc1_block.c            | 21 +++++++++++++--------
 libavcodec/vc1_loopfilter.c       |  8 +++++---
 libavcodec/vc1_pred.c             |  9 ++++++---
 libavcodec/vc1dsp.c               | 26 +++++++++++++-------------
 libavcodec/vc1dsp.h               | 16 ++++++++--------
 libavcodec/x86/vc1dsp.asm         | 22 +++++++++++-----------
 libavcodec/x86/vc1dsp_init.c      | 16 ++++++++--------
 libavcodec/x86/vc1dsp_mmx.c       | 21 +++++++++++----------
 10 files changed, 82 insertions(+), 70 deletions(-)

Comments

Martin Storsjö Sept. 29, 2016, 11:24 a.m. | #1
On Tue, 20 Sep 2016, Diego Biurrun wrote:

> This avoids SIMD-optimized functions having to sign-extend their
> stride argument manually to be able to do pointer arithmetic.
>
> Also rename all such parameters to "stride" for consistency.
> ---
> libavcodec/arm/vc1dsp_init_neon.c |  2 +-
> libavcodec/vc1.c                  | 11 ++++++-----
> libavcodec/vc1_block.c            | 21 +++++++++++++--------
> libavcodec/vc1_loopfilter.c       |  8 +++++---
> libavcodec/vc1_pred.c             |  9 ++++++---
> libavcodec/vc1dsp.c               | 26 +++++++++++++-------------
> libavcodec/vc1dsp.h               | 16 ++++++++--------
> libavcodec/x86/vc1dsp.asm         | 22 +++++++++++-----------
> libavcodec/x86/vc1dsp_init.c      | 16 ++++++++--------
> libavcodec/x86/vc1dsp_mmx.c       | 21 +++++++++++----------
> 10 files changed, 82 insertions(+), 70 deletions(-)

> diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
> index adf08d7..9136ad9 100644
> --- a/libavcodec/x86/vc1dsp.asm
> +++ b/libavcodec/x86/vc1dsp.asm
> @@ -237,19 +237,19 @@ cglobal vc1_h_loop_filter_internal
>     VC1_H_LOOP_FILTER 4, r4
>     ret
>
> -; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
> +; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
> cglobal vc1_v_loop_filter4, 3,5,0
>     START_V_FILTER
>     call vc1_v_loop_filter_internal
>     RET
>

I don't see the corresponding asm simplification as the commit message 
touts. I.e., this is probably a latent bug; fix that first with the proper 
sign extensions before scrambling things by changing the signature.

// Martin
Diego Biurrun Sept. 29, 2016, 2:53 p.m. | #2
On Thu, Sep 29, 2016 at 02:24:32PM +0300, Martin Storsjö wrote:
> On Tue, 20 Sep 2016, Diego Biurrun wrote:
> 
> > This avoids SIMD-optimized functions having to sign-extend their
> > stride argument manually to be able to do pointer arithmetic.
> >
> > Also rename all such parameters to "stride" for consistency.
> > ---
> > libavcodec/arm/vc1dsp_init_neon.c |  2 +-
> > libavcodec/vc1.c                  | 11 ++++++-----
> > libavcodec/vc1_block.c            | 21 +++++++++++++--------
> > libavcodec/vc1_loopfilter.c       |  8 +++++---
> > libavcodec/vc1_pred.c             |  9 ++++++---
> > libavcodec/vc1dsp.c               | 26 +++++++++++++-------------
> > libavcodec/vc1dsp.h               | 16 ++++++++--------
> > libavcodec/x86/vc1dsp.asm         | 22 +++++++++++-----------
> > libavcodec/x86/vc1dsp_init.c      | 16 ++++++++--------
> > libavcodec/x86/vc1dsp_mmx.c       | 21 +++++++++++----------
> > 10 files changed, 82 insertions(+), 70 deletions(-)
> > --- a/libavcodec/x86/vc1dsp.asm
> > +++ b/libavcodec/x86/vc1dsp.asm
> > @@ -237,19 +237,19 @@ cglobal vc1_h_loop_filter_internal
> >     VC1_H_LOOP_FILTER 4, r4
> >     ret
> >
> > -; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
> > +; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
> > cglobal vc1_v_loop_filter4, 3,5,0
> >     START_V_FILTER
> >     call vc1_v_loop_filter_internal
> >     RET
> 
> I don't see the corresponding asm simplification as the commit message 
> touts. I.e., this is probably a latent bug; fix that first with the proper 
> sign extensions before scrambling things by changing the signature.

I think I just used the wrong log message on this one. Changed locally
to

    vc1: Change type of array stride parameters to ptrdiff_t

    ptrdiff_t is the correct type for array strides and similar.

    Also rename all such parameters to "stride" for consistency.

Diego
Martin Storsjö Sept. 29, 2016, 7:52 p.m. | #3
On Thu, 29 Sep 2016, Diego Biurrun wrote:

> On Thu, Sep 29, 2016 at 02:24:32PM +0300, Martin Storsjö wrote:
>> On Tue, 20 Sep 2016, Diego Biurrun wrote:
>> 
>> > This avoids SIMD-optimized functions having to sign-extend their
>> > stride argument manually to be able to do pointer arithmetic.
>> >
>> > Also rename all such parameters to "stride" for consistency.
>> > ---
>> > libavcodec/arm/vc1dsp_init_neon.c |  2 +-
>> > libavcodec/vc1.c                  | 11 ++++++-----
>> > libavcodec/vc1_block.c            | 21 +++++++++++++--------
>> > libavcodec/vc1_loopfilter.c       |  8 +++++---
>> > libavcodec/vc1_pred.c             |  9 ++++++---
>> > libavcodec/vc1dsp.c               | 26 +++++++++++++-------------
>> > libavcodec/vc1dsp.h               | 16 ++++++++--------
>> > libavcodec/x86/vc1dsp.asm         | 22 +++++++++++-----------
>> > libavcodec/x86/vc1dsp_init.c      | 16 ++++++++--------
>> > libavcodec/x86/vc1dsp_mmx.c       | 21 +++++++++++----------
>> > 10 files changed, 82 insertions(+), 70 deletions(-)
>> > --- a/libavcodec/x86/vc1dsp.asm
>> > +++ b/libavcodec/x86/vc1dsp.asm
>> > @@ -237,19 +237,19 @@ cglobal vc1_h_loop_filter_internal
>> >     VC1_H_LOOP_FILTER 4, r4
>> >     ret
>> >
>> > -; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
>> > +; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
>> > cglobal vc1_v_loop_filter4, 3,5,0
>> >     START_V_FILTER
>> >     call vc1_v_loop_filter_internal
>> >     RET
>> 
>> I don't see the corresponding asm simplification as the commit message 
>> touts. I.e., this is probably a latent bug; fix that first with the proper 
>> sign extensions before scrambling things by changing the signature.
>
> I think I just used the wrong log message on this one. Changed locally
> to

No, not really. There are three ways it can be:

1) The function actually doesn't use the stride parameter, and no change 
is needed

2) The function does use it correctly (e.g. doing a sign extension of it 
somewhere, or using it via register names like 'rNd' or so, making it 
explicit that it's a 32-bit parameter). In those cases, we should 
most probably update the asm accordingly, i.e. remove the sign extension, 
or use 'rN' instead of 'rNd'.

3) The function doesn't use it correctly right now, and we have a bug that 
should be fixed before we change the type.


In this case, I'm pretty sure that the parameter isn't unused in all those 
functions; that would be highly surprising.

At a quick glance, it seems like this parameter is used within the 
START_V/H_FILTER macros, as 'r1', which should probably be 'r1d' (or sign 
extended into r1 if one can't use r1d at those places - my x86 asm is very 
rusty).

In general, when changing the type from int to ptrdiff_t, there should be 
a change in every single asm function where you change the signature. 
Otherwise the parameter is either unused, or there's a bug. I think. (If 
you've spent more time looking at it and can explain why this isn't the 
case, then please do.)

// Martin

Patch

diff --git a/libavcodec/arm/vc1dsp_init_neon.c b/libavcodec/arm/vc1dsp_init_neon.c
index 08c07c4..944d184 100644
--- a/libavcodec/arm/vc1dsp_init_neon.c
+++ b/libavcodec/arm/vc1dsp_init_neon.c
@@ -35,7 +35,7 @@  void ff_vc1_inv_trans_8x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *bloc
 void ff_vc1_inv_trans_4x4_dc_neon(uint8_t *dest, ptrdiff_t stride, int16_t *block);
 
 void ff_put_pixels8x8_neon(uint8_t *block, const uint8_t *pixels,
-                           ptrdiff_t line_size, int rnd);
+                           ptrdiff_t stride, int rnd);
 
 void ff_put_vc1_mspel_mc10_neon(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride, int rnd);
diff --git a/libavcodec/vc1.c b/libavcodec/vc1.c
index 7a93e97..27e3071 100644
--- a/libavcodec/vc1.c
+++ b/libavcodec/vc1.c
@@ -64,8 +64,8 @@  enum Imode {
  * @param[in] height Height of this buffer
  * @param[in] stride of this buffer
  */
-static void decode_rowskip(uint8_t* plane, int width, int height, int stride,
-                           GetBitContext *gb)
+static void decode_rowskip(uint8_t* plane, int width, int height,
+                           ptrdiff_t stride, GetBitContext *gb)
 {
     int x, y;
 
@@ -86,8 +86,8 @@  static void decode_rowskip(uint8_t* plane, int width, int height, int stride,
  * @param[in] stride of this buffer
  * @todo FIXME: Optimize
  */
-static void decode_colskip(uint8_t* plane, int width, int height, int stride,
-                           GetBitContext *gb)
+static void decode_colskip(uint8_t* plane, int width, int height,
+                           ptrdiff_t stride, GetBitContext *gb)
 {
     int x, y;
 
@@ -115,7 +115,8 @@  static int bitplane_decoding(uint8_t* data, int *raw_flag, VC1Context *v)
 
     int imode, x, y, code, offset;
     uint8_t invert, *planep = data;
-    int width, height, stride;
+    int width, height;
+    ptrdiff_t stride;
 
     width  = v->s.mb_width;
     height = v->s.mb_height >> v->field_mode;
diff --git a/libavcodec/vc1_block.c b/libavcodec/vc1_block.c
index 0e1018c..eee32f2 100644
--- a/libavcodec/vc1_block.c
+++ b/libavcodec/vc1_block.c
@@ -82,8 +82,8 @@  static void vc1_put_signed_blocks_clamped(VC1Context *v)
 {
     MpegEncContext *s = &v->s;
     int topleft_mb_pos, top_mb_pos;
-    int stride_y, fieldtx = 0;
-    int v_dist;
+    int v_dist, fieldtx = 0;
+    ptrdiff_t stride_y;
 
     /* The put pixels loop is always one MB row behind the decoding loop,
      * because we can only put pixels when overlap filtering is done, and
@@ -348,7 +348,8 @@  static inline void vc1_b_mc(VC1Context *v, int dmv_x[2], int dmv_y[2],
 static inline int vc1_i_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
                                 int16_t **dc_val_ptr, int *dir_ptr)
 {
-    int a, b, c, wrap, pred, scale;
+    int a, b, c, pred, scale;
+    ptrdiff_t wrap;
     int16_t *dc_val;
     static const uint16_t dcpred[32] = {
         -1, 1024,  512,  341,  256,  205,  171,  146,  128,
@@ -414,7 +415,8 @@  static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
                               int a_avail, int c_avail,
                               int16_t **dc_val_ptr, int *dir_ptr)
 {
-    int a, b, c, wrap, pred;
+    int a, b, c, pred;
+    ptrdiff_t wrap;
     int16_t *dc_val;
     int mb_pos = s->mb_x + s->mb_y * s->mb_stride;
     int q1, q2 = 0;
@@ -490,7 +492,8 @@  static inline int ff_vc1_pred_dc(MpegEncContext *s, int overlap, int pq, int n,
 static inline int vc1_coded_block_pred(MpegEncContext * s, int n,
                                        uint8_t **coded_block_ptr)
 {
-    int xy, wrap, pred, a, b, c;
+    int xy, pred, a, b, c;
+    ptrdiff_t wrap;
 
     xy   = s->block_index[n];
     wrap = s->b8_stride;
@@ -1176,7 +1179,7 @@  static int vc1_decode_intra_block(VC1Context *v, int16_t block[64], int n,
  */
 static int vc1_decode_p_block(VC1Context *v, int16_t block[64], int n,
                               int mquant, int ttmb, int first_block,
-                              uint8_t *dst, int linesize, int skip_block,
+                              uint8_t *dst, ptrdiff_t linesize, int skip_block,
                               int *ttmb_out)
 {
     MpegEncContext *s = &v->s;
@@ -1600,7 +1603,8 @@  static int vc1_decode_p_mb_intfr(VC1Context *v)
     int skipped, fourmv = 0, twomv = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0, mvbp;
-    int stride_y, fieldtx;
+    int fieldtx;
+    ptrdiff_t stride_y;
 
     mquant = v->pq; /* Lossy initialization */
 
@@ -2248,7 +2252,8 @@  static int vc1_decode_b_mb_intfr(VC1Context *v)
     int skipped, direct, twomv = 0;
     int block_cbp = 0, pat, block_tt = 0;
     int idx_mbmode = 0, mvbp;
-    int stride_y, fieldtx;
+    int fieldtx;
+    ptrdiff_t stride_y;
     int bmvtype = BMV_TYPE_BACKWARD;
     int dir, dir2;
 
diff --git a/libavcodec/vc1_loopfilter.c b/libavcodec/vc1_loopfilter.c
index 52cff1e..de739a5 100644
--- a/libavcodec/vc1_loopfilter.c
+++ b/libavcodec/vc1_loopfilter.c
@@ -210,7 +210,8 @@  static av_always_inline void vc1_apply_p_v_loop_filter(VC1Context *v, int block_
         block_cbp      = mb_cbp      >> (block_num * 4), bottom_cbp,
         mb_is_intra    = v->is_intra[s->mb_x - s->mb_stride],
         block_is_intra = mb_is_intra >> (block_num * 4), bottom_is_intra;
-    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize, ttblk;
+    int idx, ttblk;
+    ptrdiff_t linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
     uint8_t *dst;
 
     if (block_num > 3) {
@@ -220,7 +221,7 @@  static av_always_inline void vc1_apply_p_v_loop_filter(VC1Context *v, int block_
     }
     if (s->mb_y != s->end_mb_y || block_num < 2) {
         int16_t (*mv)[2];
-        int mv_stride;
+        ptrdiff_t mv_stride;
 
         if (block_num > 3) {
             bottom_cbp      = v->cbp[s->mb_x]      >> (block_num * 4);
@@ -274,7 +275,8 @@  static av_always_inline void vc1_apply_p_h_loop_filter(VC1Context *v, int block_
         block_cbp      = mb_cbp      >> (block_num * 4), right_cbp,
         mb_is_intra    = v->is_intra[s->mb_x - 1 - s->mb_stride],
         block_is_intra = mb_is_intra >> block_num, right_is_intra;
-    int idx, linesize  = block_num > 3 ? s->uvlinesize : s->linesize, ttblk;
+    int idx, ttblk;
+    ptrdiff_t linesize  = block_num > 3 ? s->uvlinesize : s->linesize;
     uint8_t *dst;
 
     if (block_num > 3) {
diff --git a/libavcodec/vc1_pred.c b/libavcodec/vc1_pred.c
index 25be787..f806ab8 100644
--- a/libavcodec/vc1_pred.c
+++ b/libavcodec/vc1_pred.c
@@ -213,7 +213,8 @@  void ff_vc1_pred_mv(VC1Context *v, int n, int dmv_x, int dmv_y,
                     int pred_flag, int dir)
 {
     MpegEncContext *s = &v->s;
-    int xy, wrap, off = 0;
+    int xy, off = 0;
+    ptrdiff_t wrap;
     int16_t *A, *B, *C;
     int px, py;
     int sum;
@@ -466,7 +467,8 @@  void ff_vc1_pred_mv_intfr(VC1Context *v, int n, int dmv_x, int dmv_y,
                           int mvn, int r_x, int r_y, uint8_t* is_intra, int dir)
 {
     MpegEncContext *s = &v->s;
-    int xy, wrap, off = 0;
+    int xy, off = 0;
+    ptrdiff_t wrap;
     int A[2], B[2], C[2];
     int px = 0, py = 0;
     int a_valid = 0, b_valid = 0, c_valid = 0;
@@ -685,7 +687,8 @@  void ff_vc1_pred_b_mv(VC1Context *v, int dmv_x[2], int dmv_y[2],
                       int direct, int mvtype)
 {
     MpegEncContext *s = &v->s;
-    int xy, wrap, off = 0;
+    int xy, off = 0;
+    ptrdiff_t wrap;
     int16_t *A, *B, *C;
     int px, py;
     int sum;
diff --git a/libavcodec/vc1dsp.c b/libavcodec/vc1dsp.c
index 571309b..a36a9f4 100644
--- a/libavcodec/vc1dsp.c
+++ b/libavcodec/vc1dsp.c
@@ -31,7 +31,7 @@ 
 #include "startcode.h"
 
 /* Apply overlap transform to horizontal edge */
-static void vc1_v_overlap_c(uint8_t *src, int stride)
+static void vc1_v_overlap_c(uint8_t *src, ptrdiff_t stride)
 {
     int i;
     int a, b, c, d;
@@ -55,7 +55,7 @@  static void vc1_v_overlap_c(uint8_t *src, int stride)
 }
 
 /* Apply overlap transform to vertical edge */
-static void vc1_h_overlap_c(uint8_t *src, int stride)
+static void vc1_h_overlap_c(uint8_t *src, ptrdiff_t stride)
 {
     int i;
     int a, b, c, d;
@@ -138,7 +138,7 @@  static void vc1_h_s_overlap_c(int16_t *left, int16_t *right)
  * @return whether other 3 pairs should be filtered or not
  * @see 8.6
  */
-static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
+static av_always_inline int vc1_filter_line(uint8_t *src, ptrdiff_t stride, int pq)
 {
     int a0 = (2 * (src[-2 * stride] - src[1 * stride]) -
               5 * (src[-1 * stride] - src[0 * stride]) + 4) >> 3;
@@ -187,7 +187,7 @@  static av_always_inline int vc1_filter_line(uint8_t *src, int stride, int pq)
  * @param pq block quantizer
  * @see 8.6
  */
-static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
+static inline void vc1_loop_filter(uint8_t *src, int step, ptrdiff_t stride,
                                    int len, int pq)
 {
     int i;
@@ -204,32 +204,32 @@  static inline void vc1_loop_filter(uint8_t *src, int step, int stride,
     }
 }
 
-static void vc1_v_loop_filter4_c(uint8_t *src, int stride, int pq)
+static void vc1_v_loop_filter4_c(uint8_t *src, ptrdiff_t stride, int pq)
 {
     vc1_loop_filter(src, 1, stride, 4, pq);
 }
 
-static void vc1_h_loop_filter4_c(uint8_t *src, int stride, int pq)
+static void vc1_h_loop_filter4_c(uint8_t *src, ptrdiff_t stride, int pq)
 {
     vc1_loop_filter(src, stride, 1, 4, pq);
 }
 
-static void vc1_v_loop_filter8_c(uint8_t *src, int stride, int pq)
+static void vc1_v_loop_filter8_c(uint8_t *src, ptrdiff_t stride, int pq)
 {
     vc1_loop_filter(src, 1, stride, 8, pq);
 }
 
-static void vc1_h_loop_filter8_c(uint8_t *src, int stride, int pq)
+static void vc1_h_loop_filter8_c(uint8_t *src, ptrdiff_t stride, int pq)
 {
     vc1_loop_filter(src, stride, 1, 8, pq);
 }
 
-static void vc1_v_loop_filter16_c(uint8_t *src, int stride, int pq)
+static void vc1_v_loop_filter16_c(uint8_t *src, ptrdiff_t stride, int pq)
 {
     vc1_loop_filter(src, 1, stride, 16, pq);
 }
 
-static void vc1_h_loop_filter16_c(uint8_t *src, int stride, int pq)
+static void vc1_h_loop_filter16_c(uint8_t *src, ptrdiff_t stride, int pq)
 {
     vc1_loop_filter(src, stride, 1, 16, pq);
 }
@@ -538,7 +538,7 @@  static void vc1_inv_trans_4x4_c(uint8_t *dest, ptrdiff_t stride, int16_t *block)
 /* Filter in case of 2 filters */
 #define VC1_MSPEL_FILTER_16B(DIR, TYPE)                                       \
 static av_always_inline int vc1_mspel_ ## DIR ## _filter_16bits(const TYPE *src, \
-                                                                int stride,   \
+                                                                ptrdiff_t stride, \
                                                                 int mode)     \
 {                                                                             \
     switch(mode) {                                                            \
@@ -561,7 +561,7 @@  VC1_MSPEL_FILTER_16B(ver, uint8_t)
 VC1_MSPEL_FILTER_16B(hor, int16_t)
 
 /* Filter used to interpolate fractional pel values */
-static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
+static av_always_inline int vc1_mspel_filter(const uint8_t *src, ptrdiff_t stride,
                                              int mode, int r)
 {
     switch (mode) {
@@ -584,7 +584,7 @@  static av_always_inline int vc1_mspel_filter(const uint8_t *src, int stride,
 #define VC1_MSPEL_MC(OP, OPNAME)                                              \
 static av_always_inline void OPNAME ## vc1_mspel_mc(uint8_t *dst,             \
                                                     const uint8_t *src,       \
-                                                    int stride,               \
+                                                    ptrdiff_t stride,               \
                                                     int hmode,                \
                                                     int vmode,                \
                                                     int rnd)                  \
diff --git a/libavcodec/vc1dsp.h b/libavcodec/vc1dsp.h
index a9bd712..41663df 100644
--- a/libavcodec/vc1dsp.h
+++ b/libavcodec/vc1dsp.h
@@ -40,16 +40,16 @@  typedef struct VC1DSPContext {
     void (*vc1_inv_trans_8x4_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
     void (*vc1_inv_trans_4x8_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
     void (*vc1_inv_trans_4x4_dc)(uint8_t *dest, ptrdiff_t stride, int16_t *block);
-    void (*vc1_v_overlap)(uint8_t *src, int stride);
-    void (*vc1_h_overlap)(uint8_t *src, int stride);
+    void (*vc1_v_overlap)(uint8_t *src, ptrdiff_t stride);
+    void (*vc1_h_overlap)(uint8_t *src, ptrdiff_t stride);
     void (*vc1_v_s_overlap)(int16_t *top,  int16_t *bottom);
     void (*vc1_h_s_overlap)(int16_t *left, int16_t *right);
-    void (*vc1_v_loop_filter4)(uint8_t *src, int stride, int pq);
-    void (*vc1_h_loop_filter4)(uint8_t *src, int stride, int pq);
-    void (*vc1_v_loop_filter8)(uint8_t *src, int stride, int pq);
-    void (*vc1_h_loop_filter8)(uint8_t *src, int stride, int pq);
-    void (*vc1_v_loop_filter16)(uint8_t *src, int stride, int pq);
-    void (*vc1_h_loop_filter16)(uint8_t *src, int stride, int pq);
+    void (*vc1_v_loop_filter4)(uint8_t *src, ptrdiff_t stride, int pq);
+    void (*vc1_h_loop_filter4)(uint8_t *src, ptrdiff_t stride, int pq);
+    void (*vc1_v_loop_filter8)(uint8_t *src, ptrdiff_t stride, int pq);
+    void (*vc1_h_loop_filter8)(uint8_t *src, ptrdiff_t stride, int pq);
+    void (*vc1_v_loop_filter16)(uint8_t *src, ptrdiff_t stride, int pq);
+    void (*vc1_h_loop_filter16)(uint8_t *src, ptrdiff_t stride, int pq);
 
     /* put 8x8 block with bicubic interpolation and quarterpel precision
      * last argument is actually round value instead of height
diff --git a/libavcodec/x86/vc1dsp.asm b/libavcodec/x86/vc1dsp.asm
index adf08d7..9136ad9 100644
--- a/libavcodec/x86/vc1dsp.asm
+++ b/libavcodec/x86/vc1dsp.asm
@@ -237,19 +237,19 @@  cglobal vc1_h_loop_filter_internal
     VC1_H_LOOP_FILTER 4, r4
     ret
 
-; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
+; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_v_loop_filter4, 3,5,0
     START_V_FILTER
     call vc1_v_loop_filter_internal
     RET
 
-; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
+; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_h_loop_filter4, 3,5,0
     START_H_FILTER 4
     call vc1_h_loop_filter_internal
     RET
 
-; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
+; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_v_loop_filter8, 3,5,0
     START_V_FILTER
     call vc1_v_loop_filter_internal
@@ -258,7 +258,7 @@  cglobal vc1_v_loop_filter8, 3,5,0
     call vc1_v_loop_filter_internal
     RET
 
-; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
+; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_h_loop_filter8, 3,5,0
     START_H_FILTER 4
     call vc1_h_loop_filter_internal
@@ -271,46 +271,46 @@  INIT_MMX mmxext
 VC1_LF
 
 INIT_XMM sse2
-; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_v_loop_filter8, 3,5,8
     START_V_FILTER
     VC1_V_LOOP_FILTER 8, q
     RET
 
-; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
+; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_h_loop_filter8, 3,6,8
     START_H_FILTER 8
     VC1_H_LOOP_FILTER 8, r5
     RET
 
 INIT_MMX ssse3
-; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_v_loop_filter4, 3,5,0
     START_V_FILTER
     VC1_V_LOOP_FILTER 4, d
     RET
 
-; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
+; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_h_loop_filter4, 3,5,0
     START_H_FILTER 4
     VC1_H_LOOP_FILTER 4, r4
     RET
 
 INIT_XMM ssse3
-; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_v_loop_filter8, 3,5,8
     START_V_FILTER
     VC1_V_LOOP_FILTER 8, q
     RET
 
-; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
+; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_h_loop_filter8, 3,6,8
     START_H_FILTER 8
     VC1_H_LOOP_FILTER 8, r5
     RET
 
 INIT_XMM sse4
-; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
+; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, ptrdiff_t stride, int pq)
 cglobal vc1_h_loop_filter8, 3,5,8
     START_H_FILTER 8
     VC1_H_LOOP_FILTER 8
diff --git a/libavcodec/x86/vc1dsp_init.c b/libavcodec/x86/vc1dsp_init.c
index 8982ff9..fda6c3c 100644
--- a/libavcodec/x86/vc1dsp_init.c
+++ b/libavcodec/x86/vc1dsp_init.c
@@ -33,18 +33,18 @@ 
 #include "config.h"
 
 #define LOOP_FILTER(EXT) \
-void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
-void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
-void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
-void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
+void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, ptrdiff_t stride, int pq); \
+void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, ptrdiff_t stride, int pq); \
+void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, ptrdiff_t stride, int pq); \
+void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, ptrdiff_t stride, int pq); \
 \
-static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
+static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, ptrdiff_t stride, int pq) \
 { \
     ff_vc1_v_loop_filter8_ ## EXT(src,   stride, pq); \
     ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
 } \
 \
-static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
+static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, ptrdiff_t stride, int pq) \
 { \
     ff_vc1_h_loop_filter8_ ## EXT(src,          stride, pq); \
     ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
@@ -55,9 +55,9 @@  LOOP_FILTER(mmxext)
 LOOP_FILTER(sse2)
 LOOP_FILTER(ssse3)
 
-void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
+void ff_vc1_h_loop_filter8_sse4(uint8_t *src, ptrdiff_t stride, int pq);
 
-static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
+static void vc1_h_loop_filter16_sse4(uint8_t *src, ptrdiff_t stride, int pq)
 {
     ff_vc1_h_loop_filter8_sse4(src,          stride, pq);
     ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
diff --git a/libavcodec/x86/vc1dsp_mmx.c b/libavcodec/x86/vc1dsp_mmx.c
index d64ddf0..070b35a 100644
--- a/libavcodec/x86/vc1dsp_mmx.c
+++ b/libavcodec/x86/vc1dsp_mmx.c
@@ -83,7 +83,7 @@ 
 
 /** Sacrificing mm6 allows to pipeline loads from src */
 static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
-                                       const uint8_t *src, x86_reg stride,
+                                       const uint8_t *src, ptrdiff_t stride,
                                        int rnd, int64_t shift)
 {
     __asm__ volatile(
@@ -120,7 +120,7 @@  static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
  * memory.
  */
 #define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
-static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
+static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, ptrdiff_t stride,\
                                              const int16_t *src, int rnd)\
 {\
     int h = 8;\
@@ -169,7 +169,7 @@  VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
  */
 #define VC1_SHIFT2(OP, OPNAME)\
 static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
-                                     x86_reg stride, int rnd, x86_reg offset)\
+                                     ptrdiff_t stride, int rnd, x86_reg offset)\
 {\
     rnd = 8-rnd;\
     __asm__ volatile(\
@@ -273,7 +273,7 @@  VC1_SHIFT2(OP_AVG, avg_)
 #define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4)                    \
 static void                                                             \
 vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
-                                 x86_reg src_stride,                   \
+                                 ptrdiff_t src_stride,                  \
                                  int rnd, int64_t shift)                \
 {                                                                       \
     int h = 8;                                                          \
@@ -328,7 +328,7 @@  vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src,      \
  */
 #define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME)        \
 static void                                                             \
-OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
+OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, ptrdiff_t stride,  \
                                  const int16_t *src, int rnd)           \
 {                                                                       \
     int h = 8;                                                          \
@@ -367,7 +367,8 @@  OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride,    \
 #define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME)             \
 static void                                                             \
 OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src,         \
-                        x86_reg stride, int rnd, x86_reg offset)      \
+                              ptrdiff_t stride, int rnd,                \
+                              x86_reg offset)                           \
 {                                                                       \
     int h = 8;                                                          \
     src -= offset;                                                      \
@@ -405,9 +406,9 @@  MSPEL_FILTER13_VER_16B(shift3, "0(%1     )", "0(%1,%3  )", "0(%1,%3,2)", "0(%1,%
 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
 MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
 
-typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
-typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
-typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
+typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, ptrdiff_t src_stride, int rnd, int64_t shift);
+typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, ptrdiff_t dst_stride, const int16_t *src, int rnd);
+typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, ptrdiff_t stride, int rnd, x86_reg offset);
 
 /**
  * Interpolate fractional pel values by applying proper vertical then
@@ -421,7 +422,7 @@  typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_
  * @param  rnd     Rounding bias.
  */
 #define VC1_MSPEL_MC(OP)\
-static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
+static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, ptrdiff_t stride,\
                                int hmode, int vmode, int rnd)\
 {\
     static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\