[1/2] x86: Add missing movsxd for the int stride parameter

Message ID 1465766149-81265-1-git-send-email-martin@martin.st
State Committed
Commit f1a9eee41c4b5ea35db9ff0088ce4e6f1e187f2c
Headers show

Commit Message

Martin Storsjö June 12, 2016, 9:15 p.m.
---
Updated to try to add movsxdifnidn for all occurrences
of "int stride" in libavcodec/x86/h264_idct*.

Using plain movsxd instead of movsxdifnidn within
ifdef ARCH_X86_64.
---
 libavcodec/x86/h264_idct.asm       | 21 +++++++++++++++++++++
 libavcodec/x86/h264_idct_10bit.asm |  8 ++++++++
 2 files changed, 29 insertions(+)

Comments

Martin Storsjö June 15, 2016, 7:47 p.m. | #1
On Mon, 13 Jun 2016, Martin Storsjö wrote:

> ---
> Updated to try to add movsxdifnidn for all occurrances
> of "int stride" in libavcodec/x86/h264_idct*.
>
> Using plain movsxd instead of movsxdifnidn within
> ifdef ARCH_X86_64.
> ---
> libavcodec/x86/h264_idct.asm       | 21 +++++++++++++++++++++
> libavcodec/x86/h264_idct_10bit.asm |  8 ++++++++
> 2 files changed, 29 insertions(+)

If there are no comments on this one, I'll take
https://lists.libav.org/pipermail/libav-devel/2016-June/077593.html and
https://lists.libav.org/pipermail/libav-devel/2016-June/077594.html as an
OK for it and push.

// Martin
Luca Barbato June 15, 2016, 11:31 p.m. | #2
On 15/06/16 21:47, Martin Storsjö wrote:
> On Mon, 13 Jun 2016, Martin Storsjö wrote:
> 
>> ---
>> Updated to try to add movsxdifnidn for all occurrances
>> of "int stride" in libavcodec/x86/h264_idct*.
>>
>> Using plain movsxd instead of movsxdifnidn within
>> ifdef ARCH_X86_64.
>> ---
>> libavcodec/x86/h264_idct.asm       | 21 +++++++++++++++++++++
>> libavcodec/x86/h264_idct_10bit.asm |  8 ++++++++
>> 2 files changed, 29 insertions(+)
> 
> If there's no comments on this one, I'll take
> https://lists.libav.org/pipermail/libav-devel/2016-June/077593.html and
> https://lists.libav.org/pipermail/libav-devel/2016-June/077594.html as
> an ok for it and push.
> 

Sure.
Janne Grunau June 16, 2016, 6:38 a.m. | #3
On 2016-06-13 00:15:49 +0300, Martin Storsjö wrote:
> ---
> Updated to try to add movsxdifnidn for all occurrances
> of "int stride" in libavcodec/x86/h264_idct*.
> 
> Using plain movsxd instead of movsxdifnidn within
> ifdef ARCH_X86_64.
> ---
>  libavcodec/x86/h264_idct.asm       | 21 +++++++++++++++++++++
>  libavcodec/x86/h264_idct_10bit.asm |  8 ++++++++
>  2 files changed, 29 insertions(+)

ok

Janne

Patch

diff --git a/libavcodec/x86/h264_idct.asm b/libavcodec/x86/h264_idct.asm
index 313791a..eb99476 100644
--- a/libavcodec/x86/h264_idct.asm
+++ b/libavcodec/x86/h264_idct.asm
@@ -82,6 +82,7 @@  SECTION .text
 INIT_MMX mmx
 ; void ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct_add_8, 3, 3, 0
+    movsxdifnidn r2, r2d              ; stride is declared as int: sign-extend r2d into the full-width r2 before IDCT4_ADD uses it (presumably emits nothing where r2 and r2d are the same register -- confirm in x86inc.asm)
     IDCT4_ADD    r0, r1, r2
     RET
 
@@ -204,6 +205,7 @@  cglobal h264_idct_add_8, 3, 3, 0
 INIT_MMX mmx
 ; void ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct8_add_8, 3, 4, 0
+    movsxdifnidn r2, r2d
     %assign pad 128+4-(stack_offset&7)
     SUB         rsp, pad
 
@@ -272,6 +274,7 @@  cglobal h264_idct8_add_8, 3, 4, 0
 INIT_XMM sse2
 ; void ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct8_add_8, 3, 4, 10
+    movsxdifnidn  r2, r2d             ; stride is declared as int: sign-extend r2d into the full-width r2 before IDCT8_ADD_SSE uses it (presumably emits nothing where r2 and r2d are the same register -- confirm in x86inc.asm)
     IDCT8_ADD_SSE r0, r1, r2, r3
     RET
 
@@ -310,6 +313,7 @@  INIT_MMX mmxext
 ; void ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
 %if ARCH_X86_64
 cglobal h264_idct_dc_add_8, 3, 4, 0
+    movsxd       r2, r2d
     movsx        r3, word [r1]
     mov  dword [r1], 0
     DC_ADD_MMXEXT_INIT r3, r2
@@ -318,6 +322,7 @@  cglobal h264_idct_dc_add_8, 3, 4, 0
 
 ; void ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
 cglobal h264_idct8_dc_add_8, 3, 4, 0
+    movsxd       r2, r2d
     movsx        r3, word [r1]
     mov  dword [r1], 0
     DC_ADD_MMXEXT_INIT r3, r2
@@ -352,6 +357,7 @@  INIT_MMX mmx
 ;                               int16_t *block, int stride,
 ;                               const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
+    movsxdifnidn r3, r3d
     xor          r5, r5
 %ifdef PIC
     lea     picregq, [scan8_mem]
@@ -375,6 +381,7 @@  cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride,
 ;                               int16_t *block, int stride,
 ;                               const uint8_t nnzc[6 * 8])
 cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
+    movsxdifnidn r3, r3d
     %assign pad 128+4-(stack_offset&7)
     SUB         rsp, pad
 
@@ -409,6 +416,7 @@  INIT_MMX mmxext
 ;                                  int16_t *block, int stride,
 ;                                  const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
     xor          r5, r5
 %ifdef PIC
     lea     picregq, [scan8_mem]
@@ -456,6 +464,7 @@  INIT_MMX mmx
 ;                                    int16_t *block, int stride,
 ;                                    const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
+    movsxdifnidn r3, r3d
     xor          r5, r5
 %ifdef PIC
     lea     picregq, [scan8_mem]
@@ -481,6 +490,7 @@  INIT_MMX mmxext
 ;                                       int16_t *block, int stride,
 ;                                       const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
     xor          r5, r5
 %ifdef PIC
     lea     picregq, [scan8_mem]
@@ -525,6 +535,7 @@  cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, s
 ;                                  int16_t *block, int stride,
 ;                                  const uint8_t nnzc[6 * 8])
 cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
     %assign pad 128+4-(stack_offset&7)
     SUB         rsp, pad
 
@@ -587,6 +598,7 @@  INIT_XMM sse2
 ;                                int16_t *block, int stride,
 ;                                const uint8_t nnzc[6 * 8])
 cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
     xor          r5, r5
 %ifdef PIC
     lea     picregq, [scan8_mem]
@@ -638,6 +650,7 @@  INIT_XMM cpuname
 
 INIT_MMX mmx
 h264_idct_add8_mmx_plane:
+    movsxdifnidn r3, r3d
 .nextblock:
     movzx        r6, byte [scan8+r5]
     movzx        r6, byte [r4+r6]
@@ -664,6 +677,7 @@  h264_idct_add8_mmx_plane:
 ;                              int16_t *block, int stride,
 ;                              const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
     mov          r5, 16
     add          r2, 512
 %ifdef PIC
@@ -684,6 +698,7 @@  cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
     RET
 
 h264_idct_add8_mmxext_plane:
+    movsxdifnidn r3, r3d
 .nextblock:
     movzx        r6, byte [scan8+r5]
     movzx        r6, byte [r4+r6]
@@ -730,6 +745,7 @@  INIT_MMX mmxext
 ;                                 int16_t *block, int stride,
 ;                                 const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
+    movsxdifnidn r3, r3d
     mov          r5, 16
     add          r2, 512
 %if ARCH_X86_64
@@ -751,6 +767,7 @@  cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride,
 
 ; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
 h264_idct_dc_add8_mmxext:
+    movsxdifnidn r3, r3d
     movd         m0, [r2   ]          ;  0 0 X D
     mov word [r2+ 0], 0
     punpcklwd    m0, [r2+32]          ;  x X d D
@@ -771,6 +788,7 @@  ALIGN 16
 INIT_XMM sse2
 ; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
 h264_add8x4_idct_sse2:
+    movsxdifnidn r3, r3d
     movq   m0, [r2+ 0]
     movq   m1, [r2+ 8]
     movq   m2, [r2+16]
@@ -814,6 +832,7 @@  h264_add8x4_idct_sse2:
 ;                                int16_t *block, int stride,
 ;                                const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
+    movsxdifnidn r3, r3d
 %if ARCH_X86_64
     mov         r5, r0
 %endif
@@ -862,6 +881,7 @@  cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
 ;                                     int16_t *block, int stride,
 ;                                     const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
+    movsxdifnidn r3, r3d
 %if ARCH_X86_64
     mov         r7, r0
 %endif
@@ -914,6 +934,7 @@  cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
 ;                               int16_t *block, int stride,
 ;                               const uint8_t nnzc[6 * 8])
 cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
+    movsxdifnidn r3, r3d
     add          r2, 512
 %if ARCH_X86_64
     mov          r7, r0
diff --git a/libavcodec/x86/h264_idct_10bit.asm b/libavcodec/x86/h264_idct_10bit.asm
index b7d5105..432d74b 100644
--- a/libavcodec/x86/h264_idct_10bit.asm
+++ b/libavcodec/x86/h264_idct_10bit.asm
@@ -77,6 +77,7 @@  SECTION .text
 
 %macro IDCT_ADD_10 0
 cglobal h264_idct_add_10, 3,3
+    movsxdifnidn r2, r2d              ; r2 is presumably the int stride argument (this patch targets "int stride" parameters -- confirm against the C prototype): sign-extend r2d into the full-width r2 before IDCT4_ADD_10 uses it
     IDCT4_ADD_10 r0, r1, r2
     RET
 %endmacro
@@ -134,6 +135,7 @@  ADD4x4IDCT
 
 %macro IDCT_ADD16_10 0
 cglobal h264_idct_add16_10, 5,6
+    movsxdifnidn r3, r3d
     ADD16_OP 0, 4+1*8
     ADD16_OP 1, 5+1*8
     ADD16_OP 2, 4+2*8
@@ -190,6 +192,7 @@  IDCT_ADD16_10
 
 INIT_MMX mmxext
 cglobal h264_idct_dc_add_10,3,3
+    movsxdifnidn r2, r2d
     movd      m0, [r1]
     mov dword [r1], 0
     paddd     m0, [pd_32]
@@ -205,6 +208,7 @@  cglobal h264_idct_dc_add_10,3,3
 ;-----------------------------------------------------------------------------
 %macro IDCT8_DC_ADD 0
 cglobal h264_idct8_dc_add_10,3,4,7
+    movsxdifnidn r2, r2d
     movd      m0, [r1]
     mov dword[r1], 0
     paddd     m0, [pd_32]
@@ -272,6 +276,7 @@  idct_dc_add %+ SUFFIX:
     ret
 
 cglobal h264_idct_add16intra_10,5,7,8
+    movsxdifnidn r3, r3d
     ADD16_OP_INTRA 0, 4+1*8
     ADD16_OP_INTRA 2, 4+2*8
     ADD16_OP_INTRA 4, 6+1*8
@@ -304,6 +309,7 @@  IDCT_ADD16INTRA_10
 ;-----------------------------------------------------------------------------
 %macro IDCT_ADD8 0
 cglobal h264_idct_add8_10,5,8,7
+    movsxdifnidn r3, r3d
 %if ARCH_X86_64
     mov      r7, r0
 %endif
@@ -438,6 +444,7 @@  IDCT_ADD8
 
 %macro IDCT8_ADD 0
 cglobal h264_idct8_add_10, 3,4,16
+    movsxdifnidn r2, r2d
 %if UNIX64 == 0
     %assign pad 16-gprsize-(stack_offset&15)
     sub  rsp, pad
@@ -560,6 +567,7 @@  IDCT8_ADD
 
 %macro IDCT8_ADD4 0
 cglobal h264_idct8_add4_10, 0,7,16
+    movsxdifnidn r3, r3d
     %assign pad 16-gprsize-(stack_offset&15)
     SUB      rsp, pad
     mov       r5, r0mp