[22/23] vp9lpf/x86: make filter_48/84/88_h work on 32-bit.

Message ID 1475436610-25770-22-git-send-email-anton@khirnov.net
State Committed
Commit 8915320db94c9b3ceb97d6ad92addda690af8c18
Headers show

Commit Message

Anton Khirnov Oct. 2, 2016, 7:30 p.m.
From: "Ronald S. Bultje" <rsbultje@gmail.com>

Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/x86/vp9dsp_init.c | 12 +++------
 libavcodec/x86/vp9lpf.asm    | 62 ++++++++++++++++++++++++++++++++------------
 2 files changed, 48 insertions(+), 26 deletions(-)

Patch

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 6438644..76ea48f 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -285,17 +285,11 @@  av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
     dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_##opt; \
     dsp->loop_filter_mix2[0][0][0] = ff_vp9_loop_filter_h_44_16_##opt; \
     dsp->loop_filter_mix2[0][0][1] = ff_vp9_loop_filter_v_44_16_##opt; \
-    if (ARCH_X86_64) { \
-        dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
-    } \
+    dsp->loop_filter_mix2[0][1][0] = ff_vp9_loop_filter_h_48_16_##opt; \
     dsp->loop_filter_mix2[0][1][1] = ff_vp9_loop_filter_v_48_16_##opt; \
-    if (ARCH_X86_64) { \
-        dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
-    } \
+    dsp->loop_filter_mix2[1][0][0] = ff_vp9_loop_filter_h_84_16_##opt; \
     dsp->loop_filter_mix2[1][0][1] = ff_vp9_loop_filter_v_84_16_##opt; \
-    if (ARCH_X86_64) { \
-        dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
-    } \
+    dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_##opt; \
     dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_##opt; \
 } while (0)
 
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 881bdab..c20eeb8 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -939,9 +939,12 @@  cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     mova                    m3, [P0]
     mova                    m4, [Q0]
     mova                    m5, [Q1]
+%if ARCH_X86_64
     mova                    m6, [Q2]
+%endif
     mova                    m7, [Q3]
     DEFINE_REAL_P7_TO_Q7
+%if ARCH_X86_64
     SBUTTERFLY  bw,  0,  1, 8
     SBUTTERFLY  bw,  2,  3, 8
     SBUTTERFLY  bw,  4,  5, 8
@@ -954,22 +957,47 @@  cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
     SBUTTERFLY  dq,  1,  5, 8
     SBUTTERFLY  dq,  2,  6, 8
     SBUTTERFLY  dq,  3,  7, 8
-    movh   [P7], m0
-    movhps [P6], m0
-    movh   [Q0], m1
-    movhps [Q1], m1
-    movh   [P3], m2
-    movhps [P2], m2
-    movh   [Q4], m3
-    movhps [Q5], m3
-    movh   [P5], m4
-    movhps [P4], m4
-    movh   [Q2], m5
-    movhps [Q3], m5
-    movh   [P1], m6
-    movhps [P0], m6
-    movh   [Q6], m7
-    movhps [Q7], m7
+%else
+    SBUTTERFLY  bw,  0,  1, 6
+    mova  [rsp+64], m1
+    mova        m6, [rsp+96]
+    SBUTTERFLY  bw,  2,  3, 1
+    SBUTTERFLY  bw,  4,  5, 1
+    SBUTTERFLY  bw,  6,  7, 1
+    SBUTTERFLY  wd,  0,  2, 1
+    mova  [rsp+96], m2
+    mova        m1, [rsp+64]
+    SBUTTERFLY  wd,  1,  3, 2
+    SBUTTERFLY  wd,  4,  6, 2
+    SBUTTERFLY  wd,  5,  7, 2
+    SBUTTERFLY  dq,  0,  4, 2
+    SBUTTERFLY  dq,  1,  5, 2
+    movh      [Q0], m1
+    movhps    [Q1], m1
+    mova        m2, [rsp+96]
+    SBUTTERFLY  dq,  2,  6, 1
+    SBUTTERFLY  dq,  3,  7, 1
+%endif
+    SWAP         3, 6
+    SWAP         1, 4
+    movh      [P7], m0
+    movhps    [P6], m0
+    movh      [P5], m1
+    movhps    [P4], m1
+    movh      [P3], m2
+    movhps    [P2], m2
+    movh      [P1], m3
+    movhps    [P0], m3
+%if ARCH_X86_64
+    movh      [Q0], m4
+    movhps    [Q1], m4
+%endif
+    movh      [Q2], m5
+    movhps    [Q3], m5
+    movh      [Q4], m6
+    movhps    [Q5], m6
+    movh      [Q6], m7
+    movhps    [Q7], m7
 %endif
 %endif
 
@@ -979,7 +1007,7 @@  cglobal vp9_loop_filter_%1_%2_16, 2, 6, 16, %3 + %4 + %5, dst, stride, mstride,
 %macro LPF_16_VH 5
 INIT_XMM %5
 LOOPFILTER v, %1, %2,  0, %4
-%if ARCH_X86_64 || %1 == 44
+%if ARCH_X86_64 || %1 != 16
 LOOPFILTER h, %1, %2, %3, %4
 %endif
 %endmacro