[06/23] vp9lpf/x86: add an SSE2 version of vp9_loop_filter_[vh]_88_16

Message ID 1475436610-25770-6-git-send-email-anton@khirnov.net
State Committed
Commit 92d47550ea099fde8c6f4443c94ec768e19ffd26
Headers show

Commit Message

Anton Khirnov Oct. 2, 2016, 7:29 p.m.
From: James Almer <jamrial@gmail.com>

Gains are similar to those of the ssse3 version, once again.

Additional improvements by Clément Bœsch <u@pkh.me>.

Signed-off-by: James Almer <jamrial@gmail.com>
Signed-off-by: Anton Khirnov <anton@khirnov.net>
---
 libavcodec/x86/vp9dsp_init.c |  3 +++
 libavcodec/x86/vp9lpf.asm    | 20 +++++++++++++++++---
 2 files changed, 20 insertions(+), 3 deletions(-)

Patch

diff --git a/libavcodec/x86/vp9dsp_init.c b/libavcodec/x86/vp9dsp_init.c
index 00a5798..37d53d2 100644
--- a/libavcodec/x86/vp9dsp_init.c
+++ b/libavcodec/x86/vp9dsp_init.c
@@ -226,6 +226,7 @@  void ff_vp9_loop_filter_h_##size1##_##size2##_##opt(uint8_t *dst, ptrdiff_t stri
 lpf_funcs(16, 16, sse2);
 lpf_funcs(16, 16, ssse3);
 lpf_funcs(16, 16, avx);
+lpf_funcs(88, 16, sse2);
 lpf_funcs(88, 16, ssse3);
 lpf_funcs(88, 16, avx);
 
@@ -293,6 +294,8 @@  av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
         init_fpel(1, 1, 32, avg, sse2);
         init_fpel(0, 1, 64, avg, sse2);
         if (ARCH_X86_64) {
+            dsp->loop_filter_mix2[1][1][0] = ff_vp9_loop_filter_h_88_16_sse2;
+            dsp->loop_filter_mix2[1][1][1] = ff_vp9_loop_filter_v_88_16_sse2;
             dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_sse2;
             dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_sse2;
         }
diff --git a/libavcodec/x86/vp9lpf.asm b/libavcodec/x86/vp9lpf.asm
index 183f3f6..bde3fcb 100644
--- a/libavcodec/x86/vp9lpf.asm
+++ b/libavcodec/x86/vp9lpf.asm
@@ -292,6 +292,17 @@  SECTION .text
 %define Q7 dst2q +  strideq
 %endmacro
 
+; ..............AB -> AAAAAAAABBBBBBBB
+%macro SPLATB_MIX 1-2 [mask_mix]
+%if cpuflag(ssse3)
+    pshufb     %1, %2
+%else
+    punpcklbw  %1, %1
+    punpcklwd  %1, %1
+    punpckldq  %1, %1
+%endif
+%endmacro
+
 %macro LOOPFILTER 2 ; %1=v/h %2=size1
     lea mstrideq, [strideq]
     neg mstrideq
@@ -382,11 +393,13 @@  SECTION .text
     SPLATB_REG          m2, I, m0                       ; I I I I ...
     SPLATB_REG          m3, E, m0                       ; E E E E ...
 %elif %2 == 88
+%if cpuflag(ssse3)
     mova                m0, [mask_mix]
+%endif
     movd                m2, Id
     movd                m3, Ed
-    pshufb              m2, m0
-    pshufb              m3, m0
+    SPLATB_MIX          m2, m0
+    SPLATB_MIX          m3, m0
 %endif
     mova                m0, [pb_80]
     pxor                m2, m0
@@ -446,7 +459,7 @@  SECTION .text
     SPLATB_REG          m7, H, m0                       ; H H H H ...
 %else
     movd                m7, Hd
-    pshufb              m7, [mask_mix]
+    SPLATB_MIX          m7
 %endif
     pxor                m7, m8
     pxor                m4, m8
@@ -727,6 +740,7 @@  LPF_16_16_VH sse2
 LPF_16_16_VH ssse3
 LPF_16_16_VH avx
 
+LPF_88_16_VH sse2
 LPF_88_16_VH ssse3
 LPF_88_16_VH avx