[06/15] hevc: Add SSE4 MC functions

Message ID 53AC7955.20003@gmail.com
State New
Headers show

Commit Message

James Almer June 26, 2014, 7:49 p.m.
---
This applies cleanly after PATCH 14/15 and, of course, requires the corresponding
changes to hevc_init.c.
I think I got every function right, but in any case fixing any of them is a
single-line change.

In the end, out of 190 functions, only 44 actually require SSE4.

 libavcodec/x86/hevc_mc.asm | 363 +++++++++++++++++++++++++++++++++++----------
 1 file changed, 281 insertions(+), 82 deletions(-)

-- 1.8.5.5

Comments

Luca Barbato June 26, 2014, 7:59 p.m. | #1
On 26/06/14 21:49, James Almer wrote:
> ---
> This applies cleanly after PATCH 14/15, and of course requires relevant changes to 
> hevc_init.c
> I think i got every function right, but in any case fixing any of them is a single 
> line change.
> 
> In the end, out of 190 functions, only 44 were SSE4.
> 

Great! This weekend I'll make sure to get it sorted =)

Thanks a lot!

lu

Patch

diff --git a/libavcodec/x86/hevc_mc.asm b/libavcodec/x86/hevc_mc.asm
index dac3295..4696fa8 100644
--- a/libavcodec/x86/hevc_mc.asm
+++ b/libavcodec/x86/hevc_mc.asm
@@ -30,8 +30,8 @@  zero:                   times 4  dd 0
 one_per_32:             times 4  dd 1
 
 SECTION .text
-%macro EPEL_TABLE 4
-hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
+%macro EPEL_TABLE 3
+hevc_epel_filters_%1 times %2 d%3 -2, 58
                         times %2 d%3 10, -2
                         times %2 d%3 -4, 54
                         times %2 d%3 16, -2
@@ -49,11 +49,11 @@  hevc_epel_filters_%4_%1 times %2 d%3 -2, 58
 
 
 
-EPEL_TABLE  8, 8, b, sse4
-EPEL_TABLE 10, 4, w, sse4
+EPEL_TABLE  8, 8, b
+EPEL_TABLE 10, 4, w
 
-%macro QPEL_TABLE 4
-hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
+%macro QPEL_TABLE 3
+hevc_qpel_filters_%1 times %2 d%3  -1,  4
                         times %2 d%3 -10, 58
                         times %2 d%3  17, -5
                         times %2 d%3   1,  0
@@ -67,10 +67,10 @@  hevc_qpel_filters_%4_%1 times %2 d%3  -1,  4
                         times %2 d%3   4, -1
 %endmacro
 
-QPEL_TABLE  8, 8, b, sse4
-QPEL_TABLE 10, 4, w, sse4
+QPEL_TABLE  8, 8, b
+QPEL_TABLE 10, 4, w
 
-%define hevc_qpel_filters_sse4_14 hevc_qpel_filters_sse4_10
+%define hevc_qpel_filters_14 hevc_qpel_filters_10
 
 %if ARCH_X86_64
 
@@ -114,9 +114,9 @@  QPEL_TABLE 10, 4, w, sse4
 
 %macro EPEL_FILTER 2-4                            ; bit depth, filter index
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+    lea         rfilterq, [hevc_epel_filters_%1]
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define rfilterq hevc_epel_filters_%1
 %endif
     sub              %2q, 1
     shl              %2q, 5                      ; multiply by 32
@@ -131,9 +131,9 @@  QPEL_TABLE 10, 4, w, sse4
 
 %macro EPEL_HV_FILTER 1
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_%1]
+    lea         rfilterq, [hevc_epel_filters_%1]
 %else
-    %define rfilterq hevc_epel_filters_sse4_%1
+    %define rfilterq hevc_epel_filters_%1
 %endif
     sub              mxq, 1
     sub              myq, 1
@@ -144,9 +144,9 @@  QPEL_TABLE 10, 4, w, sse4
     lea           r3srcq, [srcstrideq*3]
 
 %ifdef PIC
-    lea         rfilterq, [hevc_epel_filters_sse4_10]
+    lea         rfilterq, [hevc_epel_filters_10]
 %else
-    %define rfilterq hevc_epel_filters_sse4_10
+    %define rfilterq hevc_epel_filters_10
 %endif
     movdqa           m12, [rfilterq + myq]        ; get 2 first values of filters
     movdqa           m13, [rfilterq + myq+16]     ; get 2 last values of filters
@@ -154,9 +154,9 @@  QPEL_TABLE 10, 4, w, sse4
 
 %macro QPEL_FILTER 2
 %ifdef PIC
-    lea         rfilterq, [hevc_qpel_filters_sse4_%1]
+    lea         rfilterq, [hevc_qpel_filters_%1]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%1
+    %define rfilterq hevc_qpel_filters_%1
 %endif
     lea              %2q, [%2q*8-8]
     movdqa           m12, [rfilterq + %2q*8]       ; get 4 first values of filters
@@ -389,9 +389,9 @@  QPEL_TABLE 10, 4, w, sse4
 
 %macro QPEL_HV_COMPUTE 4     ; width, bitdepth, filter idx
 %ifdef PIC
-    lea         rfilterq, [hevc_qpel_filters_sse4_%2]
+    lea         rfilterq, [hevc_qpel_filters_%2]
 %else
-    %define rfilterq hevc_qpel_filters_sse4_%2
+    %define rfilterq hevc_qpel_filters_%2
 %endif
 
 %if %2 == 8
@@ -498,7 +498,6 @@  QPEL_TABLE 10, 4, w, sse4
 %endif
 %endmacro
 
-INIT_XMM sse4                                    ; adds ff_ and _sse4 to function name
 ; ******************************
 ; void put_hevc_mc_pixels(int16_t *dst, ptrdiff_t dststride,
 ;                         uint8_t *_src, ptrdiff_t _srcstride,
@@ -514,7 +513,9 @@  cglobal hevc_put_hevc_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,h
     PEL_10STORE%1     dstq, m0, m1
     LOOP_END         dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_PEL_PIXELS 2
 cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstride,height
     pxor              m2, m2
 .loop
@@ -525,7 +526,9 @@  cglobal hevc_put_hevc_uni_pel_pixels%1_%2, 5, 5, 3, dst, dststride, src, srcstri
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_BI_PEL_PIXELS 2
 cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstride, src2, src2stride,height
     pxor              m2, m2
     movdqa            m5, [pw_bi_%2]
@@ -541,9 +544,44 @@  cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstrid
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
-
 %endmacro
 
+INIT_XMM sse2
+HEVC_PUT_HEVC_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_PEL_PIXELS  6, 8
+HEVC_PUT_HEVC_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_PEL_PIXELS 16, 8
+HEVC_PUT_HEVC_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_PEL_PIXELS 8, 10
+
+HEVC_PUT_HEVC_UNI_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_UNI_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_UNI_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_UNI_PEL_PIXELS 16, 8
+HEVC_PUT_HEVC_UNI_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_UNI_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_UNI_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_UNI_PEL_PIXELS 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_BI_PEL_PIXELS  4, 8
+HEVC_PUT_HEVC_BI_PEL_PIXELS  8, 8
+HEVC_PUT_HEVC_BI_PEL_PIXELS 12, 8
+HEVC_PUT_HEVC_BI_PEL_PIXELS 16, 8
+HEVC_PUT_HEVC_BI_PEL_PIXELS 2, 10
+HEVC_PUT_HEVC_BI_PEL_PIXELS 4, 10
+HEVC_PUT_HEVC_BI_PEL_PIXELS 6, 10
+HEVC_PUT_HEVC_BI_PEL_PIXELS 8, 10
+
+INIT_XMM sse4
+HEVC_PUT_HEVC_UNI_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_UNI_PEL_PIXELS  6, 8
+HEVC_PUT_HEVC_BI_PEL_PIXELS  2, 8
+HEVC_PUT_HEVC_BI_PEL_PIXELS  6, 8
 
 ; ******************************
 ; void put_hevc_epel_hX(int16_t *dst, ptrdiff_t dststride,
@@ -552,8 +590,7 @@  cglobal hevc_put_hevc_bi_pel_pixels%1_%2, 7, 7, 6, dst, dststride, src, srcstrid
 ;                       int16_t* mcbuffer)
 ; ******************************
 
-
-%macro HEVC_PUT_HEVC_EPEL 2
+%macro HEVC_PUT_HEVC_EPEL_H 2
 cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, height, mx, rfilter
 %assign %%stride ((%2 + 7)/8)
     EPEL_FILTER       %2, mx, m4, m5
@@ -563,7 +600,9 @@  cglobal hevc_put_hevc_epel_h%1_%2, 6, 7, 6, dst, dststride, src, srcstride, heig
     PEL_10STORE%1      dstq, m0, m1
     LOOP_END         dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_EPEL_H 2
 cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride, height, mx, rfilter
 %assign %%stride ((%2 + 7)/8)
     movdqa            m6, [pw_%2]
@@ -578,8 +617,11 @@  cglobal hevc_put_hevc_uni_epel_h%1_%2, 6, 7, 7, dst, dststride, src, srcstride,
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_BI_EPEL_H 2
 cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, src2, src2stride,height, mx, rfilter
+%assign %%stride ((%2 + 7)/8)
     movdqa            m6, [pw_bi_%2]
     EPEL_FILTER       %2, mx, m4, m5
 .loop
@@ -594,6 +636,45 @@  cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, s
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
+
+INIT_XMM sse2
+HEVC_PUT_HEVC_EPEL_H 2, 10
+HEVC_PUT_HEVC_EPEL_H 4, 10
+HEVC_PUT_HEVC_EPEL_H 6, 10
+HEVC_PUT_HEVC_EPEL_H 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_EPEL_H 2,  8
+HEVC_PUT_HEVC_EPEL_H 4,  8
+HEVC_PUT_HEVC_EPEL_H 6,  8
+HEVC_PUT_HEVC_EPEL_H 8,  8
+HEVC_PUT_HEVC_EPEL_H 12, 8
+HEVC_PUT_HEVC_EPEL_H 16, 8
+
+HEVC_PUT_HEVC_UNI_EPEL_H 4,  8
+HEVC_PUT_HEVC_UNI_EPEL_H 8,  8
+HEVC_PUT_HEVC_UNI_EPEL_H 12, 8
+HEVC_PUT_HEVC_UNI_EPEL_H 16, 8
+HEVC_PUT_HEVC_UNI_EPEL_H 2, 10
+HEVC_PUT_HEVC_UNI_EPEL_H 4, 10
+HEVC_PUT_HEVC_UNI_EPEL_H 6, 10
+HEVC_PUT_HEVC_UNI_EPEL_H 8, 10
+
+HEVC_PUT_HEVC_BI_EPEL_H 4,  8
+HEVC_PUT_HEVC_BI_EPEL_H 8,  8
+HEVC_PUT_HEVC_BI_EPEL_H 12, 8
+HEVC_PUT_HEVC_BI_EPEL_H 16, 8
+HEVC_PUT_HEVC_BI_EPEL_H 2, 10
+HEVC_PUT_HEVC_BI_EPEL_H 4, 10
+HEVC_PUT_HEVC_BI_EPEL_H 6, 10
+HEVC_PUT_HEVC_BI_EPEL_H 8, 10
+
+INIT_XMM sse4
+HEVC_PUT_HEVC_UNI_EPEL_H 2,  8
+HEVC_PUT_HEVC_UNI_EPEL_H 6,  8
+HEVC_PUT_HEVC_BI_EPEL_H 2,  8
+HEVC_PUT_HEVC_BI_EPEL_H 6,  8
 
 ; ******************************
 ; void put_hevc_epel_v(int16_t *dst, ptrdiff_t dststride,
@@ -602,6 +683,7 @@  cglobal hevc_put_hevc_bi_epel_h%1_%2, 8, 9, 7, dst, dststride, src, srcstride, s
 ;                      int16_t* mcbuffer)
 ; ******************************
 
+%macro HEVC_PUT_HEVC_EPEL_V 2
 cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, height, r3src, my, rfilter
     lea           r3srcq, [srcstrideq*3]
     sub             srcq, srcstrideq
@@ -612,7 +694,9 @@  cglobal hevc_put_hevc_epel_v%1_%2, 7, 8, 6, dst, dststride, src, srcstride, heig
     PEL_10STORE%1     dstq, m0, m1
     LOOP_END          dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_EPEL_V 2
 cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride, height, r3src, my, rfilter
     lea           r3srcq, [srcstrideq*3]
     movdqa            m6, [pw_%2]
@@ -628,8 +712,9 @@  cglobal hevc_put_hevc_uni_epel_v%1_%2, 7, 8, 7, dst, dststride, src, srcstride,
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
-
+%macro HEVC_PUT_HEVC_BI_EPEL_V 2
 cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride, src2, src2stride,height, r3src, my, rfilter
     lea           r3srcq, [srcstrideq*3]
     movdqa            m6, [pw_bi_%2]
@@ -649,6 +734,43 @@  cglobal hevc_put_hevc_bi_epel_v%1_%2, 9, 10, 7, dst, dststride, src, srcstride,
     RET
 %endmacro
 
+INIT_XMM sse2
+HEVC_PUT_HEVC_EPEL_V 2, 10
+HEVC_PUT_HEVC_EPEL_V 4, 10
+HEVC_PUT_HEVC_EPEL_V 6, 10
+HEVC_PUT_HEVC_EPEL_V 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_EPEL_V 2,  8
+HEVC_PUT_HEVC_EPEL_V 4,  8
+HEVC_PUT_HEVC_EPEL_V 6,  8
+HEVC_PUT_HEVC_EPEL_V 8,  8
+HEVC_PUT_HEVC_EPEL_V 12, 8
+HEVC_PUT_HEVC_EPEL_V 16, 8
+
+HEVC_PUT_HEVC_UNI_EPEL_V 4,  8
+HEVC_PUT_HEVC_UNI_EPEL_V 8,  8
+HEVC_PUT_HEVC_UNI_EPEL_V 12, 8
+HEVC_PUT_HEVC_UNI_EPEL_V 16, 8
+HEVC_PUT_HEVC_UNI_EPEL_V 2, 10
+HEVC_PUT_HEVC_UNI_EPEL_V 4, 10
+HEVC_PUT_HEVC_UNI_EPEL_V 6, 10
+HEVC_PUT_HEVC_UNI_EPEL_V 8, 10
+
+HEVC_PUT_HEVC_BI_EPEL_V 4,  8
+HEVC_PUT_HEVC_BI_EPEL_V 8,  8
+HEVC_PUT_HEVC_BI_EPEL_V 12, 8
+HEVC_PUT_HEVC_BI_EPEL_V 16, 8
+HEVC_PUT_HEVC_BI_EPEL_V 2, 10
+HEVC_PUT_HEVC_BI_EPEL_V 4, 10
+HEVC_PUT_HEVC_BI_EPEL_V 6, 10
+HEVC_PUT_HEVC_BI_EPEL_V 8, 10
+
+INIT_XMM sse4
+HEVC_PUT_HEVC_UNI_EPEL_V 2,  8
+HEVC_PUT_HEVC_UNI_EPEL_V 6,  8
+HEVC_PUT_HEVC_BI_EPEL_V 2,  8
+HEVC_PUT_HEVC_BI_EPEL_V 6,  8
 
 ; ******************************
 ; void put_hevc_epel_hv(int16_t *dst, ptrdiff_t dststride,
@@ -690,7 +812,9 @@  cglobal hevc_put_hevc_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, h
     movdqa            m6, m7
     LOOP_END         dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_EPEL_HV 2
 cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
 %assign %%stride ((%2 + 7)/8)
     sub             srcq, srcstrideq
@@ -728,8 +852,9 @@  cglobal hevc_put_hevc_uni_epel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstrid
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
-
+%macro HEVC_PUT_HEVC_BI_EPEL_HV 2
 cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
 %assign %%stride ((%2 + 7)/8)
     sub             srcq, srcstrideq
@@ -771,13 +896,45 @@  cglobal hevc_put_hevc_bi_epel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride
     RET
 %endmacro
 
+INIT_XMM sse2
+HEVC_PUT_HEVC_EPEL_HV 2, 10
+HEVC_PUT_HEVC_EPEL_HV 4, 10
+HEVC_PUT_HEVC_EPEL_HV 6, 10
+HEVC_PUT_HEVC_EPEL_HV 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_EPEL_HV 2,  8
+HEVC_PUT_HEVC_EPEL_HV 4,  8
+HEVC_PUT_HEVC_EPEL_HV 6,  8
+HEVC_PUT_HEVC_EPEL_HV 8,  8
+
+HEVC_PUT_HEVC_UNI_EPEL_HV 4,  8
+HEVC_PUT_HEVC_UNI_EPEL_HV 8,  8
+HEVC_PUT_HEVC_UNI_EPEL_HV 2, 10
+HEVC_PUT_HEVC_UNI_EPEL_HV 4, 10
+HEVC_PUT_HEVC_UNI_EPEL_HV 6, 10
+HEVC_PUT_HEVC_UNI_EPEL_HV 8, 10
+
+HEVC_PUT_HEVC_BI_EPEL_HV 4,  8
+HEVC_PUT_HEVC_BI_EPEL_HV 8,  8
+HEVC_PUT_HEVC_BI_EPEL_HV 2, 10
+HEVC_PUT_HEVC_BI_EPEL_HV 4, 10
+HEVC_PUT_HEVC_BI_EPEL_HV 6, 10
+HEVC_PUT_HEVC_BI_EPEL_HV 8, 10
+
+INIT_XMM sse4
+HEVC_PUT_HEVC_UNI_EPEL_HV 2,  8
+HEVC_PUT_HEVC_UNI_EPEL_HV 6,  8
+HEVC_PUT_HEVC_BI_EPEL_HV 2,  8
+HEVC_PUT_HEVC_BI_EPEL_HV 6,  8
+
 ; ******************************
 ; void put_hevc_qpel_hX_X_X(int16_t *dst, ptrdiff_t dststride,
 ;                       uint8_t *_src, ptrdiff_t _srcstride,
 ;                       int width, int height, int mx, int my)
 ; ******************************
 
-%macro HEVC_PUT_HEVC_QPEL 2
+%macro HEVC_PUT_HEVC_QPEL_H 2
 cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
     QPEL_FILTER       %2, mx
 .loop
@@ -789,7 +946,9 @@  cglobal hevc_put_hevc_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, he
     PEL_10STORE%1     dstq, m0, m1
     LOOP_END          dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_QPEL_H 2
 cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride, height, mx, rfilter
     movdqa            m9, [pw_%2]
     QPEL_FILTER       %2, mx
@@ -806,7 +965,9 @@  cglobal hevc_put_hevc_uni_qpel_h%1_%2, 6, 7, 15 , dst, dststride, src, srcstride
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_BI_QPEL_H 2
 cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride, src2, src2stride, height, mx, rfilter
     movdqa            m9, [pw_bi_%2]
     QPEL_FILTER       %2, mx
@@ -825,7 +986,31 @@  cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride,
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
+INIT_XMM sse2
+HEVC_PUT_HEVC_QPEL_H 4, 10
+HEVC_PUT_HEVC_QPEL_H 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_QPEL_H 4,  8
+HEVC_PUT_HEVC_QPEL_H 8,  8
+HEVC_PUT_HEVC_QPEL_H 12, 8
+HEVC_PUT_HEVC_QPEL_H 16, 8
+
+HEVC_PUT_HEVC_UNI_QPEL_H 4,  8
+HEVC_PUT_HEVC_UNI_QPEL_H 8,  8
+HEVC_PUT_HEVC_UNI_QPEL_H 12, 8
+HEVC_PUT_HEVC_UNI_QPEL_H 16, 8
+HEVC_PUT_HEVC_UNI_QPEL_H 4, 10
+HEVC_PUT_HEVC_UNI_QPEL_H 8, 10
+
+HEVC_PUT_HEVC_BI_QPEL_H 4,  8
+HEVC_PUT_HEVC_BI_QPEL_H 8,  8
+HEVC_PUT_HEVC_BI_QPEL_H 12, 8
+HEVC_PUT_HEVC_BI_QPEL_H 16, 8
+HEVC_PUT_HEVC_BI_QPEL_H 4, 10
+HEVC_PUT_HEVC_BI_QPEL_H 8, 10
 
 ; ******************************
 ; void put_hevc_qpel_vX_X_X(int16_t *dst, ptrdiff_t dststride,
@@ -833,6 +1018,7 @@  cglobal hevc_put_hevc_bi_qpel_h%1_%2, 8, 9, 16 , dst, dststride, src, srcstride,
 ;                       int width, int height, int mx, int my)
 ; ******************************
 
+%macro HEVC_PUT_HEVC_QPEL_V 2
 cglobal hevc_put_hevc_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, height, r3src, my, rfilter
     lea           r3srcq, [srcstrideq*3]
     QPEL_FILTER       %2, my
@@ -845,7 +1031,9 @@  cglobal hevc_put_hevc_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, h
     PEL_10STORE%1     dstq, m0, m1
     LOOP_END         dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_QPEL_V 2
 cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstride, height, r3src, my, rfilter
     movdqa            m9, [pw_%2]
     lea           r3srcq, [srcstrideq*3]
@@ -863,7 +1051,9 @@  cglobal hevc_put_hevc_uni_qpel_v%1_%2, 7, 14, 15 , dst, dststride, src, srcstrid
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_BI_QPEL_V 2
 cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 14, 16 , dst, dststride, src, srcstride, src2, src2stride, height, r3src, my, rfilter
     movdqa            m9, [pw_bi_%2]
     lea           r3srcq, [srcstrideq*3]
@@ -885,12 +1075,38 @@  cglobal hevc_put_hevc_bi_qpel_v%1_%2, 9, 14, 16 , dst, dststride, src, srcstride
     RET
 %endmacro
 
+INIT_XMM sse2
+HEVC_PUT_HEVC_QPEL_V 4, 10
+HEVC_PUT_HEVC_QPEL_V 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_QPEL_V 4,  8
+HEVC_PUT_HEVC_QPEL_V 8,  8
+HEVC_PUT_HEVC_QPEL_V 12, 8
+HEVC_PUT_HEVC_QPEL_V 16, 8
+
+HEVC_PUT_HEVC_UNI_QPEL_V 4,  8
+HEVC_PUT_HEVC_UNI_QPEL_V 8,  8
+HEVC_PUT_HEVC_UNI_QPEL_V 12, 8
+HEVC_PUT_HEVC_UNI_QPEL_V 16, 8
+
+HEVC_PUT_HEVC_BI_QPEL_V 4,  8
+HEVC_PUT_HEVC_BI_QPEL_V 8,  8
+HEVC_PUT_HEVC_BI_QPEL_V 12, 8
+HEVC_PUT_HEVC_BI_QPEL_V 16, 8
+HEVC_PUT_HEVC_BI_QPEL_V 4, 10
+HEVC_PUT_HEVC_BI_QPEL_V 8, 10
+
+INIT_XMM sse4
+HEVC_PUT_HEVC_UNI_QPEL_V 4, 10
+HEVC_PUT_HEVC_UNI_QPEL_V 8, 10
 
 ; ******************************
 ; void put_hevc_qpel_hvX_X(int16_t *dst, ptrdiff_t dststride,
 ;                       uint8_t *_src, ptrdiff_t _srcstride,
 ;                       int height, int mx, int my)
 ; ******************************
+
 %macro HEVC_PUT_HEVC_QPEL_HV 2
 cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
     lea              mxq, [mxq*8-8]
@@ -960,7 +1176,9 @@  cglobal hevc_put_hevc_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, h
 %endif
     LOOP_END         dst, dststride, src, srcstride
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_UNI_QPEL_HV 2
 cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstride, height, mx, my, r3src, rfilter
     lea              mxq, [mxq*8-8]
     lea              myq, [myq*8-8]
@@ -1034,7 +1252,9 @@  cglobal hevc_put_hevc_uni_qpel_hv%1_%2, 7, 9, 12 , dst, dststride, src, srcstrid
     dec          heightd                         ; cmp height
     jnz               .loop                      ; height loop
     RET
+%endmacro
 
+%macro HEVC_PUT_HEVC_BI_QPEL_HV 2
 cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride, src2, src2stride, height, mx, my, r3src, rfilter
     lea              mxq, [mxq*8-8]
     lea              myq, [myq*8-8]
@@ -1112,6 +1332,40 @@  cglobal hevc_put_hevc_bi_qpel_hv%1_%2, 9, 11, 16, dst, dststride, src, srcstride
     RET
 %endmacro
 
+INIT_XMM sse2
+HEVC_PUT_HEVC_QPEL_HV 2, 10
+HEVC_PUT_HEVC_QPEL_HV 4, 10
+HEVC_PUT_HEVC_QPEL_HV 6, 10
+HEVC_PUT_HEVC_QPEL_HV 8, 10
+
+INIT_XMM ssse3
+HEVC_PUT_HEVC_QPEL_HV 2, 8
+HEVC_PUT_HEVC_QPEL_HV 4, 8
+HEVC_PUT_HEVC_QPEL_HV 6, 8
+HEVC_PUT_HEVC_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_BI_QPEL_HV 4, 8
+HEVC_PUT_HEVC_BI_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_BI_QPEL_HV 2, 10
+HEVC_PUT_HEVC_BI_QPEL_HV 4, 10
+HEVC_PUT_HEVC_BI_QPEL_HV 6, 10
+HEVC_PUT_HEVC_BI_QPEL_HV 8, 10
+
+INIT_XMM sse4
+HEVC_PUT_HEVC_UNI_QPEL_HV 2, 8
+HEVC_PUT_HEVC_UNI_QPEL_HV 4, 8
+HEVC_PUT_HEVC_UNI_QPEL_HV 6, 8
+HEVC_PUT_HEVC_UNI_QPEL_HV 8, 8
+
+HEVC_PUT_HEVC_UNI_QPEL_HV 2, 10
+HEVC_PUT_HEVC_UNI_QPEL_HV 4, 10
+HEVC_PUT_HEVC_UNI_QPEL_HV 6, 10
+HEVC_PUT_HEVC_UNI_QPEL_HV 8, 10
+
+HEVC_PUT_HEVC_BI_QPEL_HV 2, 8
+HEVC_PUT_HEVC_BI_QPEL_HV 6, 8
+
 %macro WEIGHTING_FUNCS 2
 cglobal hevc_put_hevc_uni_w%1_%2, 8, 10, 11, dst, dststride, src, srcstride, height, denom, wx, ox, shift
     lea          shiftd, [denomd+14-%2]          ; shift = 14 - bitd + denom
@@ -1202,6 +1456,7 @@  cglobal hevc_put_hevc_bi_w%1_%2, 12, 14, 14, dst, dststride, src, srcstride, src
     RET
 %endmacro
 
+INIT_XMM sse4
 WEIGHTING_FUNCS 2, 8
 WEIGHTING_FUNCS 4, 8
 WEIGHTING_FUNCS 6, 8
@@ -1212,60 +1467,4 @@  WEIGHTING_FUNCS 4, 10
 WEIGHTING_FUNCS 6, 10
 WEIGHTING_FUNCS 8, 10
 
-HEVC_PUT_HEVC_PEL_PIXELS  2, 8
-HEVC_PUT_HEVC_PEL_PIXELS  4, 8
-HEVC_PUT_HEVC_PEL_PIXELS  6, 8
-HEVC_PUT_HEVC_PEL_PIXELS  8, 8
-HEVC_PUT_HEVC_PEL_PIXELS 12, 8
-HEVC_PUT_HEVC_PEL_PIXELS 16, 8
-
-HEVC_PUT_HEVC_PEL_PIXELS 2, 10
-HEVC_PUT_HEVC_PEL_PIXELS 4, 10
-HEVC_PUT_HEVC_PEL_PIXELS 6, 10
-HEVC_PUT_HEVC_PEL_PIXELS 8, 10
-
-
-HEVC_PUT_HEVC_EPEL 2,  8
-HEVC_PUT_HEVC_EPEL 4,  8
-HEVC_PUT_HEVC_EPEL 6,  8
-HEVC_PUT_HEVC_EPEL 8,  8
-HEVC_PUT_HEVC_EPEL 12, 8
-HEVC_PUT_HEVC_EPEL 16, 8
-
-
-HEVC_PUT_HEVC_EPEL 2, 10
-HEVC_PUT_HEVC_EPEL 4, 10
-HEVC_PUT_HEVC_EPEL 6, 10
-HEVC_PUT_HEVC_EPEL 8, 10
-
-
-HEVC_PUT_HEVC_EPEL_HV 2,  8
-HEVC_PUT_HEVC_EPEL_HV 4,  8
-HEVC_PUT_HEVC_EPEL_HV 6,  8
-HEVC_PUT_HEVC_EPEL_HV 8,  8
-
-HEVC_PUT_HEVC_EPEL_HV 2, 10
-HEVC_PUT_HEVC_EPEL_HV 4, 10
-HEVC_PUT_HEVC_EPEL_HV 6, 10
-HEVC_PUT_HEVC_EPEL_HV 8, 10
-
-
-HEVC_PUT_HEVC_QPEL 4,  8
-HEVC_PUT_HEVC_QPEL 8,  8
-HEVC_PUT_HEVC_QPEL 12, 8
-HEVC_PUT_HEVC_QPEL 16, 8
-
-HEVC_PUT_HEVC_QPEL 4, 10
-HEVC_PUT_HEVC_QPEL 8, 10
-
-HEVC_PUT_HEVC_QPEL_HV 2, 8
-HEVC_PUT_HEVC_QPEL_HV 4, 8
-HEVC_PUT_HEVC_QPEL_HV 6, 8
-HEVC_PUT_HEVC_QPEL_HV 8, 8
-
-HEVC_PUT_HEVC_QPEL_HV 2, 10
-HEVC_PUT_HEVC_QPEL_HV 4, 10
-HEVC_PUT_HEVC_QPEL_HV 6, 10
-HEVC_PUT_HEVC_QPEL_HV 8, 10
-
 %endif ; ARCH_X86_64