[x265] [PATCH 1 of 2] asm: Replace MMX version of pixel_avg_w8 by SSE2, the MMX is slower on Skylake platform

Fri Oct 16 01:46:40 CEST 2015

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444947352 18000
# Node ID 25d14acf30a0a9d17daea890e0096170ae876f1a
# Parent  fe65544b6c40d7cd62c2b86275bf98b264b6edb0
asm: Replace MMX version of pixel_avg_w8 by SSE2, the MMX is slower on Skylake platform
---
 source/common/x86/mc-a.asm |   42 ++++++++++++++++++++++++++++++++++++------
 1 files changed, 36 insertions(+), 6 deletions(-)

diff -r fe65544b6c40 -r 25d14acf30a0 source/common/x86/mc-a.asm

--- a/source/common/x86/mc-a.asm	Wed Oct 07 13:42:41 2015 +0530
+++ b/source/common/x86/mc-a.asm	Thu Oct 15 17:15:52 2015 -0500
@@ -3990,8 +3990,12 @@
     test dword r4m, 15
     jz pixel_avg_w%1_sse2
 %endif
+%if (%1 == 8)
+    jmp pixel_avg_w8_unaligned_sse2
+%else
     jmp pixel_avg_w%1_mmx2
 %endif
+%endif
 %endmacro
 
 ;-----------------------------------------------------------------------------
@@ -4050,6 +4054,32 @@
     lea     r4, [r4 + 4 * r5]
 %endmacro
 
+INIT_XMM sse2
+cglobal pixel_avg_w8_unaligned
+    AVG_START                           ; shared prologue (defined elsewhere); presumably sets t0/t1 = dst/stride, t2/t3 = src0/stride, t4/t5 = src1/stride - confirm
+.height_loop:                           ; one pass averages two 8-pixel rows (row 0 and row +stride) of both sources into dst
+%if HIGH_BIT_DEPTH
+    ; NOTE(review): original remark was "NO TEST BRANCH!" - presumably this HIGH_BIT_DEPTH path is untested; confirm before relying on it
+    movu    m0, [t2]                    ; first source, row 0 (unaligned full-register load)
+    movu    m1, [t2+SIZEOF_PIXEL*t3]    ; first source, row 1 (stride scaled by SIZEOF_PIXEL)
+    movu    m2, [t4]                    ; second source, row 0
+    movu    m3, [t4+SIZEOF_PIXEL*t5]    ; second source, row 1
+    pavgw   m0, m2                      ; per-word rounded average of row 0
+    pavgw   m1, m3                      ; per-word rounded average of row 1
+    movu    [t0], m0                    ; store averaged row 0
+    movu    [t0+SIZEOF_PIXEL*t1], m1    ; store averaged row 1
+%else ;!HIGH_BIT_DEPTH
+    movq    m0, [t2]                    ; low 8 bytes  = first source, row 0
+    movhps  m0, [t2+SIZEOF_PIXEL*t3]    ; high 8 bytes = first source, row 1
+    movq    m1, [t4]                    ; low 8 bytes  = second source, row 0
+    movhps  m1, [t4+SIZEOF_PIXEL*t5]    ; high 8 bytes = second source, row 1
+    pavgb   m0, m1                      ; per-byte rounded average of both rows with a single instruction
+    movq    [t0], m0                    ; store low half  -> row 0
+    movhps  [t0+SIZEOF_PIXEL*t1], m0    ; store high half -> row 1
+%endif
+    AVG_END                             ; shared epilogue (defined elsewhere); presumably advances the pointers and loops to .height_loop - confirm
+
+
 ;-------------------------------------------------------------------------------------------------------------------------------
 ;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
 ;-------------------------------------------------------------------------------------------------------------------------------
@@ -4116,11 +4146,11 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
-AVGH 8, 32
-AVGH 8, 16
-AVGH 8,  8
-AVGH 8,  4
+;AVG_FUNC 8, movq, movq
+;AVGH 8, 32
+;AVGH 8, 16
+;AVGH 8,  8
+;AVGH 8,  4
 
 AVG_FUNC 16, movq, movq
 AVGH 16, 64
@@ -4198,7 +4228,7 @@
 AVGH 4, 4
 AVGH 4, 2
 
-AVG_FUNC 8, movq, movq
+;AVG_FUNC 8, movq, movq
 AVGH 8, 32
 AVGH 8, 16
 AVGH 8,  8