[x265] [PATCH 1 of 2] asm: Replace the MMX version of pixel_avg_w8 with SSE2; the MMX version is slower on the Skylake platform
Min Chen
chenm003 at 163.com
Fri Oct 16 01:46:40 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444947352 18000
# Node ID 25d14acf30a0a9d17daea890e0096170ae876f1a
# Parent fe65544b6c40d7cd62c2b86275bf98b264b6edb0
asm: Replace MMX version of pixel_avg_w8 by SSE2, the MMX is slower on Skylake platform
---
source/common/x86/mc-a.asm | 42 ++++++++++++++++++++++++++++++++++++------
1 files changed, 36 insertions(+), 6 deletions(-)
diff -r fe65544b6c40 -r 25d14acf30a0 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Wed Oct 07 13:42:41 2015 +0530
+++ b/source/common/x86/mc-a.asm Thu Oct 15 17:15:52 2015 -0500
@@ -3990,8 +3990,12 @@
test dword r4m, 15
jz pixel_avg_w%1_sse2
%endif
+%if (%1 == 8)
+ jmp pixel_avg_w8_unaligned_sse2
+%else
jmp pixel_avg_w%1_mmx2
%endif
+%endif
%endmacro
;-----------------------------------------------------------------------------
@@ -4050,6 +4054,32 @@
lea r4, [r4 + 4 * r5]
%endmacro
+INIT_XMM sse2 ; the function below is reached elsewhere in this patch as pixel_avg_w8_unaligned_sse2
+cglobal pixel_avg_w8_unaligned ; width-8 pixel average of two sources using unaligned loads/stores; two rows per loop pass
+ AVG_START ; macro defined outside this view; presumably sets up t0..t5 (dst/stride, src0/stride, src1/stride) -- confirm
+.height_loop: ; one pass averages two rows: the row at each base pointer and the row one stride further
+%if HIGH_BIT_DEPTH ; word-sized pixels: each row is one full 16-byte register
+ ; NO TEST BRANCH! (presumably: this high-bit-depth path has not been tested -- confirm with author)
+ movu m0, [t2] ; row 0 from t2 (16 bytes = 8 pixels at 16 bits each, matching pavgw below)
+ movu m1, [t2+SIZEOF_PIXEL*t3] ; row 1 from t2, one t3-pixel stride further
+ movu m2, [t4] ; row 0 from t4
+ movu m3, [t4+SIZEOF_PIXEL*t5] ; row 1 from t4, one t5-pixel stride further
+ pavgw m0, m2 ; per-word rounded unsigned average, row 0
+ pavgw m1, m3 ; per-word rounded unsigned average, row 1
+ movu [t0], m0 ; store row 0 at t0
+ movu [t0+SIZEOF_PIXEL*t1], m1 ; store row 1 at t0, one t1-pixel stride further
+%else ;!HIGH_BIT_DEPTH -- byte-sized pixels: two 8-byte rows are packed into one register
+ movq m0, [t2] ; low 8 bytes of m0 = row 0 from t2
+ movhps m0, [t2+SIZEOF_PIXEL*t3] ; high 8 bytes of m0 = row 1 from t2
+ movq m1, [t4] ; low 8 bytes of m1 = row 0 from t4
+ movhps m1, [t4+SIZEOF_PIXEL*t5] ; high 8 bytes of m1 = row 1 from t4
+ pavgb m0, m1 ; per-byte rounded unsigned average of both rows at once
+ movq [t0], m0 ; store low 8 bytes (row 0) at t0
+ movhps [t0+SIZEOF_PIXEL*t1], m0 ; store high 8 bytes (row 1) one t1-pixel stride further
+%endif
+ AVG_END ; macro defined outside this view; presumably advances the pointers, loops to .height_loop and returns -- confirm
+
+
;-------------------------------------------------------------------------------------------------------------------------------
;void pixelavg_pp(pixel dst, intptr_t dstride, const pixel src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int)
;-------------------------------------------------------------------------------------------------------------------------------
@@ -4116,11 +4146,11 @@
AVGH 4, 4
AVGH 4, 2
-AVG_FUNC 8, movq, movq
-AVGH 8, 32
-AVGH 8, 16
-AVGH 8, 8
-AVGH 8, 4
+;AVG_FUNC 8, movq, movq
+;AVGH 8, 32
+;AVGH 8, 16
+;AVGH 8, 8
+;AVGH 8, 4
AVG_FUNC 16, movq, movq
AVGH 16, 64
@@ -4198,7 +4228,7 @@
AVGH 4, 4
AVGH 4, 2
-AVG_FUNC 8, movq, movq
+;AVG_FUNC 8, movq, movq
AVGH 8, 32
AVGH 8, 16
AVGH 8, 8
More information about the x265-devel
mailing list