[x265] [PATCH] asm: sse4 10bit code for sign primitive
rajesh at multicorewareinc.com
rajesh at multicorewareinc.com
Thu Jun 25 13:15:10 CEST 2015
# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435229751 -19800
# Thu Jun 25 16:25:51 2015 +0530
# Node ID a0de1e88f3b1a10d6f8cf656a95e6ec37e1bc134
# Parent b1af4c36f48a4500a4912373ebcda9a5540b5c15
asm: sse4 10bit code for sign primitive
calSign 6.16x 356.91 2197.63
diff -r b1af4c36f48a -r a0de1e88f3b1 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Jun 25 16:25:51 2015 +0530
@@ -1097,6 +1097,7 @@
p.saoCuOrgE3[0] = PFX(saoCuOrgE3_sse4);
p.saoCuOrgE3[1] = PFX(saoCuOrgE3_sse4);
p.saoCuOrgB0 = PFX(saoCuOrgB0_sse4);
+ p.sign = PFX(calSign_sse4);
LUMA_ADDAVG(sse4);
CHROMA_420_ADDAVG(sse4);
diff -r b1af4c36f48a -r a0de1e88f3b1 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Jun 24 10:36:15 2015 -0500
+++ b/source/common/x86/loopfilter.asm Thu Jun 25 16:25:51 2015 +0530
@@ -40,6 +40,7 @@
cextern pw_2
cextern pw_1023
cextern pb_movemask
+cextern pw_1
;============================================================================================================
@@ -1321,6 +1322,48 @@
; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
;============================================================================================================
INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal calSign, 4, 7, 5 ; r0 = dst, r1 = src1, r2 = src2, r3d = width; writes dst[x] = sign(src1[x] - src2[x]) as int8
+ mova m0, [pw_1] ; m0 = AND mask for the "greater" result (pw_1 is defined elsewhere; presumably 1 in every word lane)
+ mov r4d, r3d ; r4 = width, kept for the tail fix-up (32-bit write zero-extends on x86-64)
+ shr r3d, 4
+ add r3d, 1 ; r3d = width / 16 + 1 iterations of 16 pixels, so the last iteration always runs past width
+ mov r5, r0 ; r5 = original dst, used after the loop to measure how far r0 advanced
+ movu m4, [r0 + r4] ; save the 16 bytes starting at dst[width]; the overrunning loop may overwrite them
+.loop:
+ movu m1, [r1] ; m1 = src1[x .. x+7] (16-bit pixels)
+ movu m2, [r2] ; m2 = src2[x .. x+7]
+
+ pcmpgtw m3, m1, m2 ; m3 = all-ones in lanes where src1 > src2 (signed word compare)
+ pcmpgtw m2, m1 ; m2 = all-ones (-1) in lanes where src2 > src1
+ pand m3, m0 ; reduce the "greater" mask with pw_1 (to +1 if pw_1 is words of 1)
+ por m3, m2 ; merge: the two masks are mutually exclusive, equal lanes stay 0
+ packsswb m3, m3 ; narrow the 8 words to signed bytes (duplicated in both halves)
+ movh [r0], xm3 ; store 8 result bytes to dst[x .. x+7]
+
+ movu m1, [r1 + 16] ; m1 = src1[x+8 .. x+15]
+ movu m2, [r2 + 16] ; m2 = src2[x+8 .. x+15]
+
+ pcmpgtw m3, m1, m2 ; same sign computation as above for the second 8 pixels
+ pcmpgtw m2, m1
+ pand m3, m0
+ por m3, m2
+ packsswb m3, m3
+ movh [r0 + 8], xm3 ; store 8 result bytes to dst[x+8 .. x+15]
+
+ add r0, 16 ; 16 output bytes per iteration
+ add r1, 32 ; 16 input pixels * 2 bytes each
+ add r2, 32
+ dec r3d
+ jnz .loop
+
+ mov r6, r0
+ sub r6, r5 ; r6 = total bytes the loop wrote to dst
+ sub r4, r6 ; r4 = width - bytes written (negative: the loop always overshoots)
+ movu [r0 + r4], m4 ; r0 + r4 = dst + width: restore the bytes saved before the loop
+ RET
+%else ; HIGH_BIT_DEPTH
+
cglobal calSign, 4,5,6
mova m0, [pb_128]
mova m1, [pb_1]
@@ -1369,6 +1412,7 @@
.end:
RET
+%endif
INIT_YMM avx2
cglobal calSign, 4, 5, 6
More information about the x265-devel
mailing list