[x265] [PATCH] asm: psyCost_pp_8x8 for HIGH_BIT_DEPTH in sse4: improve 6995c->1070c
Divya Manivannan
divya at multicorewareinc.com
Tue Dec 30 12:49:13 CET 2014
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1419940084 -19800
# Tue Dec 30 17:18:04 2014 +0530
# Node ID 259bf78bae8c6bd9bfefa8e8575de8620d20971c
# Parent f15a798c41f69a053b1694399230b38eec8cb1a5
asm: psyCost_pp_8x8 for HIGH_BIT_DEPTH in sse4: improve 6995c->1070c
diff -r f15a798c41f6 -r 259bf78bae8c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 30 14:35:08 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 30 17:18:04 2014 +0530
@@ -1436,6 +1436,9 @@
INTRA_ANG_SSE4_HIGH(sse4);
p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
+#if X86_64
+ p.psy_cost_pp[BLOCK_8x8] = x265_psyCost_pp_8x8_sse4;
+#endif
}
if (cpuMask & X265_CPU_XOP)
{
diff -r f15a798c41f6 -r 259bf78bae8c source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Dec 30 14:35:08 2014 +0530
+++ b/source/common/x86/pixel-a.asm Tue Dec 30 17:18:04 2014 +0530
@@ -6748,7 +6748,83 @@
INIT_XMM sse4
cglobal psyCost_pp_8x8, 4, 6, 13
- FIX_STRIDES r1, r3
+%if HIGH_BIT_DEPTH
+ FIX_STRIDES r1, r3
+ lea r4, [3 * r1]
+ pxor m10, m10
+ movu m0, [r0]
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, [pw_1]
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m10, m0, m8
+
+ lea r4, [3 * r3]
+ movu m0, [r2]
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r4]
+ lea r5, [r2 + r3 * 4]
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r4]
+
+ paddw m8, m0, m1
+ paddw m8, m2
+ paddw m8, m3
+ paddw m8, m4
+ paddw m8, m5
+ paddw m8, m6
+ paddw m8, m7
+ pmaddwd m8, [pw_1]
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2
+
+ HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+ paddd m0, m1
+ paddd m0, m2
+ paddd m0, m3
+ HADDUW m0, m1
+ paddd m0, [pd_1]
+ psrld m0, 1
+ psubd m0, m8
+ psubd m10, m0
+ pabsd m0, m10
+ movd eax, m0
+
+%else ; !HIGH_BIT_DEPTH
lea r4, [3 * r1]
mova m8, [hmul_8p]
@@ -6842,6 +6918,6 @@
psubd m12, m0
pabsd m0, m12
movd eax, m0
-
+%endif ; HIGH_BIT_DEPTH
RET
%endif
More information about the x265-devel mailing list