[x265] [PATCH] asm: psyCost_ss_16x16 in sse4: improve 31052c->9946c
Divya Manivannan
divya at multicorewareinc.com
Mon Jan 19 06:26:56 CET 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1421645184 -19800
# Mon Jan 19 10:56:24 2015 +0530
# Node ID 298735b1907d5f4044724f6028ba361d1a1baf50
# Parent bbc333bd4a6207c72c682b3ea88794c67996aa83
asm: psyCost_ss_16x16 in sse4: improve 31052c->9946c
diff -r bbc333bd4a62 -r 298735b1907d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 19 09:59:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 19 10:56:24 2015 +0530
@@ -932,6 +932,7 @@
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_sse4;
+ p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_sse4;
#endif
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
}
@@ -1184,6 +1185,7 @@
#if X86_64
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_sse4;
+ p.cu[BLOCK_16x16].psy_cost_ss = x265_psyCost_ss_16x16_sse4;
#endif
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
}
diff -r bbc333bd4a62 -r 298735b1907d source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Mon Jan 19 09:59:33 2015 +0530
+++ b/source/common/x86/pixel-a.asm Mon Jan 19 10:56:24 2015 +0530
@@ -8305,3 +8305,601 @@
movd eax, m0
RET
%endif
+
+%macro psy_cost_ss 0 ; one 8x8 tile of 16-bit samples at r0 (stride r1) and at r2 (stride r3): adds |result(r0) - result(r2)| to m15; reads r4, r6, m13, m14; clobbers r5, m0-m12
+ movu m0, [r0] ; rows 0-3 of the r0 tile (r4 = 3 * r1, set up by the caller)
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+ lea r5, [r0 + r1 * 4] ; r5 -> row 4
+ movu m4, [r5] ; rows 4-7
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ pabsw m8, m0 ; per-word absolute values, accumulated with paddw over all 8 rows
+ pabsw m9, m1
+ paddw m8, m9
+ pabsw m10, m2
+ pabsw m11, m3
+ paddw m10, m11
+ paddw m8, m10
+ pabsw m9, m4
+ pabsw m10, m5
+ paddw m9, m10
+ pabsw m11, m6
+ pabsw m12, m7 ; m12 is still free to use as scratch in this first tile
+ paddw m11, m12
+ paddw m9, m11
+ paddw m8, m9 ; m8 = 8 word-sized column sums
+ movhlps m9, m8 ; widen the word sums to dwords, then reduce horizontally
+ pmovzxwd m8, m8
+ pmovzxwd m9, m9
+ paddd m8, m9
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2 ; low dword of m8 = (sum of |sample|) >> 2
+
+ pmaddwd m0, m13 ; first pass: multiply-add word pairs with m13 (caller loads [hmul_w]; presumably +1/-1 words, i.e. pair differences - TODO confirm)
+ pmaddwd m1, m13
+ pmaddwd m2, m13
+ pmaddwd m3, m13
+
+ psrldq m9, m0, 4 ; butterfly step on row 0: m9 = m0 shifted down by one dword
+ psubd m10, m0, m9 ; m10 = differences of neighbouring dwords
+ paddd m0, m9 ; m0 = sums of neighbouring dwords
+ shufps m0, m10, 10001000b ; low half = sum dwords 0 and 2, high half = difference dwords 0 and 2
+ psrldq m9, m0, 4 ; second butterfly step, same pattern
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4 ; same two steps for row 1
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4 ; row 2
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4 ; row 3
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9 ; SUMSUB_BA is defined outside this chunk; presumably an in-place dword sum/difference of the two registers with m9 as scratch - TODO confirm
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ pmaddwd m4, m13 ; rows 4-7: same pmaddwd and butterfly steps
+ pmaddwd m5, m13
+ pmaddwd m6, m13
+ pmaddwd m7, m13
+
+ psrldq m9, m4, 4 ; row 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4 ; row 5
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4 ; row 6
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4 ; row 7
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9 ; combine the rows 0-3 registers with the rows 4-7 registers
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0 ; absolute value of every dword in all 8 registers
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2 ; add the 8 registers together lane by lane
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m11, m0, m7 ; m11 = per-lane partial sums of the m13 pass
+
+ movu m0, [r0] ; second pass: reload rows 0-3 of the r0 tile
+ movu m1, [r0 + r1]
+ movu m2, [r0 + r1 * 2]
+ movu m3, [r0 + r4]
+
+ pmaddwd m0, m14 ; same pipeline with m14 (caller loads [pw_1]; presumably all-ones words, i.e. pair sums - TODO confirm)
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pmaddwd m3, m14
+
+ psrldq m9, m0, 4 ; butterfly steps, row 0
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4 ; row 1
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4 ; row 2
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4 ; row 3
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ movu m4, [r5] ; reload rows 4-7 (r5 still points at row 4 of the r0 tile)
+ movu m5, [r5 + r1]
+ movu m6, [r5 + r1 * 2]
+ movu m7, [r5 + r4]
+
+ pmaddwd m4, m14
+ pmaddwd m5, m14
+ pmaddwd m6, m14
+ pmaddwd m7, m14
+
+ psrldq m9, m4, 4 ; row 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4 ; row 5
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4 ; row 6
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4 ; row 7
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m0, m7
+ paddd m0, m11 ; add the partial sums saved from the m13 pass
+
+ movhlps m1, m0 ; horizontal reduction into the low dword
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ paddd m0, [pd_2] ; add [pd_2] before the shift (presumably a rounding term of 2; constant defined elsewhere - TODO confirm)
+ psrld m0, 2
+ psubd m12, m0, m8 ; m12 = r0-tile result: shifted butterfly sum minus the shifted absolute sum in m8
+
+ movu m0, [r2] ; second tile: same computation on r2 with stride r3 (r6 = 3 * r3, set up by the caller)
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r6]
+ lea r5, [r2 + r3 * 4] ; r5 -> row 4 of the r2 tile
+ movu m4, [r5]
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r6]
+
+ pabsw m8, m0 ; absolute sum of the r2 tile
+ pabsw m9, m1
+ paddw m8, m9
+ pabsw m10, m2
+ pabsw m11, m3
+ paddw m10, m11
+ paddw m8, m10
+ pabsw m9, m4
+ pabsw m10, m5
+ paddw m9, m10
+ pabsw m11, m6
+ pabsw m10, m7 ; m10 is the scratch register here: m12 holds the r0-tile result
+ paddw m11, m10
+ paddw m9, m11
+ paddw m8, m9
+ movhlps m9, m8 ; widen to dwords and reduce horizontally
+ pmovzxwd m8, m8
+ pmovzxwd m9, m9
+ paddd m8, m9
+ movhlps m9, m8
+ paddd m8, m9
+ psrldq m9, m8, 4
+ paddd m8, m9
+ psrld m8, 2 ; low dword of m8 = (sum of |sample|) >> 2 for the r2 tile
+
+ pmaddwd m0, m13 ; first pass (m13) on rows 0-3 of the r2 tile
+ pmaddwd m1, m13
+ pmaddwd m2, m13
+ pmaddwd m3, m13
+
+ psrldq m9, m0, 4 ; row 0
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4 ; row 1
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4 ; row 2
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4 ; row 3
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ pmaddwd m4, m13 ; first pass (m13) on rows 4-7
+ pmaddwd m5, m13
+ pmaddwd m6, m13
+ pmaddwd m7, m13
+
+ psrldq m9, m4, 4 ; row 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4 ; row 5
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4 ; row 6
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4 ; row 7
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m11, m0, m7 ; m11 = per-lane partial sums of the m13 pass (r2 tile)
+
+ movu m0, [r2] ; second pass: reload rows 0-3 of the r2 tile
+ movu m1, [r2 + r3]
+ movu m2, [r2 + r3 * 2]
+ movu m3, [r2 + r6]
+
+ pmaddwd m0, m14 ; second pass (m14) on rows 0-3
+ pmaddwd m1, m14
+ pmaddwd m2, m14
+ pmaddwd m3, m14
+
+ psrldq m9, m0, 4 ; row 0
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+ psrldq m9, m0, 4
+ psubd m10, m0, m9
+ paddd m0, m9
+ shufps m0, m10, 10001000b
+
+ psrldq m9, m1, 4 ; row 1
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+ psrldq m9, m1, 4
+ psubd m10, m1, m9
+ paddd m1, m9
+ shufps m1, m10, 10001000b
+
+ psrldq m9, m2, 4 ; row 2
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+ psrldq m9, m2, 4
+ psubd m10, m2, m9
+ paddd m2, m9
+ shufps m2, m10, 10001000b
+
+ psrldq m9, m3, 4 ; row 3
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+ psrldq m9, m3, 4
+ psubd m10, m3, m9
+ paddd m3, m9
+ shufps m3, m10, 10001000b
+
+ SUMSUB_BA d, 0, 1, 9
+ SUMSUB_BA d, 2, 3, 9
+ SUMSUB_BA d, 0, 2, 9
+ SUMSUB_BA d, 1, 3, 9
+
+ movu m4, [r5] ; reload rows 4-7 (r5 still points at row 4 of the r2 tile)
+ movu m5, [r5 + r3]
+ movu m6, [r5 + r3 * 2]
+ movu m7, [r5 + r6]
+
+ pmaddwd m4, m14
+ pmaddwd m5, m14
+ pmaddwd m6, m14
+ pmaddwd m7, m14
+
+ psrldq m9, m4, 4 ; row 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+ psrldq m9, m4, 4
+ psubd m10, m4, m9
+ paddd m4, m9
+ shufps m4, m10, 10001000b
+
+ psrldq m9, m5, 4 ; row 5
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+ psrldq m9, m5, 4
+ psubd m10, m5, m9
+ paddd m5, m9
+ shufps m5, m10, 10001000b
+
+ psrldq m9, m6, 4 ; row 6
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+ psrldq m9, m6, 4
+ psubd m10, m6, m9
+ paddd m6, m9
+ shufps m6, m10, 10001000b
+
+ psrldq m9, m7, 4 ; row 7
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+ psrldq m9, m7, 4
+ psubd m10, m7, m9
+ paddd m7, m9
+ shufps m7, m10, 10001000b
+
+ SUMSUB_BA d, 4, 5, 9
+ SUMSUB_BA d, 6, 7, 9
+ SUMSUB_BA d, 4, 6, 9
+ SUMSUB_BA d, 5, 7, 9
+
+ SUMSUB_BA d, 0, 4, 9
+ SUMSUB_BA d, 1, 5, 9
+ SUMSUB_BA d, 2, 6, 9
+ SUMSUB_BA d, 3, 7, 9
+
+ pabsd m0, m0
+ pabsd m2, m2
+ pabsd m1, m1
+ pabsd m3, m3
+ pabsd m4, m4
+ pabsd m5, m5
+ pabsd m6, m6
+ pabsd m7, m7
+
+ paddd m0, m2
+ paddd m1, m3
+ paddd m0, m1
+ paddd m5, m4
+ paddd m0, m5
+ paddd m7, m6
+ paddd m0, m7
+ paddd m0, m11 ; add the partial sums saved from the m13 pass
+
+ movhlps m1, m0 ; horizontal reduction into the low dword
+ paddd m0, m1
+ psrldq m1, m0, 4
+ paddd m0, m1
+ paddd m0, [pd_2]
+ psrld m0, 2
+ psubd m0, m8 ; m0 = r2-tile result, same formula as the r0 tile
+
+ psubd m12, m0 ; difference between the r0-tile and r2-tile results
+ pabsd m0, m12 ; absolute value of that difference
+ paddd m15, m0 ; accumulate into the running total in m15
+%endmacro
+
+%if ARCH_X86_64 ; the body uses m8-m15 and r7/r8
+INIT_XMM sse4
+cglobal psyCost_ss_16x16, 4, 9, 16 ; args per the C prototype: r0 = source, r1 = sstride, r2 = recon, r3 = rstride; returns the total in eax
+
+ mova m13, [hmul_w] ; m13 = constant for the first pass of psy_cost_ss
+ mova m14, [pw_1] ; m14 = constant for the second pass of psy_cost_ss
+ add r1, r1 ; double both strides (16-bit samples: presumably element stride -> byte stride)
+ add r3, r3
+ lea r4, [3 * r1] ; r4 = 3 * source stride, used by the macro for rows 3 and 7
+ lea r6, [3 * r3] ; r6 = 3 * recon stride
+ pxor m15, m15 ; m15 = running total, starts at zero
+ mov r7d, 2 ; outer loop: 2 rows of 8x8 tiles
+.loopH:
+ mov r8d, 2 ; inner loop: 2 tiles per row
+.loopW:
+ psy_cost_ss ; process one 8x8 tile pair, accumulating into m15
+ add r0, 16 ; step 16 bytes (8 samples) right to the next tile
+ add r2, 16
+ dec r8d
+ jnz .loopW
+ lea r0, [r0 + r1 * 8 - 32] ; down 8 rows and back 32 bytes to the left edge
+ lea r2, [r2 + r3 * 8 - 32]
+ dec r7d
+ jnz .loopH
+ movd eax, m15 ; return the low dword of the total
+ RET
+%endif
diff -r bbc333bd4a62 -r 298735b1907d source/common/x86/pixel.h
--- a/source/common/x86/pixel.h Mon Jan 19 09:59:33 2015 +0530
+++ b/source/common/x86/pixel.h Mon Jan 19 10:56:24 2015 +0530
@@ -225,6 +225,7 @@
int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
int x265_psyCost_ss_8x8_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+int x265_psyCost_ss_16x16_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
#undef DECL_PIXELS
#undef DECL_HEVC_SSD
More information about the x265-devel
mailing list