[x264-devel] x86: faster AVX satd/sa8d/sa8d_satd/hadamard_ac
Jason Garrett-Glaser
git at videolan.org
Wed Feb 27 00:18:05 CET 2013
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Feb 5 01:23:23 2013 -0800| [9fe40b1e0db6cd93652e3a45dbbd8f24dbe0b70e] | committer: Jason Garrett-Glaser
x86: faster AVX satd/sa8d/sa8d_satd/hadamard_ac
Use Conroe-style movddup in AVX transforms; both Sandy Bridge and Bulldozer
do movddup in the load unit, so it's totally free this way.
On Sandy Bridge:
~6% faster sa8d_satd
~5% faster hadamard_ac
~9% faster 32-bit satd
~2% faster sa8d
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9fe40b1e0db6cd93652e3a45dbbd8f24dbe0b70e
---
common/x86/pixel-a.asm | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index f7dc76e..97adec5 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1431,7 +1431,9 @@ cglobal pixel_satd_8x8_internal
SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
ret
-%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
+; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
+%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
cglobal pixel_satd_16x4_internal
LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
lea r2, [r2+4*r3]
@@ -4032,6 +4034,9 @@ INTRA_X9
INTRA8_X9
%endif
+; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
+; it's effectively free.
+%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
INIT_XMM avx
SATDS_SSE2
SA8D
More information about the x264-devel mailing list