[x264-devel] x86: faster AVX satd/sa8d/sa8d_satd/hadamard_ac

Wed Feb 27 00:18:05 CET 2013

x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Tue Feb  5 01:23:23 2013 -0800| [9fe40b1e0db6cd93652e3a45dbbd8f24dbe0b70e] | committer: Jason Garrett-Glaser

x86: faster AVX satd/sa8d/sa8d_satd/hadamard_ac

Use Conroe-style movddup loads (LOAD_DUP_4x8P_CONROE) in the AVX transforms;
both Sandy Bridge and Bulldozer execute movddup in the load unit, so the duplication is effectively free.

On Sandy Bridge:
~6% faster sa8d_satd
~5% faster hadamard_ac
~9% faster 32-bit satd
~2% faster sa8d

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9fe40b1e0db6cd93652e3a45dbbd8f24dbe0b70e
---

 common/x86/pixel-a.asm |    7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index f7dc76e..97adec5 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1431,7 +1431,9 @@ cglobal pixel_satd_8x8_internal
     SATD_8x4_SSE vertical, 0, 1, 2, 3, 4, 5, 6
     ret
 
-%if HIGH_BIT_DEPTH == 0 && UNIX64 ; 16x8 regresses on phenom win64, 16x16 is almost the same
+; 16x8 regresses on phenom win64, 16x16 is almost the same (too many spilled registers)
+; These aren't any faster on AVX systems with fast movddup (Bulldozer, Sandy Bridge)
+%if HIGH_BIT_DEPTH == 0 && UNIX64 && notcpuflag(avx)
 cglobal pixel_satd_16x4_internal
     LOAD_SUMSUB_16x4P 0, 1, 2, 3, 4, 8, 5, 9, 6, 7, r0, r2, 11
     lea  r2, [r2+4*r3]
@@ -4032,6 +4034,9 @@ INTRA_X9
 INTRA8_X9
 %endif
 
+; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
+; it's effectively free.
+%define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
 INIT_XMM avx
 SATDS_SSE2
 SA8D