[x264-devel] AVX 32-bit hpel_filter_h
Jason Garrett-Glaser
git at videolan.org
Sat Feb 4 21:10:50 CET 2012
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Mon Jan 23 15:09:38 2012 -0800 | [14dc11f7c52fa29576e0003c8c16857a78bf5fbf] | committer: Jason Garrett-Glaser
AVX 32-bit hpel_filter_h
Faster on Sandy Bridge.
Also add details on unsuccessful optimizations in these functions.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=14dc11f7c52fa29576e0003c8c16857a78bf5fbf
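
For context, hpel_filter_h computes the horizontal half of H.264's 6-tap [1 -5 20 20 -5 1] half-pel interpolation. A rough scalar C sketch of the operation the SSSE3/AVX code below vectorizes (illustrative only, not the code touched by this patch; the function name and clamp helper are assumptions):

    #include <stdint.h>

    static inline uint8_t clip_uint8( int x )
    {
        return x < 0 ? 0 : x > 255 ? 255 : x;
    }

    /* One row: 6-tap [1 -5 20 20 -5 1], +16 rounding, >>5.
     * Assumes the caller provides padded rows so src[-2..width+2] is readable. */
    static void hpel_filter_h_c( uint8_t *dst, uint8_t *src, int width )
    {
        for( int x = 0; x < width; x++ )
        {
            int v = src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1] - 5*src[x+2] + src[x+3];
            dst[x] = clip_uint8( (v + 16) >> 5 );
        }
    }
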
---
common/x86/dct-64.asm |  2 +-
common/x86/mc-a2.asm  | 12 +++++++++---
common/x86/mc-c.c     |  2 +-
3 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/common/x86/dct-64.asm b/common/x86/dct-64.asm
index 78a2484..2a2c386 100644
--- a/common/x86/dct-64.asm
+++ b/common/x86/dct-64.asm
@@ -89,7 +89,7 @@ cextern hsub_mul
%macro IDCT8_1D 11
SUMSUB_BA %1, %6, %2, %10 ; %5=a0, %1=a2
-
+
psra%1 m%10, m%3, 1
padd%1 m%10, m%3
padd%1 m%10, m%5
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 1b8788d..dd2c686 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -461,6 +461,7 @@ cglobal hpel_filter_c, 3,3,9
%else
%define tpw_32 [pw_32]
%endif
+; This doesn't seem to be faster (with AVX) on Sandy Bridge or Bulldozer...
%if cpuflag(misalign)
.loop:
movu m4, [src-4]
@@ -559,11 +560,11 @@ cglobal hpel_filter_h_sse2, 3,3,8
jl .loop
REP_RET
-%if ARCH_X86_64 == 0
;-----------------------------------------------------------------------------
; void hpel_filter_h( uint8_t *dst, uint8_t *src, int width );
;-----------------------------------------------------------------------------
-cglobal hpel_filter_h_ssse3, 3,3
+%macro HPEL_H 0
+cglobal hpel_filter_h, 3,3
add r0, r2
add r1, r2
neg r2
@@ -573,6 +574,9 @@ cglobal hpel_filter_h_ssse3, 3,3
mova m7, [pw_16]
.loop:
mova m2, [src+16]
+ ; Using unaligned loads instead of palignr is marginally slower on SB and significantly
+ ; slower on Bulldozer, despite their fast load units -- even though it would let us avoid
+ ; the repeated loads of constants for pmaddubsw.
palignr m3, m1, m0, 14
palignr m4, m1, m0, 15
palignr m0, m2, m1, 2
@@ -596,7 +600,7 @@ cglobal hpel_filter_h_ssse3, 3,3
add r2, 16
jl .loop
REP_RET
-%endif ; !ARCH_X86_64
+%endmacro
INIT_MMX mmx2
HPEL_V 0
@@ -610,9 +614,11 @@ HPEL_C
INIT_XMM ssse3
HPEL_C
HPEL_V 0
+HPEL_H
INIT_XMM avx
HPEL_C
HPEL_V 0
+HPEL_H
%endif
%if ARCH_X86_64
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 8d3368f..dbe7d34 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -450,7 +450,7 @@ void x264_hpel_filter_avx( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t
#else
HPEL(16, sse2, sse2, sse2, sse2)
HPEL(16, ssse3, ssse3, ssse3, ssse3)
-HPEL(16, avx, avx, avx, ssse3)
+HPEL(16, avx, avx, avx, avx)
#endif
HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
#endif // HIGH_BIT_DEPTH
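
On the new comment about palignr versus unaligned loads: palignr builds each byte-shifted view from two aligned registers that are already loaded, whereas the alternative issues a separate unaligned load per offset. A hypothetical intrinsics sketch of the two approaches (illustrative only, not x264 code; function names are made up):

    #include <stdint.h>
    #include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 */

    /* Aligned loads + palignr: two 16-byte loads cover all the shifted views. */
    static void shifted_views_palignr( const uint8_t *src, __m128i out[3] )
    {
        __m128i prev = _mm_load_si128( (const __m128i*)(src - 16) );
        __m128i cur  = _mm_load_si128( (const __m128i*)src );
        out[0] = _mm_alignr_epi8( cur, prev, 14 ); /* bytes src-2 .. src+13 */
        out[1] = _mm_alignr_epi8( cur, prev, 15 ); /* bytes src-1 .. src+14 */
        out[2] = cur;                              /* bytes src   .. src+15 */
    }

    /* Unaligned-load alternative: one movdqu-style load per shifted view. */
    static void shifted_views_unaligned( const uint8_t *src, __m128i out[3] )
    {
        out[0] = _mm_loadu_si128( (const __m128i*)(src - 2) );
        out[1] = _mm_loadu_si128( (const __m128i*)(src - 1) );
        out[2] = _mm_loadu_si128( (const __m128i*)src );
    }

Per the comment added in the patch, the second form measured marginally slower on Sandy Bridge and significantly slower on Bulldozer, even though both have fast unaligned load units.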