[x264-devel] x86-64: faster SSSE3 trellis
Jason Garrett-Glaser
git at videolan.org
Mon May 20 23:06:47 CEST 2013
x264 | branch: master | Jason Garrett-Glaser <jason at x264.com> | Mon Apr 29 14:22:23 2013 -0700| [9373d5fa6e7a5cc5bcc756125cbc2e7fe058ea43] | committer: Jason Garrett-Glaser
x86-64: faster SSSE3 trellis
~2% faster trellis.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=9373d5fa6e7a5cc5bcc756125cbc2e7fe058ea43
---
common/x86/trellis-64.asm | 37 ++++++++++++++++++++-----------------
1 file changed, 20 insertions(+), 17 deletions(-)
diff --git a/common/x86/trellis-64.asm b/common/x86/trellis-64.asm
index df95f0a..bd3e803 100644
--- a/common/x86/trellis-64.asm
+++ b/common/x86/trellis-64.asm
@@ -96,6 +96,15 @@ SECTION .text
%endif
%endmacro
+%macro LOAD_DUP 2 ; dst, src: load src and replicate it into both qwords of dst
+%if cpuflag(ssse3)
+ movddup %1, %2 ; one instruction: 64-bit load copied to both qwords (with a memory src, dwords 1/3 = the 4 bytes after src)
+%else
+ movd %1, %2 ; 32-bit load; remaining bits of dst are zeroed
+ punpcklqdq %1, %1 ; copy the low qword into the high qword (dwords 1/3 stay zero) -- NOTE(review): paths differ in dwords 1/3; presumably users only consume dwords 0/2, confirm
+%endif
+%endmacro
+
;-----------------------------------------------------------------------------
; int trellis_cabac_4x4_psy(
; const int *unquant_mf, const uint8_t *zigzag, int lambda2,
@@ -186,12 +195,11 @@ cglobal %1, 4,15,9
mov dword levelgt1_ctxm, 9
%endif
%if psy
- movd m6, psy_trellism
+ LOAD_DUP m6, psy_trellism
%define psy_trellis m6
%elif dc
- movd m6, [unquant_mfq]
+ LOAD_DUP m6, [unquant_mfq]
paddd m6, m6
- punpcklqdq m6, m6
%define unquant_mf m6
%endif
%ifdef PIC
@@ -333,13 +341,12 @@ cglobal %1, 4,15,9
movd m0, abs_leveld
mov r6, orig_coefsm
%if HIGH_BIT_DEPTH
- movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
+ LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
- movd m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
- psrad m1, 16
+ LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
+ psrad m1, 16 ; sign_coef
%endif
punpcklqdq m0, m0 ; quant_coef
- punpcklqdq m1, m1 ; sign_coef
%if cpuflag(ssse3)
pabsd m0, m0
pabsd m2, m1 ; abs_coef
@@ -403,11 +410,10 @@ cglobal %1, 4,15,9
%else
%ifdef PIC
mov r10, unquant_mfm
- movd m3, [r10 + zigzagiq*4]
+ LOAD_DUP m3, [r10 + zigzagiq*4]
%else
- movd m3, [unquant_mfq + zigzagiq*4]
+ LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
%endif
- punpcklqdq m3, m3
pmuludq m0, m3
%endif
paddd m0, [pq_128]
@@ -420,8 +426,7 @@ cglobal %1, 4,15,9
%if dc
psllq m0, 8
%else
- movd m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
- punpcklqdq m5, m5
+ LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
pmuludq m0, m5
%endif
@@ -434,12 +439,11 @@ cglobal %1, 4,15,9
; ssd1[k] -= psy_weight * psy_value;
mov r6, fenc_dctm
%if HIGH_BIT_DEPTH
- movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
+ LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
- movd m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
+ LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
psrad m3, 16 ; orig_coef
%endif
- punpcklqdq m3, m3
%if cpuflag(ssse3)
psignd m4, m1 ; SIGN(unquant_abs_level, sign_coef)
%else
@@ -453,9 +457,8 @@ cglobal %1, 4,15,9
ABSD m3, m4
SWAP 4, 3
%endif
- movd m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
+ LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
pmuludq m1, psy_trellis
- punpcklqdq m1, m1
pmuludq m4, m1
psubq m0, m4
%if %1
More information about the x264-devel
mailing list