[x264-devel] commit: Slightly faster SSE4 SA8D, SSE4 Hadamard_AC, SSE2 SSIM (Jason Garrett-Glaser)
git version control
git at videolan.org
Wed Apr 15 00:56:16 CEST 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Tue Apr 14 14:47:02 2009 -0700| [37424f87a43a2dcecba61bc24b46506ced32307c] | committer: Jason Garrett-Glaser
Slightly faster SSE4 SA8D, SSE4 Hadamard_AC, SSE2 SSIM
shufps is the most underrated SSE instruction on x86.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=37424f87a43a2dcecba61bc24b46506ced32307c
---
common/x86/pixel-a.asm | 63 +++++++++++++++++++++++++++++-------------------
1 files changed, 38 insertions(+), 25 deletions(-)
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 6cf47ee..bc7e00f 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -429,24 +429,24 @@ cglobal x264_pixel_var_8x8_sse2, 2,4,8
%macro TRANS_SSE4 5-6 ; see above
%ifidn %1, d
-%define mask 10101010b
-%define shift 16
-%elifidn %1, q
-%define mask 11001100b
-%define shift 32
-%endif
mova m%5, m%3
%ifidn %2, ord
- psrl%1 m%3, shift
+ psrl%1 m%3, 16
%endif
- pblendw m%3, m%4, mask
- psll%1 m%4, shift
+ pblendw m%3, m%4, 10101010b
+ psll%1 m%4, 16
%ifidn %2, ord
- pblendw m%4, m%5, 255^mask
+ pblendw m%4, m%5, 01010101b
%else
- psrl%1 m%5, shift
+ psrl%1 m%5, 16
por m%4, m%5
%endif
+%elifidn %1, q
+ mova m%5, m%3
+ shufps m%3, m%4, 10001000b
+ shufps m%5, m%4, 11011101b
+ SWAP %4, %5
+%endif
%endmacro
%macro JDUP_SSE2 2
@@ -1923,29 +1923,43 @@ HADAMARD_AC_SSE2 sse4
; void x264_pixel_ssim_4x4x2_core_sse2( const uint8_t *pix1, int stride1,
; const uint8_t *pix2, int stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
-cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
- pxor m0, m0
- pxor m1, m1
- pxor m2, m2
- pxor m3, m3
- pxor m4, m4
-%rep 4
- movq m5, [r0]
- movq m6, [r2]
+
+%macro SSIM_ITER 1
+ movq m5, [r0+(%1&1)*r1]
+ movq m6, [r2+(%1&1)*r3]
punpcklbw m5, m0
punpcklbw m6, m0
+%if %1==1
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+%endif
+%if %1==0
+ movdqa m1, m5
+ movdqa m2, m6
+%else
paddw m1, m5
paddw m2, m6
+%endif
movdqa m7, m5
pmaddwd m5, m5
pmaddwd m7, m6
pmaddwd m6, m6
+%if %1==0
+ SWAP m3, m5
+ SWAP m4, m7
+%else
paddd m3, m5
paddd m4, m7
+%endif
paddd m3, m6
- add r0, r1
- add r2, r3
-%endrep
+%endmacro
+
+cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
+ pxor m0, m0
+ SSIM_ITER 0
+ SSIM_ITER 1
+ SSIM_ITER 2
+ SSIM_ITER 3
; PHADDW m1, m2
; PHADDD m3, m4
movdqa m7, [pw_1 GLOBAL]
@@ -1971,8 +1985,7 @@ cglobal x264_pixel_ssim_4x4x2_core_sse2, 4,4,8
movq [t0+ 0], m1
movq [t0+ 8], m3
- psrldq m1, 8
- movq [t0+16], m1
+ movhps [t0+16], m1
movq [t0+24], m5
RET
More information about the x264-devel mailing list