[x264-devel] commit: Cacheline-split SSSE3 chroma MC (Jason Garrett-Glaser)
git version control
git at videolan.org
Fri Jul 17 06:46:49 CEST 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Wed Jul 15 12:43:35 2009 -0700| [f21daff3dc11cf5881f1727c3c9d505f0810d20b] | committer: Jason Garrett-Glaser
Cacheline-split SSSE3 chroma MC
~70% faster chroma MC on 32-bit Conroe
Also slightly faster SSSE3 intra_sad_8x8c
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=f21daff3dc11cf5881f1727c3c9d505f0810d20b
---
common/frame.c | 2 +-
common/x86/mc-a.asm | 76 +++++++++++++++++++++++++++++++++++++++++-------
common/x86/mc-c.c | 5 +++
common/x86/sad-a.asm | 28 ++++++++++++++----
common/x86/x86inc.asm | 8 ++--
tools/checkasm.c | 7 ++--
6 files changed, 101 insertions(+), 25 deletions(-)
diff --git a/common/frame.c b/common/frame.c
index cc4b1b3..23e6824 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -50,7 +50,7 @@ x264_frame_t *x264_frame_new( x264_t *h )
frame->i_plane = 3;
for( i = 0; i < 3; i++ )
{
- frame->i_stride[i] = ALIGN( i_stride >> !!i, 16 );
+ frame->i_stride[i] = ALIGN( i_stride >> !!i, align );
frame->i_width[i] = i_width >> !!i;
frame->i_lines[i] = i_lines >> !!i;
}
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 6b435d8..206bd35 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -25,8 +25,9 @@
%include "x86inc.asm"
-SECTION_RODATA
+SECTION_RODATA 32
+ch_shuffle: db 0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,0,0
pw_4: times 8 dw 4
pw_8: times 8 dw 8
pw_32: times 8 dw 32
@@ -869,8 +870,9 @@ MC_CHROMA mmxext
INIT_XMM
MC_CHROMA sse2, 8
+%macro MC_CHROMA_SSSE3 2
INIT_MMX
-cglobal x264_mc_chroma_ssse3, 0,6,8
+cglobal x264_mc_chroma_ssse3%1, 0,6,%2
MC_CHROMA_START
and r4d, 7
and r5d, 7
@@ -887,7 +889,7 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
- movifnidn r0, r0mp
+ movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
@@ -925,23 +927,28 @@ cglobal x264_mc_chroma_ssse3, 0,6,8
INIT_XMM
.width8:
- mova m5, [pw_32 GLOBAL]
movd m6, r5d
movd m7, r4d
- movifnidn r0, r0mp
+ movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r4d, r7m
SPLATW m6, m6
SPLATW m7, m7
+%ifidn %1, _cache64
+ mov r5, r2
+ and r5, 0x3f
+ cmp r5, 0x38
+ jge .split
+%endif
+ mova m5, [pw_32 GLOBAL]
movh m0, [r2]
movh m1, [r2+1]
punpcklbw m0, m1
- add r2, r3
.loop8:
- movh m1, [r2]
- movh m2, [r2+1]
- movh m3, [r2+r3]
- movh m4, [r2+r3+1]
+ movh m1, [r2+1*r3]
+ movh m2, [r2+1*r3+1]
+ movh m3, [r2+2*r3]
+ movh m4, [r2+2*r3+1]
punpcklbw m1, m2
punpcklbw m3, m4
lea r2, [r2+2*r3]
@@ -965,6 +972,53 @@ INIT_XMM
lea r0, [r0+2*r1]
jg .loop8
REP_RET
-
+%ifidn %1, _cache64
+.split:
+ and r2, ~7
+ and r5, 7
+%ifdef PIC
+ lea r11, [ch_shuffle GLOBAL]
+ movu m5, [r11 + r5*2]
+%else
+ movu m5, [ch_shuffle + r5*2 GLOBAL]
+%endif
+ movu m0, [r2]
+ pshufb m0, m5
+%ifdef ARCH_X86_64
+ mova m8, [pw_32 GLOBAL]
+ %define round m8
+%else
+ %define round [pw_32 GLOBAL]
+%endif
+.splitloop8:
+ movu m1, [r2+r3]
+ pshufb m1, m5
+ movu m3, [r2+2*r3]
+ pshufb m3, m5
+ lea r2, [r2+2*r3]
+ mova m2, m1
+ mova m4, m3
+ pmaddubsw m0, m7
+ pmaddubsw m1, m6
+ pmaddubsw m2, m7
+ pmaddubsw m3, m6
+ paddw m0, round
+ paddw m2, round
+ paddw m1, m0
+ paddw m3, m2
+ mova m0, m4
+ psrlw m1, 6
+ psrlw m3, 6
+ packuswb m1, m3
+ movh [r0], m1
+ movhps [r0+r1], m1
+ sub r4d, 2
+ lea r0, [r0+2*r1]
+ jg .splitloop8
+ REP_RET
+%endif
; mc_chroma 1d ssse3 is negligibly faster, and definitely not worth the extra code size
+%endmacro
+MC_CHROMA_SSSE3 , 8
+MC_CHROMA_SSSE3 _cache64, 9
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index aede5b8..dcf623a 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -59,6 +59,9 @@ extern void x264_mc_chroma_sse2( uint8_t *src, int i_src_stride,
extern void x264_mc_chroma_ssse3( uint8_t *src, int i_src_stride,
uint8_t *dst, int i_dst_stride,
int dx, int dy, int i_width, int i_height );
+extern void x264_mc_chroma_ssse3_cache64( uint8_t *src, int i_src_stride,
+ uint8_t *dst, int i_dst_stride,
+ int dx, int dy, int i_width, int i_height );
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
@@ -340,6 +343,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->hpel_filter = x264_hpel_filter_ssse3;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_ssse3;
pf->mc_chroma = x264_mc_chroma_ssse3;
+ if( cpu&X264_CPU_CACHELINE_64 )
+ pf->mc_chroma = x264_mc_chroma_ssse3_cache64;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->integral_init4v = x264_integral_init4v_ssse3;
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 68d8584..342a984 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -28,9 +28,8 @@
SECTION_RODATA
pb_3: times 16 db 3
+pb_shuf8x8c: db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
pw_8: times 4 dw 8
-pb_shuf8x8c0: db 0,0,0,0,2,2,2,2
-pb_shuf8x8c1: db 4,4,4,4,6,6,6,6
sw_64: dd 64
SECTION .text
@@ -450,16 +449,32 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
%ifidn %1, ssse3
- movq m1, m0
- pshufb m0, [pb_shuf8x8c0 GLOBAL]
- pshufb m1, [pb_shuf8x8c1 GLOBAL]
+ movq2dq xmm0, m0
+ pshufb xmm0, [pb_shuf8x8c GLOBAL]
+ movq xmm1, [r0+FENC_STRIDE*0]
+ movq xmm2, [r0+FENC_STRIDE*1]
+ movq xmm3, [r0+FENC_STRIDE*2]
+ movq xmm4, [r0+FENC_STRIDE*3]
+ movhps xmm1, [r0+FENC_STRIDE*4]
+ movhps xmm2, [r0+FENC_STRIDE*5]
+ movhps xmm3, [r0+FENC_STRIDE*6]
+ movhps xmm4, [r0+FENC_STRIDE*7]
+ psadbw xmm1, xmm0
+ psadbw xmm2, xmm0
+ psadbw xmm3, xmm0
+ psadbw xmm4, xmm0
+ paddw xmm1, xmm2
+ paddw xmm1, xmm3
+ paddw xmm1, xmm4
+ movhlps xmm0, xmm1
+ paddw xmm1, xmm0
+ movd [r2], xmm1
%else
packuswb m0, m0
punpcklbw m0, m0
movq m1, m0
punpcklbw m0, m0 ; 4x dc0 4x dc1
punpckhbw m1, m1 ; 4x dc2 4x dc3
-%endif
movq m2, [r0+FENC_STRIDE*0]
movq m3, [r0+FENC_STRIDE*1]
movq m4, [r0+FENC_STRIDE*2]
@@ -483,6 +498,7 @@ cglobal x264_intra_sad_x3_8x8c_%1, 3,3
paddw m6, m0
paddw m2, m6
movd [r2], m2
+%endif
RET
%endmacro
diff --git a/common/x86/x86inc.asm b/common/x86/x86inc.asm
index fced5c6..15990d5 100644
--- a/common/x86/x86inc.asm
+++ b/common/x86/x86inc.asm
@@ -37,14 +37,14 @@
; Name of the .rodata section.
; Kludge: Something on OS X fails to align .rodata even given an align attribute,
; so use a different read-only section.
-%macro SECTION_RODATA 0
+%macro SECTION_RODATA 0-1 16
%ifidn __OUTPUT_FORMAT__,macho64
- SECTION .text align=16
+ SECTION .text align=%1
%elifidn __OUTPUT_FORMAT__,macho
- SECTION .text align=16
+ SECTION .text align=%1
fakegot:
%else
- SECTION .rodata align=16
+ SECTION .rodata align=%1
%endif
%endmacro
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 4aba9e1..2d4ec77 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -803,8 +803,8 @@ static int check_mc( int cpu_ref, int cpu_new )
used_asm = 1; \
memset(buf3, 0xCD, 1024); \
memset(buf4, 0xCD, 1024); \
- call_c( mc_c.mc_chroma, dst1, 16, src, 32, dx, dy, w, h ); \
- call_a( mc_a.mc_chroma, dst2, 16, src, 32, dx, dy, w, h ); \
+ call_c( mc_c.mc_chroma, dst1, 16, src, 64, dx, dy, w, h ); \
+ call_a( mc_a.mc_chroma, dst2, 16, src, 64, dx, dy, w, h ); \
/* mc_chroma width=2 may write garbage to the right of dst. ignore that. */\
for( j=0; j<h; j++ ) \
for( i=w; i<4; i++ ) \
@@ -834,8 +834,9 @@ static int check_mc( int cpu_ref, int cpu_new )
ok = 1; used_asm = 0;
for( dy = -1; dy < 9; dy++ )
- for( dx = -1; dx < 9; dx++ )
+ for( dx = -128; dx < 128; dx++ )
{
+ if( rand()&15 ) continue;
MC_TEST_CHROMA( 8, 8 );
MC_TEST_CHROMA( 8, 4 );
MC_TEST_CHROMA( 4, 8 );
More information about the x264-devel
mailing list