[x264-devel] commit: new ssd_8x*_sse2 (Loren Merritt)
git version control
git at videolan.org
Thu Mar 20 22:35:26 CET 2008
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Thu Mar 20 14:00:08 2008 -0600| [adfab36d395dff335c5a34d050c84ac8e7e1b470]
new ssd_8x*_sse2
align ssd_16x*_sse2
unroll ssd_4x*_mmx
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=adfab36d395dff335c5a34d050c84ac8e7e1b470
---
common/pixel.c | 11 +++--
common/pixel.h | 2 +
common/x86/pixel-a.asm | 99 +++++++++++++++++++++++++++++++++++------------
tools/checkasm.c | 14 +++---
4 files changed, 89 insertions(+), 37 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index ae90845..1d5567b 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -99,14 +99,17 @@ int64_t x264_pixel_ssd_wxh( x264_pixel_function_t *pf, uint8_t *pix1, int i_pix1
{
int64_t i_ssd = 0;
int x, y;
+ int align = !(((long)pix1 | (long)pix2 | i_pix1 | i_pix2) & 15);
#define SSD(size) i_ssd += pf->ssd[size]( pix1 + y*i_pix1 + x, i_pix1, \
pix2 + y*i_pix2 + x, i_pix2 );
for( y = 0; y < i_height-15; y += 16 )
{
- for( x = 0; x < i_width-15; x += 16 )
- SSD(PIXEL_16x16);
- if( x < i_width-7 )
+ x = 0;
+ if( align )
+ for( ; x < i_width-15; x += 16 )
+ SSD(PIXEL_16x16);
+ for( ; x < i_width-7; x += 8 )
SSD(PIXEL_8x16);
}
if( y < i_height-7 )
@@ -610,7 +613,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
// these are faster on both Intel and AMD
if( cpu&X264_CPU_SSE2 )
{
- INIT2( ssd, _sse2 );
+ INIT5( ssd, _sse2 );
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_sse2;
pixf->ssim_end4 = x264_pixel_ssim_end4_sse2;
pixf->sa8d[PIXEL_16x16] = x264_pixel_sa8d_16x16_sse2;
diff --git a/common/pixel.h b/common/pixel.h
index fb5f99e..d533620 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -24,6 +24,8 @@
#ifndef _PIXEL_H
#define _PIXEL_H 1
+// SSD assumes all args aligned
+// other cmp functions assume first arg aligned
typedef int (*x264_pixel_cmp_t) ( uint8_t *, int, uint8_t *, int );
typedef void (*x264_pixel_cmp_x3_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[3] );
typedef void (*x264_pixel_cmp_x4_t) ( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int, int[4] );
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index 557aeb8..b4d0656 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -83,41 +83,64 @@ SECTION .text
paddd mm0, mm4
%endmacro
-%macro SSD_INC_1x8P 0
+%macro SSD_INC_2x16P 0
+ SSD_INC_1x16P
+ SSD_INC_1x16P
+%endmacro
+
+%macro SSD_INC_2x8P 0
movq mm1, [r0]
movq mm2, [r2]
+ movq mm3, [r0+r1]
+ movq mm4, [r2+r3]
movq mm5, mm2
+ movq mm6, mm4
psubusb mm2, mm1
+ psubusb mm4, mm3
psubusb mm1, mm5
- por mm1, mm2 ; mm1 = 8bit abs diff
+ psubusb mm3, mm6
+ por mm1, mm2
+ por mm3, mm4
movq mm2, mm1
+ movq mm4, mm3
punpcklbw mm1, mm7
- punpckhbw mm2, mm7 ; (mm1,mm2) = 16bit abs diff
+ punpcklbw mm3, mm7
+ punpckhbw mm2, mm7
+ punpckhbw mm4, mm7
pmaddwd mm1, mm1
pmaddwd mm2, mm2
+ pmaddwd mm3, mm3
+ pmaddwd mm4, mm4
- add r0, r1
- add r2, r3
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
paddd mm0, mm1
paddd mm0, mm2
+ paddd mm0, mm3
+ paddd mm0, mm4
%endmacro
-%macro SSD_INC_1x4P 0
- movd mm1, [r0]
- movd mm2, [r2]
-
- movq mm5, mm2
- psubusb mm2, mm1
- psubusb mm1, mm5
- por mm1, mm2
- punpcklbw mm1, mm7
- pmaddwd mm1, mm1
-
- add r0, r1
- add r2, r3
- paddd mm0, mm1
+%macro SSD_INC_2x4P 0
+ movd mm1, [r0]
+ movd mm2, [r2]
+ movd mm3, [r0+r1]
+ movd mm4, [r2+r3]
+
+ punpcklbw mm1, mm7
+ punpcklbw mm2, mm7
+ punpcklbw mm3, mm7
+ punpcklbw mm4, mm7
+ psubw mm1, mm2
+ psubw mm3, mm4
+ pmaddwd mm1, mm1
+ pmaddwd mm3, mm3
+
+ lea r0, [r0+2*r1]
+ lea r2, [r2+2*r3]
+ paddd mm0, mm1
+ paddd mm0, mm3
%endmacro
;-----------------------------------------------------------------------------
@@ -127,8 +150,8 @@ SECTION .text
cglobal x264_pixel_ssd_%1x%2_mmx, 4,4
pxor mm7, mm7 ; zero
pxor mm0, mm0 ; mm0 holds the sum
-%rep %2
- SSD_INC_1x%1P
+%rep %2/2
+ SSD_INC_2x%1P
%endrep
movq mm1, mm0
psrlq mm1, 32
@@ -146,10 +169,10 @@ SSD_MMX 4, 8
SSD_MMX 4, 4
%macro SSD_INC_2x16P_SSE2 0
- movdqu xmm1, [r0]
- movdqu xmm2, [r2]
- movdqu xmm3, [r0+r1]
- movdqu xmm4, [r2+r3]
+ movdqa xmm1, [r0]
+ movdqa xmm2, [r2]
+ movdqa xmm3, [r0+r1]
+ movdqa xmm4, [r2+r3]
movdqa xmm5, xmm1
movdqa xmm6, xmm3
@@ -180,6 +203,27 @@ SSD_MMX 4, 4
paddd xmm0, xmm3
%endmacro
+%macro SSD_INC_2x8P_SSE2 0
+ movq xmm1, [r0]
+ movq xmm2, [r2]
+ movq xmm3, [r0+r1]
+ movq xmm4, [r2+r3]
+
+ punpcklbw xmm1,xmm7
+ punpcklbw xmm2,xmm7
+ punpcklbw xmm3,xmm7
+ punpcklbw xmm4,xmm7
+ psubw xmm1,xmm2
+ psubw xmm3,xmm4
+ pmaddwd xmm1,xmm1
+ pmaddwd xmm3,xmm3
+
+ lea r0, [r0+r1*2]
+ lea r2, [r2+r3*2]
+ paddd xmm0, xmm1
+ paddd xmm0, xmm3
+%endmacro
+
;-----------------------------------------------------------------------------
; int x264_pixel_ssd_16x16_sse2 (uint8_t *, int, uint8_t *, int )
;-----------------------------------------------------------------------------
@@ -188,7 +232,7 @@ cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
pxor xmm7, xmm7
pxor xmm0, xmm0
%rep %2/2
- SSD_INC_2x16P_SSE2
+ SSD_INC_2x%1P_SSE2
%endrep
HADDD xmm0, xmm1
movd eax, xmm0
@@ -197,6 +241,9 @@ cglobal x264_pixel_ssd_%1x%2_sse2, 4,4
SSD_SSE2 16, 16
SSD_SSE2 16, 8
+SSD_SSE2 8, 16
+SSD_SSE2 8, 8
+SSD_SSE2 8, 4
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 2b947b0..f8f2e35 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -48,7 +48,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
x264_predict_4x4_init( 0, predict_4x4 );
x264_predict_8x8_filter( buf2+40, edge, ALL_NEIGHBORS, ALL_NEIGHBORS );
-#define TEST_PIXEL( name ) \
+#define TEST_PIXEL( name, align ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
{ \
int res_c, res_asm; \
@@ -57,8 +57,8 @@ static int check_pixel( int cpu_ref, int cpu_new )
for( j=0; j<64; j++ ) \
{ \
used_asm = 1; \
- res_c = call_c( pixel_c.name[i], buf1, 32, buf2+j, 16 ); \
- res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j, 16 ); \
+ res_c = call_c( pixel_c.name[i], buf1, 32, buf2+j*!align, 16 ); \
+ res_asm = call_a( pixel_asm.name[i], buf1, 32, buf2+j*!align, 16 ); \
if( res_c != res_asm ) \
{ \
ok = 0; \
@@ -70,10 +70,10 @@ static int check_pixel( int cpu_ref, int cpu_new )
} \
report( "pixel " #name " :" );
- TEST_PIXEL( sad );
- TEST_PIXEL( ssd );
- TEST_PIXEL( satd );
- TEST_PIXEL( sa8d );
+ TEST_PIXEL( sad, 0 );
+ TEST_PIXEL( ssd, 1 );
+ TEST_PIXEL( satd, 0 );
+ TEST_PIXEL( sa8d, 0 );
#define TEST_PIXEL_X( N ) \
for( i = 0, ok = 1, used_asm = 0; i < 7; i++ ) \
More information about the x264-devel mailing list