[x264-devel] commit: intra_sad_x3_8x8c assembly (Jason Garrett-Glaser )
git version control
git at videolan.org
Tue Mar 31 05:04:45 CEST 2009
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Mon Mar 30 04:07:50 2009 -0700| [d39f8ae1730b07286f1bb281a22d8cd57d0f90b9] | committer: Jason Garrett-Glaser
intra_sad_x3_8x8c assembly
Also fix intra_sad_x3_16x16's use of "n" as a loop variable (broke SWAP)
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d39f8ae1730b07286f1bb281a22d8cd57d0f90b9
---
common/pixel.c | 4 +-
common/pixel.h | 2 +
common/x86/pixel.h | 2 +
common/x86/sad-a.asm | 127 ++++++++++++++++++++++++++++++++++++++++++++++++-
encoder/analyse.c | 7 ++-
encoder/encoder.c | 1 +
tools/checkasm.c | 1 +
7 files changed, 137 insertions(+), 7 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 76d04e0..24bf430 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -662,8 +662,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
#endif
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_mmxext;
- pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_mmxext;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_mmxext;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_mmxext;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_mmxext;
}
@@ -753,6 +754,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_ssse3;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_ssse3;
#ifdef ARCH_X86_64
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
diff --git a/common/pixel.h b/common/pixel.h
index a08879c..f1c901e 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -97,7 +97,9 @@ typedef struct
void (*intra_mbcmp_x3_16x16)( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sad_x3_16x16) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_mbcmp_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
+ void (*intra_sad_x3_8x8c) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_satd_x3_4x4) ( uint8_t *fenc, uint8_t *fdec , int res[3] );
void (*intra_sa8d_x3_8x8) ( uint8_t *fenc, uint8_t edge[33], int res[3] );
} x264_pixel_function_t;
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 0bb7dfe..1e04dcd 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -81,6 +81,8 @@ void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_mmxext( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_16x16_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_16x16_mmxext ( uint8_t *, uint8_t *, int * );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index c60ca54..25e3708 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -28,6 +28,8 @@
SECTION_RODATA
pb_3: times 16 db 3
+pb_shuf8x8c0: db 0,0,0,0,2,2,2,2
+pb_shuf8x8c1: db 4,4,4,4,6,6,6,6
sw_64: dd 64
SECTION .text
@@ -256,6 +258,125 @@ cglobal x264_pixel_sad_8x16_sse2, 4,4
RET
;-----------------------------------------------------------------------------
+; void intra_sad_x3_8x8c ( uint8_t *fenc, uint8_t *fdec, int res[3] );
+;-----------------------------------------------------------------------------
+
+%macro INTRA_SAD_HV_ITER 2 ; %1 = first row of a row pair, %2 = cpu suffix (ssse3 selects the pshufb path)
+%ifidn %2, ssse3
+ movd m1, [r1 + FDEC_STRIDE*(%1-4) - 4] ; 4 fdec bytes ending at the pixel left of row %1 (INTRA_SAD_8x8C advances r1 by 4 rows first, hence the -4)
+ movd m3, [r1 + FDEC_STRIDE*(%1-3) - 4] ; same for row %1+1
+ pshufb m1, m7 ; m7 is the shuffle mask set up by the invoking code (pb_3 in INTRA_SAD_8x8C): replicate byte 3, the left pixel
+ pshufb m3, m7
+%else
+ movq m1, [r1 + FDEC_STRIDE*(%1-4) - 8] ; 8 fdec bytes ending at the pixel left of row %1
+ movq m3, [r1 + FDEC_STRIDE*(%1-3) - 8] ; same for row %1+1
+ punpckhbw m1, m1 ; double each of the high 4 bytes; the top word is now the left pixel twice
+ punpckhbw m3, m3
+ pshufw m1, m1, 0xff ; replicate the top word: left pixel in all 8 bytes
+ pshufw m3, m3, 0xff
+%endif
+ movq m4, [r0 + FENC_STRIDE*(%1+0)] ; 8 fenc pixels of row %1
+ movq m5, [r0 + FENC_STRIDE*(%1+1)] ; 8 fenc pixels of row %1+1
+ psadbw m1, m4 ; SAD of row %1 against its replicated left pixel
+ psadbw m3, m5
+ psadbw m4, m6 ; SAD of row %1 against m6, which the macro reads but never writes (INTRA_SAD_8x8C loads the row above the block into it)
+ psadbw m5, m6
+ paddw m1, m3 ; left-pixel SAD of the row pair
+ paddw m4, m5 ; top-row SAD of the row pair
+%if %1
+ paddw m0, m1 ; later pairs: accumulate, m0 = left-pixel total, m2 = top-row total
+ paddw m2, m4
+%else
+ SWAP 0,1 ; first pair (%1 == 0). NOTE(review): SWAP is defined outside this view; presumably an assemble-time register rename so the sums become m0/m2 without zeroing accumulators - confirm
+ SWAP 2,4
+%endif
+%endmacro
+
+%macro INTRA_SAD_8x8C 1 ; %1 = cpu suffix (mmxext or ssse3); used in the function name and to pick code paths
+cglobal x264_intra_sad_x3_8x8c_%1, 3,3 ; used below as r0 = fenc, r1 = fdec, r2 = res (presumably the prototype's argument order - confirm against cglobal)
+ movq m6, [r1 - FDEC_STRIDE] ; 8 fdec pixels of the row above the block
+ add r1, FDEC_STRIDE*4 ; r1 now points 4 rows down; all later fdec offsets are relative to that
+%ifidn %1,ssse3
+ movq m7, [pb_3 GLOBAL] ; pshufb mask whose every index is 3
+%endif
+ INTRA_SAD_HV_ITER 0, %1 ; rows 0-1
+ INTRA_SAD_HV_ITER 2, %1 ; rows 2-3
+ INTRA_SAD_HV_ITER 4, %1 ; rows 4-5
+ INTRA_SAD_HV_ITER 6, %1 ; rows 6-7
+ movd [r2+4], m0 ; byte offset 4 of res: total SAD against the replicated left pixels
+ movd [r2+8], m2 ; byte offset 8 of res: total SAD against the row above
+ pxor m7, m7 ; m7 = 0 from here on
+ movq m2, [r1 + FDEC_STRIDE*-4 - 8] ; 8 bytes ending at the left neighbour of row 0
+ movq m4, [r1 + FDEC_STRIDE*-2 - 8] ; ... of row 2
+ movq m3, [r1 + FDEC_STRIDE* 0 - 8] ; ... of row 4
+ movq m5, [r1 + FDEC_STRIDE* 2 - 8] ; ... of row 6
+ punpckhbw m2, [r1 + FDEC_STRIDE*-3 - 8] ; interleave with row 1: top two bytes = left pixels of rows 0,1
+ punpckhbw m4, [r1 + FDEC_STRIDE*-1 - 8] ; rows 2,3
+ punpckhbw m3, [r1 + FDEC_STRIDE* 1 - 8] ; rows 4,5
+ punpckhbw m5, [r1 + FDEC_STRIDE* 3 - 8] ; rows 6,7
+ punpckhbw m2, m4 ; high dword = left pixels of rows 0,2,1,3
+ punpckhbw m3, m5 ; high dword = left pixels of rows 4,6,5,7
+ psrlq m2, 32 ; keep only those 4 left pixels, upper half zeroed
+ psrlq m3, 32
+ psadbw m2, m7 ; s2 = sum of the left pixels of rows 0-3 (SAD against zero)
+ psadbw m3, m7 ; s3 = sum of the left pixels of rows 4-7
+ movq m1, m6 ; second copy of the row above
+ SWAP 0, 6 ; NOTE(review): SWAP is defined outside this view; presumably makes m0 name the register holding the row above - confirm
+ punpckldq m0, m7 ; top pixels 0-3, upper half zeroed
+ punpckhdq m1, m7 ; top pixels 4-7, upper half zeroed
+ psadbw m0, m7 ; s0 = sum of top pixels 0-3
+ psadbw m1, m7 ; s1 = sum of top pixels 4-7
+ punpcklwd m0, m1 ; words: s0 s1 0 0
+ punpcklwd m2, m3 ; words: s2 s3 0 0
+ punpckldq m0, m2 ; words: s0 s1 s2 s3
+ pshufw m3, m0, 11110110b ; words: s2 s1 s3 s3
+ pshufw m0, m0, 01110100b ; words: s0 s1 s3 s1
+ paddw m0, m3 ; words: s0+s2, 2*s1, 2*s3, s1+s3
+ psrlw m0, 2
+ pavgw m0, m7 ; rounding halve (average with 0): dc0=(s0+s2+4)>>3, dc1=(s1+2)>>2, dc2=(s3+2)>>2, dc3=(s1+s3+4)>>3
+%ifidn %1, ssse3
+ movq m1, m0
+ pshufb m0, [pb_shuf8x8c0 GLOBAL] ; bytes: 4x dc0 4x dc1 (mask picks the low byte of words 0 and 1)
+ pshufb m1, [pb_shuf8x8c1 GLOBAL] ; bytes: 4x dc2 4x dc3 (mask picks the low byte of words 2 and 3)
+%else
+ packuswb m0, m0 ; bytes: dc0 dc1 dc2 dc3 dc0 dc1 dc2 dc3
+ punpcklbw m0, m0 ; bytes: 2x dc0 2x dc1 2x dc2 2x dc3
+ movq m1, m0
+ punpcklbw m0, m0 ; 4x dc0 4x dc1
+ punpckhbw m1, m1 ; 4x dc2 4x dc3
+%endif
+ movq m2, [r0+FENC_STRIDE*0] ; fenc rows 0-5 into m2..m7
+ movq m3, [r0+FENC_STRIDE*1]
+ movq m4, [r0+FENC_STRIDE*2]
+ movq m5, [r0+FENC_STRIDE*3]
+ movq m6, [r0+FENC_STRIDE*4]
+ movq m7, [r0+FENC_STRIDE*5]
+ psadbw m2, m0 ; rows 0-3 against the dc0/dc1 row
+ psadbw m3, m0
+ psadbw m4, m0
+ psadbw m5, m0
+ movq m0, [r0+FENC_STRIDE*6] ; m0 is free once rows 0-3 are done; reuse it for fenc row 6
+ psadbw m6, m1 ; rows 4-6 against the dc2/dc3 row
+ psadbw m7, m1
+ psadbw m0, m1
+ psadbw m1, [r0+FENC_STRIDE*7] ; row 7: the result overwrites the dc2/dc3 row, which is no longer needed
+ paddw m2, m3 ; sum the eight per-row SADs into m2
+ paddw m4, m5
+ paddw m6, m7
+ paddw m0, m1
+ paddw m2, m4
+ paddw m6, m0
+ paddw m2, m6
+ movd [r2], m2 ; byte offset 0 of res: total SAD against the per-quadrant DC values
+ RET
+%endmacro
+
+INIT_MMX ; NOTE(review): defined outside this view; presumably maps the m# names to MMX registers for the code below - confirm
+INTRA_SAD_8x8C mmxext ; expands to x264_intra_sad_x3_8x8c_mmxext (punpck/pshufw broadcast paths)
+INTRA_SAD_8x8C ssse3 ; expands to x264_intra_sad_x3_8x8c_ssse3 (pshufb broadcast paths)
+
+
+;-----------------------------------------------------------------------------
; void intra_sad_x3_16x16 ( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
@@ -272,11 +393,11 @@ cglobal x264_intra_sad_x3_16x16_%1,3,5,%2
%ifidn %1, ssse3
mova m1, [pb_3 GLOBAL]
%endif
-%assign n 0
+%assign x 0
%rep 16
- movzx r4d, byte [r1-1+FDEC_STRIDE*n]
+ movzx r4d, byte [r1-1+FDEC_STRIDE*x]
add r3d, r4d
-%assign n n+1
+%assign x x+1
%endrep
add r3d, 16
shr r3d, 5
diff --git a/encoder/analyse.c b/encoder/analyse.c
index e2c40d7..a3c0924 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -539,6 +539,7 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
int i_max;
int predict_mode[4];
+ int b_merged_satd = !!h->pixf.intra_mbcmp_x3_8x8c && !h->mb.b_lossless;
uint8_t *p_dstc[2], *p_srcc[2];
@@ -553,11 +554,11 @@ static void x264_mb_analyse_intra_chroma( x264_t *h, x264_mb_analysis_t *a )
predict_8x8chroma_mode_available( h->mb.i_neighbour, predict_mode, &i_max );
a->i_satd_i8x8chroma = COST_MAX;
- if( i_max == 4 && h->pixf.intra_satd_x3_8x8c && h->pixf.mbcmp[0] == h->pixf.satd[0] )
+ if( i_max == 4 && b_merged_satd )
{
int satdu[4], satdv[4];
- h->pixf.intra_satd_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
- h->pixf.intra_satd_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
+ h->pixf.intra_mbcmp_x3_8x8c( p_srcc[0], p_dstc[0], satdu );
+ h->pixf.intra_mbcmp_x3_8x8c( p_srcc[1], p_dstc[1], satdv );
h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[0] );
h->predict_8x8c[I_PRED_CHROMA_P]( p_dstc[1] );
satdu[I_PRED_CHROMA_P] =
diff --git a/encoder/encoder.c b/encoder/encoder.c
index 8361369..2d46f61 100644
--- a/encoder/encoder.c
+++ b/encoder/encoder.c
@@ -605,6 +605,7 @@ static void mbcmp_init( x264_t *h )
memcpy( h->pixf.mbcmp, satd ? h->pixf.satd : h->pixf.sad_aligned, sizeof(h->pixf.mbcmp) );
memcpy( h->pixf.mbcmp_unaligned, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.mbcmp_unaligned) );
h->pixf.intra_mbcmp_x3_16x16 = satd ? h->pixf.intra_satd_x3_16x16 : h->pixf.intra_sad_x3_16x16;
+ h->pixf.intra_mbcmp_x3_8x8c = satd ? h->pixf.intra_satd_x3_8x8c : h->pixf.intra_sad_x3_8x8c;
satd &= h->param.analyse.i_me_method == X264_ME_TESA;
memcpy( h->pixf.fpelcmp, satd ? h->pixf.satd : h->pixf.sad, sizeof(h->pixf.fpelcmp) );
memcpy( h->pixf.fpelcmp_x3, satd ? h->pixf.satd_x3 : h->pixf.sad_x3, sizeof(h->pixf.fpelcmp_x3) );
diff --git a/tools/checkasm.c b/tools/checkasm.c
index e5f62e2..344e6ae 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -405,6 +405,7 @@ static int check_pixel( int cpu_ref, int cpu_new )
TEST_INTRA_MBCMP( intra_sa8d_x3_8x8 , predict_8x8 , sa8d[PIXEL_8x8] , 1, edge );
report( "intra satd_x3 :" );
TEST_INTRA_MBCMP( intra_sad_x3_16x16 , predict_16x16, sad [PIXEL_16x16], 0 );
+ TEST_INTRA_MBCMP( intra_sad_x3_8x8c , predict_8x8c , sad [PIXEL_8x8] , 0 );
report( "intra sad_x3 :" );
if( pixel_asm.ssim_4x4x2_core != pixel_ref.ssim_4x4x2_core ||
More information about the x264-devel
mailing list