[x264-devel] commit: SSE4 and SSSE3 versions of some intra_sad functions ( Jason Garrett-Glaser )
git at videolan.org
git at videolan.org
Thu Jul 15 04:08:45 CEST 2010
x264 | branch: stable | Jason Garrett-Glaser <darkshikari at gmail.com> | Fri Jun 18 13:58:11 2010 -0700| [ef0c268a2b1a42ff234ef27a535c16b825452ae0] | committer: Jason Garrett-Glaser
SSE4 and SSSE3 versions of some intra_sad functions
Primarily Nehalem-optimized.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=ef0c268a2b1a42ff234ef27a535c16b825452ae0
---
common/pixel.c | 3 +
common/x86/pixel.h | 2 +
common/x86/sad-a.asm | 107 ++++++++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 112 insertions(+), 0 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index a8cb1df..8441c7a 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -856,6 +856,9 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
}
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
+ /* Slower on Conroe, so only enable under SSE4 */
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
#endif //HAVE_MMX
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 9bba683..b1b916d 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -80,6 +80,7 @@ DECL_PIXELS( uint64_t, hadamard_ac, sse4, ( uint8_t *pix, int i_stride ))
void x264_intra_satd_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_4x4_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_4x4_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_4x4_sse4 ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_satd_x3_8x8c_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8c_mmxext ( uint8_t *, uint8_t *, int * );
@@ -93,6 +94,7 @@ void x264_intra_sa8d_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_sse2 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sad_x3_8x8_mmxext ( uint8_t *, uint8_t *, int * );
+void x264_intra_sad_x3_8x8_ssse3 ( uint8_t *, uint8_t *, int * );
void x264_intra_sa8d_x3_8x8_core_mmxext( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_sse2 ( uint8_t *, int16_t [2][8], int * );
void x264_intra_sa8d_x3_8x8_core_ssse3 ( uint8_t *, int16_t [2][8], int * );
diff --git a/common/x86/sad-a.asm b/common/x86/sad-a.asm
index 72c1789..10a365c 100644
--- a/common/x86/sad-a.asm
+++ b/common/x86/sad-a.asm
@@ -26,6 +26,19 @@
%include "x86inc.asm"
%include "x86util.asm"
+SECTION_RODATA
+
+h4x4_pred_shuf: db 3,3,3,3,7,7,7,7,11,11,11,11,15,15,15,15 ; pshufb control: replicate byte 3 of each dword across that dword
+h4x4_pred_shuf2: db 3,7,11,15,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1 ; pshufb control: gather bytes 3,7,11,15 into the low dword; -1 lanes become zero
+h8x8_pred_shuf: times 8 db 1 ; pshufb controls read as 16-byte entries at offsets 0,16,32,48; entry 0 low half selects source byte 1
+ times 8 db 0 ; entry 0 high half: source byte 0
+ times 8 db 3 ; entry 1 low half: source byte 3
+ times 8 db 2 ; entry 1 high half: source byte 2
+ times 8 db 5 ; entry 2 low half: source byte 5
+ times 8 db 4 ; entry 2 high half: source byte 4
+ times 8 db 7 ; entry 3 low half: source byte 7
+ times 8 db 6 ; entry 3 high half: source byte 6
+
SECTION .text
cextern pb_3
@@ -303,6 +316,40 @@ cglobal intra_sad_x3_4x4_mmxext, 3,3
movd [r2+4], mm1 ;H prediction cost
RET
+cglobal intra_sad_x3_4x4_sse4, 3,3 ; SADs of a 4x4 block against its V, H and DC predictions; r0 = source rows (FENC_STRIDE), r1 = block whose neighbors are read (FDEC_STRIDE), r2 = int[3] output written as V, H, DC
+ movd xmm4, [r1+FDEC_STRIDE*0-4] ; row 0: the 4 bytes ending at the pixel just left of the block
+ pinsrd xmm4, [r1+FDEC_STRIDE*1-4], 1 ; row 1, same 4-byte window
+ pinsrd xmm4, [r1+FDEC_STRIDE*2-4], 2 ; row 2
+ pinsrd xmm4, [r1+FDEC_STRIDE*3-4], 3 ; row 3
+ movd xmm2, [r1-FDEC_STRIDE] ; ABCD: the 4 pixels of the row above the block
+ pxor xmm3, xmm3 ; zero, used as operand for psadbw/pavgw/pshufb below
+ movdqa xmm5, xmm4 ; second copy of the left windows for the H prediction
+ pshufb xmm4, [h4x4_pred_shuf2] ; EFGH: last byte of each window packed into the low dword, rest zeroed
+ pshufb xmm5, [h4x4_pred_shuf] ; EEEEFFFFGGGGHHHH: H prediction, one left pixel replicated per row
+ pshufd xmm0, xmm2, 0 ; ABCDABCDABCDABCD: V prediction, top row replicated for every row
+ punpckldq xmm2, xmm4 ; ABCDEFGH: all 8 neighbor pixels in the low qword
+ psadbw xmm2, xmm3 ; SAD against zero = sum of the 8 neighbor pixels
+ movd xmm1, [r0+FENC_STRIDE*0] ; source row 0
+ pinsrd xmm1, [r0+FENC_STRIDE*1], 1 ; source row 1
+ pinsrd xmm1, [r0+FENC_STRIDE*2], 2 ; source row 2
+ pinsrd xmm1, [r0+FENC_STRIDE*3], 3 ; source row 3: xmm1 now holds the whole 4x4 source block
+ psadbw xmm0, xmm1 ; V SAD, as one partial sum per qword
+ psadbw xmm5, xmm1 ; H SAD, as one partial sum per qword
+ psraw xmm2, 2 ; sum >> 2
+ pavgw xmm2, xmm3 ; average with zero rounds up: ((sum >> 2) + 1) >> 1
+ pshufb xmm2, xmm3 ; DC prediction: all-zero control replicates byte 0 into every lane
+ movdqa xmm3, xmm0 ; keep a copy of the V partial sums for the high-half unpack
+ punpcklqdq xmm0, xmm5 ; low partial sums: V in qword 0, H in qword 1
+ punpckhqdq xmm3, xmm5 ; high partial sums: V in qword 0, H in qword 1
+ psadbw xmm2, xmm1 ; DC SAD, as one partial sum per qword
+ paddw xmm0, xmm3 ; V total in qword 0, H total in qword 1
+ movhlps xmm4, xmm2 ; move the high DC partial sum down to the low qword
+ packusdw xmm0, xmm0 ; narrow dwords to words so V and H occupy adjacent 32-bit slots
+ paddw xmm2, xmm4 ; DC total in the low word
+ movq [r2], xmm0 ; V/H prediction costs (offsets 0 and 4)
+ movd [r2+8], xmm2 ; DC prediction cost
+ RET
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8( uint8_t *fenc, uint8_t edge[33], int res[3]);
;-----------------------------------------------------------------------------
@@ -370,6 +417,66 @@ cglobal intra_sad_x3_8x8_mmxext, 3,3
movd [r2+8], m1
RET
+INIT_XMM
+cglobal intra_sad_x3_8x8_ssse3, 3,4,9 ; SADs of an 8x8 block against its V, H and DC predictions; r0 = source rows (FENC_STRIDE), r1 = edge pixel array, r2 = int[3] output written as V, H, DC
+%ifdef PIC
+ lea r11, [h8x8_pred_shuf] ; table base goes in a register so the loop can index it with r3
+%define shuf r11
+%else
+%define shuf h8x8_pred_shuf
+%endif
+ movq m0, [r1+7] ; left pixels: 8 bytes at offset 7 of the edge array
+ movq m1, [r1+16] ; top pixels: 8 bytes at offset 16 of the edge array
+ pxor m2, m2 ; zero, so the psadbw below yields a plain byte sum
+ pxor m3, m3 ; zero, same purpose
+ psadbw m2, m0 ; sum of the 8 left pixels
+ psadbw m3, m1 ; sum of the 8 top pixels
+ paddw m2, m3 ; sum of all 16 neighbor pixels
+ pxor m3, m3 ; V score accumulator (also the zero operand for pavgw/pshufb below)
+ psraw m2, 3 ; sum >> 3
+ pavgw m2, m3 ; average with zero rounds up: ((sum >> 3) + 1) >> 1
+ punpcklqdq m1, m1 ; V prediction: top row duplicated into both halves (two rows per register)
+ pshufb m2, m3 ; DC prediction: all-zero control replicates byte 0 into every lane
+ pxor m4, m4 ; H score accumulator
+ pxor m5, m5 ; DC score accumulator
+ mov r3d, 6 ; shuffle-table index, stepped 6,4,2,0: four iterations of two rows each
+.loop:
+ movq m6, [r0+FENC_STRIDE*0] ; first source row of the pair into the low half
+ movhps m6, [r0+FENC_STRIDE*1] ; second source row of the pair into the high half
+ movdqa m7, m0 ; fresh copy of the left pixels to shuffle
+ pshufb m7, [shuf+r3*8] ; H prediction: low half = left byte r3+1 replicated, high half = left byte r3 replicated
+%ifdef ARCH_X86_64
+ movdqa m8, m1 ; this branch uses a ninth register as the temporary for the V prediction
+ psadbw m7, m6 ; H SAD for the two rows
+ psadbw m8, m6 ; V SAD for the two rows
+ psadbw m6, m2 ; DC SAD for the two rows (overwrites the source rows)
+ paddw m4, m7 ; accumulate H
+ paddw m3, m8 ; accumulate V
+ paddw m5, m6 ; accumulate DC
+%else
+ psadbw m7, m6 ; H SAD for the two rows
+ paddw m4, m7 ; accumulate H now so m7 can be reused as the V temporary
+ movdqa m7, m1 ; this branch reuses m7 instead of a ninth register
+ psadbw m7, m6 ; V SAD for the two rows
+ psadbw m6, m2 ; DC SAD for the two rows (overwrites the source rows)
+ paddw m3, m7 ; accumulate V
+ paddw m5, m6 ; accumulate DC
+%endif
+ add r0, FENC_STRIDE*2 ; advance the source pointer by two rows
+ sub r3d, 2 ; next table entry
+ jge .loop ; continue while the index is still >= 0
+
+ movhlps m0, m3 ; high-qword partial sum of V
+ movhlps m1, m4 ; high-qword partial sum of H
+ movhlps m2, m5 ; high-qword partial sum of DC
+ paddw m3, m0 ; V total in the low word
+ paddw m4, m1 ; H total in the low word
+ paddw m5, m2 ; DC total in the low word
+ movd [r2+0], m3 ; V prediction cost
+ movd [r2+4], m4 ; H prediction cost
+ movd [r2+8], m5 ; DC prediction cost
+ RET
+
;-----------------------------------------------------------------------------
; void intra_sad_x3_8x8c( uint8_t *fenc, uint8_t *fdec, int res[3] );
;-----------------------------------------------------------------------------
More information about the x264-devel mailing list