[x264-devel] commit: faster lossless zigzag (Loren Merritt)
git version control
git at videolan.org
Sat Mar 22 11:33:57 CET 2008
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Sat Mar 22 02:46:31 2008 -0600| [36fe32ae368797be584657eed37350faa0e93e78]
faster lossless zigzag
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=36fe32ae368797be584657eed37350faa0e93e78
---
common/dct.c | 16 +++++++++++++++-
common/x86/dct-a.asm | 41 +++++++++++++++++++++++++++++++++++++++++
common/x86/dct.h | 1 +
3 files changed, 57 insertions(+), 1 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index 895306b..8b57055 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -542,8 +542,12 @@ static void zigzag_scan_4x4ac_field( int16_t level[15], int16_t dct[4][4] )
int oe = x+y*FENC_STRIDE;\
int od = x+y*FDEC_STRIDE;\
level[i] = p_src[oe] - p_dst[od];\
- p_dst[od] = p_src[oe];\
}
+#define COPY4x4\
+ *(uint32_t*)(p_dst+0*FDEC_STRIDE) = *(uint32_t*)(p_src+0*FENC_STRIDE);\
+ *(uint32_t*)(p_dst+1*FDEC_STRIDE) = *(uint32_t*)(p_src+1*FENC_STRIDE);\
+ *(uint32_t*)(p_dst+2*FDEC_STRIDE) = *(uint32_t*)(p_src+2*FENC_STRIDE);\
+ *(uint32_t*)(p_dst+3*FDEC_STRIDE) = *(uint32_t*)(p_src+3*FENC_STRIDE);\
static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
{
@@ -551,6 +555,7 @@ static void zigzag_sub_4x4_frame( int16_t level[16], const uint8_t *p_src, uint8
ZIG( 4,1,1) ZIG( 5,0,2) ZIG( 6,0,3) ZIG( 7,1,2)
ZIG( 8,2,1) ZIG( 9,3,0) ZIG(10,3,1) ZIG(11,2,2)
ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,2) ZIG(15,3,3)
+ COPY4x4
}
static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8_t *p_dst )
@@ -559,6 +564,7 @@ static void zigzag_sub_4x4_field( int16_t level[16], const uint8_t *p_src, uint8
ZIG( 4,3,0) ZIG( 5,1,1) ZIG( 6,2,1) ZIG( 7,3,1)
ZIG( 8,0,2) ZIG( 9,1,2) ZIG(10,2,2) ZIG(11,3,2)
ZIG(12,0,3) ZIG(13,1,3) ZIG(14,2,3) ZIG(15,3,3)
+ COPY4x4
}
static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
@@ -567,6 +573,7 @@ static void zigzag_sub_4x4ac_frame( int16_t level[15], const uint8_t *p_src, uin
ZIG( 3,1,1) ZIG( 4,0,2) ZIG( 5,0,3) ZIG( 6,1,2)
ZIG( 7,2,1) ZIG( 8,3,0) ZIG( 9,3,1) ZIG(10,2,2)
ZIG(11,1,3) ZIG(12,2,3) ZIG(13,3,2) ZIG(14,3,3)
+ COPY4x4
}
static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uint8_t *p_dst )
@@ -575,9 +582,11 @@ static void zigzag_sub_4x4ac_field( int16_t level[15], const uint8_t *p_src, uin
ZIG( 3,3,0) ZIG( 4,1,1) ZIG( 5,2,1) ZIG( 6,3,1)
ZIG( 7,0,2) ZIG( 8,1,2) ZIG( 9,2,2) ZIG(10,3,2)
ZIG(11,0,3) ZIG(12,1,3) ZIG(13,2,3) ZIG(14,3,3)
+ COPY4x4
}
#undef ZIG
+#undef COPY4x4
void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
{
@@ -609,6 +618,11 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
pf->sub_4x4 = zigzag_sub_4x4_frame;
pf->sub_4x4ac = zigzag_sub_4x4ac_frame;
+#ifdef HAVE_SSE3
+ if( cpu&X264_CPU_SSSE3 )
+ pf->sub_4x4 = x264_zigzag_sub_4x4_frame_ssse3;
+#endif
+
#ifdef ARCH_PPC
if( cpu&X264_CPU_ALTIVEC )
{
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 0c6d463..0b21f6b 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -27,6 +27,7 @@
SECTION_RODATA
pw_1: times 8 dw 1
pw_32: times 8 dw 32
+pb_zigzag4: db 0,1,4,8,5,2,3,6,9,12,13,10,7,11,14,15
SECTION .text
@@ -290,3 +291,43 @@ cglobal x264_zigzag_scan_4x4_field_mmxext, 2,3
mov [r0+12], r2d
RET
+%ifdef HAVE_SSE3
+;-----------------------------------------------------------------------------
+; void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst )
+;-----------------------------------------------------------------------------
+cglobal x264_zigzag_sub_4x4_frame_ssse3, 3,3
+ movd xmm0, [r1+0*FENC_STRIDE]
+ movd xmm1, [r1+1*FENC_STRIDE]
+ movd xmm2, [r1+2*FENC_STRIDE]
+ movd xmm3, [r1+3*FENC_STRIDE]
+ movd xmm4, [r2+0*FDEC_STRIDE]
+ movd xmm5, [r2+1*FDEC_STRIDE]
+ movd xmm6, [r2+2*FDEC_STRIDE]
+ movd xmm7, [r2+3*FDEC_STRIDE]
+ movd [r2+0*FDEC_STRIDE], xmm0
+ movd [r2+1*FDEC_STRIDE], xmm1
+ movd [r2+2*FDEC_STRIDE], xmm2
+ movd [r2+3*FDEC_STRIDE], xmm3
+ picgetgot r1
+ punpckldq xmm0, xmm1
+ punpckldq xmm2, xmm3
+ punpckldq xmm4, xmm5
+ punpckldq xmm6, xmm7
+ movlhps xmm0, xmm2
+ movlhps xmm4, xmm6
+ movdqa xmm7, [pb_zigzag4 GLOBAL]
+ pshufb xmm0, xmm7
+ pshufb xmm4, xmm7
+ pxor xmm6, xmm6
+ movdqa xmm1, xmm0
+ movdqa xmm5, xmm4
+ punpcklbw xmm0, xmm6
+ punpckhbw xmm1, xmm6
+ punpcklbw xmm4, xmm6
+ punpckhbw xmm5, xmm6
+ psubw xmm0, xmm4
+ psubw xmm1, xmm5
+ movdqa [r0], xmm0
+ movdqa [r0+16], xmm1
+ RET
+%endif
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 5b88dbe..95c9d60 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -47,5 +47,6 @@ void x264_add8x8_idct8_sse2( uint8_t *dst, int16_t dct[8][8] );
void x264_add16x16_idct8_sse2( uint8_t *dst, int16_t dct[4][8][8] );
void x264_zigzag_scan_4x4_field_mmxext( int16_t level[16], int16_t dct[4][4] );
+void x264_zigzag_sub_4x4_frame_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst );
#endif
More information about the x264-devel mailing list