[x264-devel] commit: SSE2 high bit depth zigzag_interleave_cavlc (Daniel Kang)
git at videolan.org
git at videolan.org
Mon Jan 10 22:01:01 CET 2011
x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Thu Dec 23 12:15:03 2010 -0500| [658b6ae1b5f43b488efd0ab4f6c60b93a1366333] | committer: Jason Garrett-Glaser
SSE2 high bit depth zigzag_interleave_cavlc
Patch from Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=658b6ae1b5f43b488efd0ab4f6c60b93a1366333
---
common/dct.c | 7 +++-
common/x86/dct-a.asm | 80 ++++++++++++++++++++++++++++----------------------
common/x86/dct.h | 2 +-
3 files changed, 51 insertions(+), 38 deletions(-)
diff --git a/common/dct.c b/common/dct.c
index e7926dd..dc5e59f 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -802,12 +802,15 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf, int b_interlaced )
}
pf->interleave_8x8_cavlc = zigzag_interleave_8x8_cavlc;
-#if !HIGH_BIT_DEPTH
#if HAVE_MMX
+#if HIGH_BIT_DEPTH
+ if( cpu&X264_CPU_SSE2 )
+ pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+#else
if( cpu&X264_CPU_MMX )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_mmx;
if( cpu&X264_CPU_SHUFFLE_IS_FAST )
pf->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_sse2;
+#endif // HIGH_BIT_DEPTH
#endif
-#endif // !HIGH_BIT_DEPTH
}
diff --git a/common/x86/dct-a.asm b/common/x86/dct-a.asm
index 8275627..e31b60c 100644
--- a/common/x86/dct-a.asm
+++ b/common/x86/dct-a.asm
@@ -1267,47 +1267,57 @@ ZIGZAG_SUB_4x4 ac, field
;-----------------------------------------------------------------------------
; void zigzag_interleave_8x8_cavlc( int16_t *dst, int16_t *src, uint8_t *nnz )
;-----------------------------------------------------------------------------
-
-%macro INTERLEAVE 1
- movq m0, [r1+%1*4+ 0]
- movq m1, [r1+%1*4+ 8]
- movq m2, [r1+%1*4+16]
- movq m3, [r1+%1*4+24]
- TRANSPOSE4x4W 0,1,2,3,4
- movq [r0+%1+ 0], m0
- movq [r0+%1+32], m1
- movq [r0+%1+64], m2
- movq [r0+%1+96], m3
-%if %1
+%macro INTERLEAVE 2
+ mova m0, [r1+(%1*4+ 0)*SIZEOF_PIXEL]
+ mova m1, [r1+(%1*4+ 8)*SIZEOF_PIXEL]
+ mova m2, [r1+(%1*4+16)*SIZEOF_PIXEL]
+ mova m3, [r1+(%1*4+24)*SIZEOF_PIXEL]
+ TRANSPOSE4x4%2 0,1,2,3,4
+ mova [r0+(%1+ 0)*SIZEOF_PIXEL], m0
+ mova [r0+(%1+32)*SIZEOF_PIXEL], m1
+ mova [r0+(%1+64)*SIZEOF_PIXEL], m2
+ mova [r0+(%1+96)*SIZEOF_PIXEL], m3
packsswb m0, m1
- por m6, m2
- por m7, m3
- por m5, m0
+%if %1
+ por m6, m2
+ por m7, m3
+ por m5, m0
%else
- packsswb m0, m1
- SWAP m5, m0
- SWAP m6, m2
- SWAP m7, m3
+ SWAP m5, m0
+ SWAP m6, m2
+ SWAP m7, m3
%endif
%endmacro
-INIT_MMX
-cglobal zigzag_interleave_8x8_cavlc_mmx, 3,3
- INTERLEAVE 0
- INTERLEAVE 8
- INTERLEAVE 16
- INTERLEAVE 24
- packsswb m6, m7
- packsswb m5, m6
- packsswb m5, m5
- pxor m0, m0
- pcmpeqb m5, m0
- paddb m5, [pb_1]
- movd r0d, m5
- mov [r2+0], r0w
- shr r0d, 16
- mov [r2+8], r0w
+%macro ZIGZAG_8x8_CAVLC 2
+cglobal zigzag_interleave_8x8_cavlc_%1, 3,3,8*(mmsize/16)
+ INTERLEAVE 0, %2
+ INTERLEAVE 8, %2
+ INTERLEAVE 16, %2
+ INTERLEAVE 24, %2
+ packsswb m6, m7
+ packsswb m5, m6
+ packsswb m5, m5
+ pxor m0, m0
+%ifdef HIGH_BIT_DEPTH
+ packsswb m5, m5
+%endif
+ pcmpeqb m5, m0
+ paddb m5, [pb_1]
+ movd r0d, m5
+ mov [r2+0], r0w
+ shr r0d, 16
+ mov [r2+8], r0w
RET
+%endmacro
+
+%ifdef HIGH_BIT_DEPTH
+INIT_XMM
+ZIGZAG_8x8_CAVLC sse2, D
+%else
+INIT_MMX
+ZIGZAG_8x8_CAVLC mmx , W
+%endif
%macro INTERLEAVE_XMM 1
mova m0, [r1+%1*4+ 0]
diff --git a/common/x86/dct.h b/common/x86/dct.h
index 54a6e44..fb2a607 100644
--- a/common/x86/dct.h
+++ b/common/x86/dct.h
@@ -86,6 +86,6 @@ int x264_zigzag_sub_4x4ac_frame_ssse3( int16_t level[16], const uint8_t *src, u
int x264_zigzag_sub_4x4_field_ssse3 ( int16_t level[16], const uint8_t *src, uint8_t *dst );
int x264_zigzag_sub_4x4ac_field_ssse3( int16_t level[16], const uint8_t *src, uint8_t *dst, int16_t *dc );
void x264_zigzag_interleave_8x8_cavlc_mmx( int16_t *dst, int16_t *src, uint8_t *nnz );
-void x264_zigzag_interleave_8x8_cavlc_sse2( int16_t *dst, int16_t *src, uint8_t *nnz );
+void x264_zigzag_interleave_8x8_cavlc_sse2( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
More information about the x264-devel mailing list