[x264-devel] commit: MMX/SSE2 versions of high bit depth store_interleave ( Daniel Alexandru Morie )
git at videolan.org
git at videolan.org
Wed Dec 15 04:19:34 CET 2010
x264 | branch: master | Daniel Alexandru Morie <andu.qq at gmail.com> | Tue Dec 7 06:11:02 2010 -0800| [6d9310e0825f756fd53acaa77ea1aca3c7e927fc] | committer: Jason Garrett-Glaser
MMX/SSE2 versions of high bit depth store_interleave
Patch from Google Code-In.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6d9310e0825f756fd53acaa77ea1aca3c7e927fc
---
common/x86/mc-a2.asm | 58 +++++++++++++++++++++++++++++++++++++++++++++----
common/x86/mc-c.c | 6 +++-
2 files changed, 57 insertions(+), 7 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 7eafb5d..587d3ef 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -880,11 +880,58 @@ cglobal plane_copy_core_mmxext, 6,7
emms
RET
+%ifdef HIGH_BIT_DEPTH
+
+%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint -- interleaves 16 bytes from srcu with 16 bytes from srcv and stores 32 bytes at dst
+%if mmsize==16 ; 16-byte registers: one load per source, two 16-byte stores
+ mov%4 m0, [%2] ; %4 is spliced into the load mnemonic (the caller visible here passes "a")
+ mov%4 m1, [%3]
+ SBUTTERFLY wd, 0, 1, 2 ; NOTE(review): presumably word-interleaves m0/m1 with m2 as scratch -- SBUTTERFLY is defined outside this diff, confirm
+ mov%5a [%1+ 0], m0 ; %5 is optional; when omitted this is a plain "mova" store
+ mov%5a [%1+16], m1
+%else ; any other register size: two passes of 8-byte loads/stores; %4 is not used on this path
+ movq m0, [%2+0] ; first 8 bytes of each source
+ movq m1, [%3+0]
+ SBUTTERFLY wd, 0, 1, 2
+ mov%5q [%1+ 0], m0 ; %5 is spliced in here too; when omitted this is a plain "movq" store
+ mov%5q [%1+ 8], m1
+ movq m0, [%2+8] ; second 8 bytes of each source
+ movq m1, [%3+8]
+ SBUTTERFLY wd, 0, 1, 2
+ mov%5q [%1+16], m0
+ mov%5q [%1+24], m1
+%endif
+%endmacro
+
+%macro PLANE_INTERLEAVE 1 ; %1 = cpu suffix appended to the function name
+;-----------------------------------------------------------------------------
+; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
+;-----------------------------------------------------------------------------
+cglobal store_interleave_8x8x2_%1, 4,5
+ mov r4d, 8 ; row counter: the block is 8 rows tall and each iteration stores exactly one row (was 16, which overran both sources and dst by 8 rows)
+ FIX_STRIDES r1 ; NOTE(review): presumably scales the dst stride from pixels to bytes -- defined outside this diff, confirm
+.loop:
+ INTERLEAVE r0, r2, r3, a ; one interleaved row: 16 bytes of srcu + 16 bytes of srcv -> 32 bytes at dst
+ add r2, FDEC_STRIDEB ; next srcu row
+ add r3, FDEC_STRIDEB ; next srcv row
+ add r0, r1 ; next dst row
+ dec r4d
+ jg .loop ; loop while rows remain
+ REP_RET
+
+%endmacro ; PLANE_INTERLEAVE
+
+INIT_MMX ; NOTE(review): presumably selects the 8-byte mm register set, i.e. the %else path of INTERLEAVE -- confirm in x86inc
+PLANE_INTERLEAVE mmxext ; defines store_interleave_8x8x2_mmxext
+INIT_XMM ; NOTE(review): presumably sets mmsize to 16, i.e. the mmsize==16 path of INTERLEAVE -- confirm in x86inc
+PLANE_INTERLEAVE sse2 ; defines store_interleave_8x8x2_sse2
+
+%else ;!HIGH_BIT_DEPTH
%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
movq m0, [%2]
%if mmsize==16
-%if %4
+%ifidn %4, a
punpcklbw m0, [%3]
%else
movq m1, [%3]
@@ -969,8 +1016,8 @@ cglobal plane_copy_interleave_core_%1, 6,7
mov r6d, r6m
neg r6
.loopx:
- INTERLEAVE r0+r6*2, r2+r6, r4+r6, 0, nt
- INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt
+ INTERLEAVE r0+r6*2, r2+r6, r4+r6, u, nt
+ INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
add r6, 16
jl .loopx
.pad:
@@ -1001,8 +1048,8 @@ cglobal plane_copy_interleave_core_%1, 6,7
cglobal store_interleave_8x8x2_%1, 4,5
mov r4d, 4
.loop:
- INTERLEAVE r0, r2, r3, 1
- INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1
+ INTERLEAVE r0, r2, r3, a
+ INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
add r2, FDEC_STRIDE*2
add r3, FDEC_STRIDE*2
lea r0, [r0+r1*2]
@@ -1088,6 +1135,7 @@ PLANE_INTERLEAVE sse2
PLANE_DEINTERLEAVE sse2
PLANE_DEINTERLEAVE ssse3
+%endif ; HIGH_BIT_DEPTH
; These functions are not general-use; not only do the SSE ones require aligned input,
; but they also will fail if given a non-mod16 size or a size less than 64.
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 6859e3c..e95daeb 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -101,8 +101,8 @@ void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu,
void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
uint8_t *dstv, int i_dstv,
uint8_t *src, int i_src, int w, int h );
-void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
-void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
+void x264_store_interleave_8x8x2_mmxext( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
+void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
void x264_load_deinterleave_8x8x2_fenc_mmx( uint8_t *dst, uint8_t *src, int i_src );
void x264_load_deinterleave_8x8x2_fenc_sse2( uint8_t *dst, uint8_t *src, int i_src );
void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
@@ -448,6 +448,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->memzero_aligned = x264_memzero_aligned_mmx;
pf->integral_init4v = x264_integral_init4v_mmx;
pf->integral_init8v = x264_integral_init8v_mmx;
+ pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
if( !(cpu&X264_CPU_MMXEXT) )
return;
@@ -474,6 +475,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init4v = x264_integral_init4v_sse2;
pf->integral_init8v = x264_integral_init8v_sse2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
+ pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
if( cpu&X264_CPU_SSE2_IS_SLOW )
return;
More information about the x264-devel
mailing list