[x264-devel] commit: MMX/SSE2 versions of high bit depth store_interleave ( Daniel Alexandru Morie )

Wed Dec 15 04:19:34 CET 2010

x264 | branch: master | Daniel Alexandru Morie <andu.qq at gmail.com> | Tue Dec  7 06:11:02 2010 -0800| [6d9310e0825f756fd53acaa77ea1aca3c7e927fc] | committer: Jason Garrett-Glaser 

MMX/SSE2 versions of high bit depth store_interleave

Patch from Google Code-In.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=6d9310e0825f756fd53acaa77ea1aca3c7e927fc
---

 common/x86/mc-a2.asm |   58 +++++++++++++++++++++++++++++++++++++++++++++----
 common/x86/mc-c.c    |    6 +++-
 2 files changed, 57 insertions(+), 7 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 7eafb5d..587d3ef 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -880,11 +880,58 @@ cglobal plane_copy_core_mmxext, 6,7
     emms
     RET
 
+%ifdef HIGH_BIT_DEPTH
+
+%macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned (a/u, pasted onto the load mnemonic), nt_hint (optional, pasted onto the store mnemonic)
+%if mmsize==16
+    mov%4       m0, [%2]    ; one register-wide load from srcu (mova or movu, chosen by %4)
+    mov%4       m1, [%3]    ; matching load from srcv
+    SBUTTERFLY  wd, 0, 1, 2 ; word interleave of m0/m1 (macro defined elsewhere; presumably m2 is scratch and m0/m1 receive the low/high halves)
+    mov%5a [%1+ 0], m0      ; aligned store of the first 16 interleaved bytes; %5 may turn this into a non-temporal store
+    mov%5a [%1+16], m1      ; aligned store of the following 16 interleaved bytes
+%else
+    movq        m0, [%2+0]  ; 8-byte registers: the row is handled as two halves; first 8 bytes of srcu
+    movq        m1, [%3+0]  ; first 8 bytes of srcv
+    SBUTTERFLY  wd, 0, 1, 2 ; word interleave of the first halves
+    mov%5q [%1+ 0], m0      ; dst bytes 0..7
+    mov%5q [%1+ 8], m1      ; dst bytes 8..15
+    movq        m0, [%2+8]  ; second 8 bytes of srcu
+    movq        m1, [%3+8]  ; second 8 bytes of srcv
+    SBUTTERFLY  wd, 0, 1, 2 ; word interleave of the second halves
+    mov%5q [%1+16], m0      ; dst bytes 16..23
+    mov%5q [%1+24], m1      ; dst bytes 24..31
+%endif
+%endmacro
+
+%macro PLANE_INTERLEAVE 1 ; %1 = cpu suffix appended to the generated function name
+;-----------------------------------------------------------------------------
+; void store_interleave_8x8x2( uint16_t *dst, int i_dst, uint16_t *srcu, uint16_t *srcv )
+;-----------------------------------------------------------------------------
+cglobal store_interleave_8x8x2_%1, 4,5 ; r0=dst, r1=i_dst, r2=srcu, r3=srcv, r4=row counter
+    mov    r4d, 8           ; 8 rows: each INTERLEAVE below writes one complete 8-pixel row (16 here ran 8 rows past the 8x8 block)
+    FIX_STRIDES r1          ; macro defined elsewhere; presumably scales i_dst from pixels to bytes
+.loop:
+    INTERLEAVE r0, r2, r3, a ; aligned loads from srcu/srcv, regular (non-nt) aligned stores to dst
+    add    r2, FDEC_STRIDEB ; next srcu row (FDEC_STRIDEB is defined elsewhere; presumably the byte stride of the source buffer)
+    add    r3, FDEC_STRIDEB ; next srcv row
+    add    r0, r1           ; next dst row
+    dec    r4d
+    jg .loop                ; repeat while rows remain
+    REP_RET
+
+%endmacro ; PLANE_INTERLEAVE
+
+INIT_MMX                   ; defined elsewhere; presumably selects 8-byte mm registers so INTERLEAVE takes its %else (movq) path
+PLANE_INTERLEAVE mmxext    ; emits store_interleave_8x8x2_mmxext
+INIT_XMM                   ; defined elsewhere; presumably sets mmsize to 16 so INTERLEAVE takes its mmsize==16 path
+PLANE_INTERLEAVE sse2      ; emits store_interleave_8x8x2_sse2
+
+%else ;!HIGH_BIT_DEPTH
 
 %macro INTERLEAVE 4-5 ; dst, srcu, srcv, is_aligned, nt_hint
     movq   m0, [%2]
 %if mmsize==16
-%if %4
+%ifidn %4, a
     punpcklbw m0, [%3]
 %else
     movq   m1, [%3]
@@ -969,8 +1016,8 @@ cglobal plane_copy_interleave_core_%1, 6,7
     mov    r6d, r6m
     neg    r6
 .loopx:
-    INTERLEAVE r0+r6*2,    r2+r6,   r4+r6,   0, nt
-    INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, 0, nt
+    INTERLEAVE r0+r6*2,    r2+r6,   r4+r6,   u, nt
+    INTERLEAVE r0+r6*2+16, r2+r6+8, r4+r6+8, u, nt
     add    r6, 16
     jl .loopx
 .pad:
@@ -1001,8 +1048,8 @@ cglobal plane_copy_interleave_core_%1, 6,7
 cglobal store_interleave_8x8x2_%1, 4,5
     mov    r4d, 4
 .loop:
-    INTERLEAVE r0, r2, r3, 1
-    INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, 1
+    INTERLEAVE r0, r2, r3, a
+    INTERLEAVE r0+r1, r2+FDEC_STRIDE, r3+FDEC_STRIDE, a
     add    r2, FDEC_STRIDE*2
     add    r3, FDEC_STRIDE*2
     lea    r0, [r0+r1*2]
@@ -1088,6 +1135,7 @@ PLANE_INTERLEAVE sse2
 PLANE_DEINTERLEAVE sse2
 PLANE_DEINTERLEAVE ssse3
 
+%endif ; HIGH_BIT_DEPTH
 
 ; These functions are not general-use; not only do the SSE ones require aligned input,
 ; but they also will fail if given a non-mod16 size or a size less than 64.
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 6859e3c..e95daeb 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -101,8 +101,8 @@ void x264_plane_copy_deinterleave_sse2( uint8_t *dstu, int i_dstu,
 void x264_plane_copy_deinterleave_ssse3( uint8_t *dstu, int i_dstu,
                                          uint8_t *dstv, int i_dstv,
                                          uint8_t *src, int i_src, int w, int h );
-void x264_store_interleave_8x8x2_mmxext( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
-void x264_store_interleave_8x8x2_sse2( uint8_t *dst, int i_dst, uint8_t *srcu, uint8_t *srcv );
+void x264_store_interleave_8x8x2_mmxext( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
+void x264_store_interleave_8x8x2_sse2( pixel *dst, int i_dst, pixel *srcu, pixel *srcv );
 void x264_load_deinterleave_8x8x2_fenc_mmx( uint8_t *dst, uint8_t *src, int i_src );
 void x264_load_deinterleave_8x8x2_fenc_sse2( uint8_t *dst, uint8_t *src, int i_src );
 void x264_load_deinterleave_8x8x2_fenc_ssse3( uint8_t *dst, uint8_t *src, int i_src );
@@ -448,6 +448,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->memzero_aligned = x264_memzero_aligned_mmx;
     pf->integral_init4v = x264_integral_init4v_mmx;
     pf->integral_init8v = x264_integral_init8v_mmx;
+    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
 
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
@@ -474,6 +475,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->integral_init4v = x264_integral_init4v_sse2;
     pf->integral_init8v = x264_integral_init8v_sse2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_sse2;
+    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_sse2;
 
     if( cpu&X264_CPU_SSE2_IS_SLOW )
         return;