[x264-devel] commit: MMX version of high bit depth plane_copy (Daniel Kang)

Mon Jan 10 22:00:59 CET 2011

x264 | branch: master | Daniel Kang <daniel.d.kang at gmail.com> | Sun Dec 19 16:31:59 2010 -0500 | [8d96f7d34c736f3924842871930b602bc83c6264] | committer: Jason Garrett-Glaser

MMX version of high bit depth plane_copy
And various cosmetics.

Patch from Google Code-In

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=8d96f7d34c736f3924842871930b602bc83c6264
---

 common/frame.c       |    8 ++++----
 common/mc.c          |    2 +-
 common/mc.h          |    2 +-
 common/x86/mc-a2.asm |   46 ++++++++++++++++++++++++----------------------
 common/x86/mc-c.c    |   29 ++++++++++++-----------------
 tools/checkasm.c     |    4 ++--
 6 files changed, 44 insertions(+), 47 deletions(-)

diff --git a/common/frame.c b/common/frame.c
index faf6f3a..678e9a2 100644
--- a/common/frame.c
+++ b/common/frame.c
@@ -287,13 +287,13 @@ int x264_frame_copy_picture( x264_t *h, x264_frame_t *dst, x264_picture_t *src )
     uint8_t *pix[3];
     int stride[3];
     get_plane_ptr( h, src, &pix[0], &stride[0], 0, 0, 0 );
-    h->mc.plane_copy( dst->plane[0], dst->i_stride[0], pix[0], stride[0],
-                      h->param.i_width, h->param.i_height );
+    h->mc.plane_copy( dst->plane[0], dst->i_stride[0], (pixel*)pix[0],
+                      stride[0]/sizeof(pixel), h->param.i_width, h->param.i_height );
     if( i_csp == X264_CSP_NV12 )
     {
         get_plane_ptr( h, src, &pix[1], &stride[1], 1, 0, 1 );
-        h->mc.plane_copy( dst->plane[1], dst->i_stride[1], pix[1], stride[1],
-                          h->param.i_width, h->param.i_height>>1 );
+        h->mc.plane_copy( dst->plane[1], dst->i_stride[1], (pixel*)pix[1],
+                          stride[1]/sizeof(pixel), h->param.i_width, h->param.i_height>>1 );
     }
     else
     {
diff --git a/common/mc.c b/common/mc.c
index 96cc650..5f8c260 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -280,7 +280,7 @@ MC_COPY( 8 )
 MC_COPY( 4 )
 
 void x264_plane_copy_c( pixel *dst, int i_dst,
-                        uint8_t *src, int i_src, int w, int h )
+                        pixel *src, int i_src, int w, int h )
 {
     while( h-- )
     {
diff --git a/common/mc.h b/common/mc.h
index 3667fdf..92d0ded 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -88,7 +88,7 @@ typedef struct
     void (*load_deinterleave_8x8x2_fdec)( pixel *dst, pixel *src, int i_src );
 
     void (*plane_copy)( pixel *dst, int i_dst,
-                        uint8_t *src, int i_src, int w, int h );
+                        pixel *src, int i_src, int w, int h );
     void (*plane_copy_interleave)( pixel *dst, int i_dst,
                                    pixel *srcu, int i_srcu,
                                    pixel *srcv, int i_srcv, int w, int h );
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index 79594d9..1b75dfe 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -825,11 +825,13 @@ HPEL ssse3
 %endif ; !HIGH_BIT_DEPTH
 
 ;-----------------------------------------------------------------------------
-; void plane_copy_core( uint8_t *dst, int i_dst,
-;                       uint8_t *src, int i_src, int w, int h)
+; void plane_copy_core( pixel *dst, int i_dst,
+;                       pixel *src, int i_src, int w, int h)
 ;-----------------------------------------------------------------------------
 ; assumes i_dst and w are multiples of 16, and i_dst>w
+INIT_MMX
 cglobal plane_copy_core_mmxext, 6,7
+    FIX_STRIDES r1d, r3d, r4d
     movsxdifnidn r1, r1d
     movsxdifnidn r3, r3d
     movsxdifnidn r4, r4d
@@ -840,22 +842,22 @@ cglobal plane_copy_core_mmxext, 6,7
     sub    r6d, 63
 .loopx:
     prefetchnta [r2+256]
-    movq   mm0, [r2   ]
-    movq   mm1, [r2+ 8]
-    movntq [r0   ], mm0
-    movntq [r0+ 8], mm1
-    movq   mm2, [r2+16]
-    movq   mm3, [r2+24]
-    movntq [r0+16], mm2
-    movntq [r0+24], mm3
-    movq   mm4, [r2+32]
-    movq   mm5, [r2+40]
-    movntq [r0+32], mm4
-    movntq [r0+40], mm5
-    movq   mm6, [r2+48]
-    movq   mm7, [r2+56]
-    movntq [r0+48], mm6
-    movntq [r0+56], mm7
+    movq   m0, [r2   ]
+    movq   m1, [r2+ 8]
+    movntq [r0   ], m0
+    movntq [r0+ 8], m1
+    movq   m2, [r2+16]
+    movq   m3, [r2+24]
+    movntq [r0+16], m2
+    movntq [r0+24], m3
+    movq   m4, [r2+32]
+    movq   m5, [r2+40]
+    movntq [r0+32], m4
+    movntq [r0+40], m5
+    movq   m6, [r2+48]
+    movq   m7, [r2+56]
+    movntq [r0+48], m6
+    movntq [r0+56], m7
     add    r2,  64
     add    r0,  64
     sub    r6d, 64
@@ -864,10 +866,10 @@ cglobal plane_copy_core_mmxext, 6,7
     add    r6d, 63
     jle .end16
 .loop16:
-    movq   mm0, [r2  ]
-    movq   mm1, [r2+8]
-    movntq [r0  ], mm0
-    movntq [r0+8], mm1
+    movq   m0, [r2  ]
+    movq   m1, [r2+8]
+    movntq [r0  ], m0
+    movntq [r0+8], m1
     add    r2,  16
     add    r0,  16
     sub    r6d, 16
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 8ddcfc0..1b135cc 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -84,8 +84,8 @@ void x264_mc_copy_w16_sse3( uint8_t *, int, uint8_t *, int, int );
 void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
 void x264_prefetch_fenc_mmxext( uint8_t *, int, uint8_t *, int, int );
 void x264_prefetch_ref_mmxext( uint8_t *, int, int );
-void x264_plane_copy_core_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
-void x264_plane_copy_c( uint8_t *, int, uint8_t *, int, int w, int h );
+void x264_plane_copy_core_mmxext( pixel *, int, pixel *, int, int w, int h);
+void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
 void x264_plane_copy_interleave_core_mmxext( pixel *dst, int i_dst,
                                              pixel *srcu, int i_srcu,
                                              pixel *srcv, int i_srcv, int w, int h );
@@ -426,23 +426,24 @@ HPEL(16, sse2, sse2, sse2, sse2)
 HPEL(16, ssse3, ssse3, ssse3, ssse3)
 #endif
 HPEL(16, sse2_misalign, sse2, sse2_misalign, sse2)
+#endif // HIGH_BIT_DEPTH
 
-static void x264_plane_copy_mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int w, int h )
+static void x264_plane_copy_mmxext( pixel *dst, int i_dst, pixel *src, int i_src, int w, int h )
 {
+    int c_w = 16/sizeof(pixel) - 1;
     if( w < 256 ) { // tiny resolutions don't want non-temporal hints. dunno the exact threshold.
         x264_plane_copy_c( dst, i_dst, src, i_src, w, h );
-    } else if( !(w&15) ) {
+    } else if( !(w&c_w) ) {
         x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, w, h );
     } else if( i_src > 0 ) {
         // have to use plain memcpy on the last line (in memory order) to avoid overreading src
-        x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+15)&~15, h-1 );
-        memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w );
+        x264_plane_copy_core_mmxext( dst, i_dst, src, i_src, (w+c_w)&~c_w, h-1 );
+        memcpy( dst+i_dst*(h-1), src+i_src*(h-1), w*sizeof(pixel) );
     } else {
-        memcpy( dst, src, w );
-        x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+15)&~15, h-1 );
+        memcpy( dst, src, w*sizeof(pixel) );
+        x264_plane_copy_core_mmxext( dst+i_dst, i_dst, src+i_src, i_src, (w+c_w)&~c_w, h-1 );
     }
 }
-#endif // HIGH_BIT_DEPTH
 
 #define PLANE_INTERLEAVE(cpu) \
 static void x264_plane_copy_interleave_##cpu( pixel *dst, int i_dst,\
@@ -483,12 +484,13 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->memzero_aligned = x264_memzero_aligned_mmx;
     pf->integral_init4v = x264_integral_init4v_mmx;
     pf->integral_init8v = x264_integral_init8v_mmx;
-    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
 
     if( !(cpu&X264_CPU_MMXEXT) )
         return;
 
+    pf->plane_copy = x264_plane_copy_mmxext;
     pf->plane_copy_interleave = x264_plane_copy_interleave_mmxext;
+    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
 
     pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_mmxext;
     pf->avg[PIXEL_16x8]  = x264_pixel_avg_16x8_mmxext;
@@ -558,13 +560,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( (cpu&X264_CPU_SHUFFLE_IS_FAST) && !(cpu&X264_CPU_SLOW_ATOM) )
         pf->integral_init4v = x264_integral_init4v_ssse3;
 #else // !HIGH_BIT_DEPTH
-    pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmxext;
-    pf->load_deinterleave_8x8x2_fenc = x264_load_deinterleave_8x8x2_fenc_mmx;
-    pf->load_deinterleave_8x8x2_fdec = x264_load_deinterleave_8x8x2_fdec_mmx;
-
-    pf->plane_copy = x264_plane_copy_mmxext;
-    pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_mmx;
-
     pf->frame_init_lowres_core = x264_frame_init_lowres_core_mmxext;
 
     pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 873a972..845bd5c 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -1080,7 +1080,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             int src_stride = plane_specs[i].src_stride;
             int dst_stride = (w + 127) & ~63;
             assert( dst_stride * h <= 0x1000 );
-            uint8_t *src1 = buf1 + X264_MAX(0, -src_stride) * (h-1);
+            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
             memset( pbuf3, 0, 0x1000*sizeof(pixel) );
             memset( pbuf4, 0, 0x1000*sizeof(pixel) );
             call_c( mc_c.plane_copy, pbuf3, dst_stride, src1, src_stride, w, h );
@@ -1106,7 +1106,7 @@ static int check_mc( int cpu_ref, int cpu_new )
             int src_stride = (plane_specs[i].src_stride + 1) >> 1;
             int dst_stride = (2*w + 127) & ~63;
             assert( dst_stride * h <= 0x1000 );
-            uint8_t *src1 = buf1 + X264_MAX(0, -src_stride) * (h-1);
+            pixel *src1 = pbuf1 + X264_MAX(0, -src_stride) * (h-1);
             memset( pbuf3, 0, 0x1000*sizeof(pixel) );
             memset( pbuf4, 0, 0x1000*sizeof(pixel) );
             call_c( mc_c.plane_copy_interleave, pbuf3, dst_stride, src1, src_stride, src1+1024, src_stride+16, w, h );