[x264-devel] commit: memcpy_aligned_sse2 (Jason Garrett-Glaser)
git version control
git at videolan.org
Mon Mar 17 09:06:36 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sun Mar 16 23:58:04 2008 -0600 | [91991ba67aa9a7256b4bdf8d1d9be183ec2daa2b]
memcpy_aligned_sse2
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=91991ba67aa9a7256b4bdf8d1d9be183ec2daa2b
---
common/cabac.h | 5 ++-
common/macroblock.c | 2 +-
common/mc.c | 1 +
common/mc.h | 2 +
common/x86/mc-a2.asm | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/mc-c.c | 8 ++++++-
encoder/rdo.c | 21 ++++++++++---------
7 files changed, 78 insertions(+), 14 deletions(-)
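
In short: the patch adds a memcpy_aligned hook to x264_mc_functions_t (plain memcpy by default, with MMX and SSE2 implementations), aligns the CABAC state[] array, and moves f8_bits_encoded next to it so the RD code can copy just the context part of x264_cabac_t -- everything before i_low -- via offsetof() instead of copying the whole struct. Below is a minimal sketch of that idea in plain C; the *_sketch names are hypothetical, the field and hook names mirror the patch, and this is an illustration rather than the actual encoder code.

/* Illustrative sketch only: copy just the CABAC context prefix (everything
 * before i_low) through the CPU-specific aligned-memcpy hook, then read the
 * bit estimate back out of the copy. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct
{
    uint8_t state[460];   /* DECLARE_ALIGNED( ..., 16 ) in the real struct */
    int f8_bits_encoded;  /* moved up so the size-estimation copy includes it */
    int i_low;            /* bitstream state from here on is not copied */
    /* ... */
} cabac_sketch_t;

typedef struct
{
    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
} mc_sketch_t;

int rd_bits_sketch( mc_sketch_t *mc, const cabac_sketch_t *cb, int i_lambda2 )
{
    cabac_sketch_t tmp;
    /* Only state[] and f8_bits_encoded are needed for size estimation, so
     * copy offsetof(...,i_low) bytes instead of sizeof(*cb). */
    mc->memcpy_aligned( &tmp, cb, offsetof(cabac_sketch_t, i_low) );
    /* ... run the CABAC size estimation on &tmp ... */
    return ( tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}

int main( void )
{
    mc_sketch_t mc = { memcpy };       /* default hook, as in x264_mc_init() */
    cabac_sketch_t cb = { {0}, 0, 0 };
    return rd_bits_sketch( &mc, &cb, 256 ) == 0 ? 0 : 1;
}

With f8_bits_encoded relocated above i_low, the explicit cabac_tmp.f8_bits_encoded = 0 assignments in rdo.c become unnecessary, which is why the hunks below drop them.
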
diff --git a/common/cabac.h b/common/cabac.h
index affd254..f829162 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -27,7 +27,9 @@
typedef struct
{
/* context */
- uint8_t state[460];
+ DECLARE_ALIGNED( uint8_t, state[460], 16 );
+
+ int f8_bits_encoded; // only if using x264_cabac_size_decision()
/* state */
int i_low;
@@ -36,7 +38,6 @@ typedef struct
/* bit stream */
int i_queue;
int i_bytes_outstanding;
- int f8_bits_encoded; // only if using x264_cabac_size_decision()
uint8_t *p_start;
uint8_t *p;
diff --git a/common/macroblock.c b/common/macroblock.c
index e0307e1..ead6a7c 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -502,7 +502,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
for( l = 0; l < 2; l++ )
for( i = 0; i < 4; i++ )
h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
- memcpy(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+ h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
}
return b_available;
diff --git a/common/mc.c b/common/mc.c
index 488c541..f753204 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -372,6 +372,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
+ pf->memcpy_aligned = memcpy;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
diff --git a/common/mc.h b/common/mc.h
index 1a2f64f..b2f5ed2 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -65,6 +65,8 @@ typedef struct
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
+ void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
} x264_mc_functions_t;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index a55859b..bf3090e 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -336,3 +336,56 @@ cglobal x264_plane_copy_mmxext, 6,7
emms
RET
+;-----------------------------------------------------------------------------
+; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+cglobal x264_memcpy_aligned_mmx, 3,3
+ test r2d, 16
+ jz .copy32
+ sub r2d, 16
+ movq mm0, [r1 + r2 + 0]
+ movq mm1, [r1 + r2 + 8]
+ movq [r0 + r2 + 0], mm0
+ movq [r0 + r2 + 8], mm1
+.copy32:
+ sub r2d, 32
+ movq mm0, [r1 + r2 + 0]
+ movq mm1, [r1 + r2 + 8]
+ movq mm2, [r1 + r2 + 16]
+ movq mm3, [r1 + r2 + 24]
+ movq [r0 + r2 + 0], mm0
+ movq [r0 + r2 + 8], mm1
+ movq [r0 + r2 + 16], mm2
+ movq [r0 + r2 + 24], mm3
+ jg .copy32
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+cglobal x264_memcpy_aligned_sse2, 3,3
+ test r2d, 16
+ jz .copy32
+ sub r2d, 16
+ movdqa xmm0, [r1 + r2]
+ movdqa [r0 + r2], xmm0
+.copy32:
+ test r2d, 32
+ jz .copy64
+ sub r2d, 32
+ movdqa xmm0, [r1 + r2 + 0]
+ movdqa xmm1, [r1 + r2 + 16]
+ movdqa [r0 + r2 + 0], xmm0
+ movdqa [r0 + r2 + 16], xmm1
+.copy64:
+ sub r2d, 64
+ movdqa xmm0, [r1 + r2 + 0]
+ movdqa xmm1, [r1 + r2 + 16]
+ movdqa xmm2, [r1 + r2 + 32]
+ movdqa xmm3, [r1 + r2 + 48]
+ movdqa [r0 + r2 + 0], xmm0
+ movdqa [r0 + r2 + 16], xmm1
+ movdqa [r0 + r2 + 32], xmm2
+ movdqa [r0 + r2 + 48], xmm3
+ jg .copy64
+ REP_RET
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 5ab5909..ec343de 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -56,6 +56,8 @@ extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
+extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
@@ -144,6 +146,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
+ pf->memcpy_aligned = x264_memcpy_aligned_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
@@ -175,5 +178,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
- /* todo: use sse2 */
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+
+ pf->memcpy_aligned = x264_memcpy_aligned_sse2;
}
diff --git a/encoder/rdo.c b/encoder/rdo.c
index e4cb45a..e5c4464 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -82,8 +82,8 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
}
else if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_macroblock_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -124,8 +124,8 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -146,8 +146,8 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -168,8 +168,9 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+
x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -194,8 +195,8 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
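
For reference, here is a rough C-intrinsics rendering of the SSE2 routine above (an illustration written for this post, not code from the patch): like the asm, it peels a 16-byte and then a 32-byte chunk off the end so the remaining length is a multiple of 64, then copies 64 bytes per iteration with aligned 128-bit loads and stores, walking downward from the end of the buffer. It assumes both pointers are 16-byte aligned and n is a multiple of 16, as the movdqa accesses require.

/* Hypothetical illustration; memcpy_aligned_sse2_sketch is not in the patch. */
#include <emmintrin.h>
#include <stddef.h>

void *memcpy_aligned_sse2_sketch( void *dst, const void *src, size_t n )
{
    unsigned char       *d = (unsigned char *)dst;
    const unsigned char *s = (const unsigned char *)src;

    if( n & 16 )  /* odd 16-byte chunk: copy the last 16 bytes */
    {
        n -= 16;
        _mm_store_si128( (__m128i*)(d+n), _mm_load_si128( (const __m128i*)(s+n) ) );
    }
    if( n & 32 )  /* odd 32-byte chunk: copy the last 32 bytes */
    {
        n -= 32;
        _mm_store_si128( (__m128i*)(d+n),    _mm_load_si128( (const __m128i*)(s+n) ) );
        _mm_store_si128( (__m128i*)(d+n+16), _mm_load_si128( (const __m128i*)(s+n+16) ) );
    }
    while( n )    /* main loop: 64 bytes per iteration, from the end downward */
    {
        n -= 64;
        _mm_store_si128( (__m128i*)(d+n),    _mm_load_si128( (const __m128i*)(s+n) ) );
        _mm_store_si128( (__m128i*)(d+n+16), _mm_load_si128( (const __m128i*)(s+n+16) ) );
        _mm_store_si128( (__m128i*)(d+n+32), _mm_load_si128( (const __m128i*)(s+n+32) ) );
        _mm_store_si128( (__m128i*)(d+n+48), _mm_load_si128( (const __m128i*)(s+n+48) ) );
    }
    return dst;
}

Copying from the end toward the start mirrors the asm, which reuses the shrinking byte count in r2 as both the loop counter and the load/store offset; the sketch keeps that order so the two versions read side by side.
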