[x264-devel] commit: memcpy_aligned_sse2 (Jason Garrett-Glaser)
git version control
git at videolan.org
Mon Mar 17 09:06:36 CET 2008
x264 | branch: master | Jason Garrett-Glaser <darkshikari at gmail.com> | Sun Mar 16 23:58:04 2008 -0600 | [91991ba67aa9a7256b4bdf8d1d9be183ec2daa2b]
memcpy_aligned_sse2
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=91991ba67aa9a7256b4bdf8d1d9be183ec2daa2b
---
common/cabac.h | 5 ++-
common/macroblock.c | 2 +-
common/mc.c | 1 +
common/mc.h | 2 +
common/x86/mc-a2.asm | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/mc-c.c | 8 ++++++-
encoder/rdo.c | 21 ++++++++++---------
7 files changed, 78 insertions(+), 14 deletions(-)
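
In short: the patch adds a memcpy_aligned hook to x264_mc_functions_t (plain memcpy by default, with MMX and SSE2 implementations), aligns the CABAC state[] array, and moves f8_bits_encoded next to it so the RD code can copy just the context part of x264_cabac_t -- everything before i_low -- via offsetof() instead of copying the whole struct. Below is a minimal sketch of that idea in plain C; the *_sketch names are hypothetical, the field and hook names mirror the patch, and this is an illustration rather than the actual encoder code.

/* Illustrative sketch only: copy just the CABAC context prefix (everything
 * before i_low) through the CPU-specific aligned-memcpy hook, then read the
 * bit estimate back out of the copy. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct
{
    uint8_t state[460];   /* DECLARE_ALIGNED( ..., 16 ) in the real struct */
    int f8_bits_encoded;  /* moved up so the size-estimation copy includes it */
    int i_low;            /* bitstream state from here on is not copied */
    /* ... */
} cabac_sketch_t;

typedef struct
{
    void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
} mc_sketch_t;

int rd_bits_sketch( mc_sketch_t *mc, const cabac_sketch_t *cb, int i_lambda2 )
{
    cabac_sketch_t tmp;
    /* Only state[] and f8_bits_encoded are needed for size estimation, so
     * copy offsetof(...,i_low) bytes instead of sizeof(*cb). */
    mc->memcpy_aligned( &tmp, cb, offsetof(cabac_sketch_t, i_low) );
    /* ... run the CABAC size estimation on &tmp ... */
    return ( tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}

int main( void )
{
    mc_sketch_t mc = { memcpy };       /* default hook, as in x264_mc_init() */
    cabac_sketch_t cb = { {0}, 0, 0 };
    return rd_bits_sketch( &mc, &cb, 256 ) == 0 ? 0 : 1;
}

With f8_bits_encoded relocated above i_low, the explicit cabac_tmp.f8_bits_encoded = 0 assignments in rdo.c become unnecessary, which is why the hunks below drop them.
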
diff --git a/common/cabac.h b/common/cabac.h
index affd254..f829162 100644
--- a/common/cabac.h
+++ b/common/cabac.h
@@ -27,7 +27,9 @@
typedef struct
{
/* context */
- uint8_t state[460];
+ DECLARE_ALIGNED( uint8_t, state[460], 16 );
+
+ int f8_bits_encoded; // only if using x264_cabac_size_decision()
/* state */
int i_low;
@@ -36,7 +38,6 @@ typedef struct
/* bit stream */
int i_queue;
int i_bytes_outstanding;
- int f8_bits_encoded; // only if using x264_cabac_size_decision()
uint8_t *p_start;
uint8_t *p;
diff --git a/common/macroblock.c b/common/macroblock.c
index e0307e1..ead6a7c 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -502,7 +502,7 @@ int x264_mb_predict_mv_direct16x16( x264_t *h, int *b_changed )
for( l = 0; l < 2; l++ )
for( i = 0; i < 4; i++ )
h->mb.cache.direct_ref[l][i] = h->mb.cache.ref[l][x264_scan8[i*4]];
- memcpy(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
+ h->mc.memcpy_aligned(h->mb.cache.direct_mv, h->mb.cache.mv, sizeof(h->mb.cache.mv));
}
return b_available;
diff --git a/common/mc.c b/common/mc.c
index 488c541..f753204 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -372,6 +372,7 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = prefetch_fenc_null;
pf->prefetch_ref = prefetch_ref_null;
+ pf->memcpy_aligned = memcpy;
#ifdef HAVE_MMX
x264_mc_init_mmx( cpu, pf );
diff --git a/common/mc.h b/common/mc.h
index 1a2f64f..b2f5ed2 100644
--- a/common/mc.h
+++ b/common/mc.h
@@ -65,6 +65,8 @@ typedef struct
uint8_t *pix_uv, int stride_uv, int mb_x );
/* prefetch the next few macroblocks of a hpel reference frame */
void (*prefetch_ref)( uint8_t *pix, int stride, int parity );
+
+ void *(*memcpy_aligned)( void *dst, const void *src, size_t n );
} x264_mc_functions_t;
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index a55859b..bf3090e 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -336,3 +336,56 @@ cglobal x264_plane_copy_mmxext, 6,7
emms
RET
+;-----------------------------------------------------------------------------
+; void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+cglobal x264_memcpy_aligned_mmx, 3,3
+ test r2d, 16
+ jz .copy32
+ sub r2d, 16
+ movq mm0, [r1 + r2 + 0]
+ movq mm1, [r1 + r2 + 8]
+ movq [r0 + r2 + 0], mm0
+ movq [r0 + r2 + 8], mm1
+.copy32:
+ sub r2d, 32
+ movq mm0, [r1 + r2 + 0]
+ movq mm1, [r1 + r2 + 8]
+ movq mm2, [r1 + r2 + 16]
+ movq mm3, [r1 + r2 + 24]
+ movq [r0 + r2 + 0], mm0
+ movq [r0 + r2 + 8], mm1
+ movq [r0 + r2 + 16], mm2
+ movq [r0 + r2 + 24], mm3
+ jg .copy32
+ REP_RET
+
+;-----------------------------------------------------------------------------
+; void *x264_memcpy_aligned_sse2( void *dst, const void *src, size_t n );
+;-----------------------------------------------------------------------------
+cglobal x264_memcpy_aligned_sse2, 3,3
+ test r2d, 16
+ jz .copy32
+ sub r2d, 16
+ movdqa xmm0, [r1 + r2]
+ movdqa [r0 + r2], xmm0
+.copy32:
+ test r2d, 32
+ jz .copy64
+ sub r2d, 32
+ movdqa xmm0, [r1 + r2 + 0]
+ movdqa xmm1, [r1 + r2 + 16]
+ movdqa [r0 + r2 + 0], xmm0
+ movdqa [r0 + r2 + 16], xmm1
+.copy64:
+ sub r2d, 64
+ movdqa xmm0, [r1 + r2 + 0]
+ movdqa xmm1, [r1 + r2 + 16]
+ movdqa xmm2, [r1 + r2 + 32]
+ movdqa xmm3, [r1 + r2 + 48]
+ movdqa [r0 + r2 + 0], xmm0
+ movdqa [r0 + r2 + 16], xmm1
+ movdqa [r0 + r2 + 32], xmm2
+ movdqa [r0 + r2 + 48], xmm3
+ jg .copy64
+ REP_RET
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 5ab5909..ec343de 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -56,6 +56,8 @@ extern void x264_mc_chroma_mmxext( uint8_t *src, int i_src_stride,
extern void x264_plane_copy_mmxext( uint8_t *, int, uint8_t *, int, int w, int h);
extern void x264_hpel_filter_mmxext( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
int i_stride, int i_width, int i_height );
+extern void *x264_memcpy_aligned_mmx( void * dst, const void * src, size_t n );
+extern void *x264_memcpy_aligned_sse2( void * dst, const void * src, size_t n );
#define AVG_WEIGHT(W,H) \
void x264_pixel_avg_weight_ ## W ## x ## H ## _mmxext( uint8_t *dst, int i_dst, uint8_t *src, int i_src, int i_weight_dst ) \
@@ -144,6 +146,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->copy[PIXEL_16x16] = x264_mc_copy_w16_mmx;
pf->copy[PIXEL_8x8] = x264_mc_copy_w8_mmx;
pf->copy[PIXEL_4x4] = x264_mc_copy_w4_mmx;
+ pf->memcpy_aligned = x264_memcpy_aligned_mmx;
if( !(cpu&X264_CPU_MMXEXT) )
return;
@@ -175,5 +178,8 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->prefetch_fenc = x264_prefetch_fenc_mmxext;
pf->prefetch_ref = x264_prefetch_ref_mmxext;
- /* todo: use sse2 */
+ if( !(cpu&X264_CPU_SSE2) )
+ return;
+
+ pf->memcpy_aligned = x264_memcpy_aligned_sse2;
}
diff --git a/encoder/rdo.c b/encoder/rdo.c
index e4cb45a..e5c4464 100644
--- a/encoder/rdo.c
+++ b/encoder/rdo.c
@@ -82,8 +82,8 @@ static int x264_rd_cost_mb( x264_t *h, int i_lambda2 )
}
else if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_macroblock_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -124,8 +124,8 @@ int x264_rd_cost_part( x264_t *h, int i_lambda2, int i8, int i_pixel )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_size_cabac( h, &cabac_tmp, i8, i_pixel );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -146,8 +146,8 @@ int x264_rd_cost_i8x8( x264_t *h, int i_lambda2, int i8, int i_mode )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_partition_i8x8_size_cabac( h, &cabac_tmp, i8, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -168,8 +168,9 @@ int x264_rd_cost_i4x4( x264_t *h, int i_lambda2, int i4, int i_mode )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
+
x264_partition_i4x4_size_cabac( h, &cabac_tmp, i4, i_mode );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
@@ -194,8 +195,8 @@ int x264_rd_cost_i8x8_chroma( x264_t *h, int i_lambda2, int i_mode, int b_dct )
if( h->param.b_cabac )
{
- x264_cabac_t cabac_tmp = h->cabac;
- cabac_tmp.f8_bits_encoded = 0;
+ x264_cabac_t cabac_tmp;
+ h->mc.memcpy_aligned( &cabac_tmp, &h->cabac, offsetof(x264_cabac_t,i_low) );
x264_i8x8_chroma_size_cabac( h, &cabac_tmp );
i_bits = ( cabac_tmp.f8_bits_encoded * i_lambda2 + 128 ) >> 8;
}
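
For reference, here is a rough C-intrinsics rendering of the SSE2 routine above (an illustration written for this post, not code from the patch): like the asm, it peels a 16-byte and then a 32-byte chunk off the end so the remaining length is a multiple of 64, then copies 64 bytes per iteration with aligned 128-bit loads and stores, walking downward from the end of the buffer. It assumes both pointers are 16-byte aligned and n is a multiple of 16, as the movdqa accesses require.

/* Hypothetical illustration; memcpy_aligned_sse2_sketch is not in the patch. */
#include <emmintrin.h>
#include <stddef.h>

void *memcpy_aligned_sse2_sketch( void *dst, const void *src, size_t n )
{
    unsigned char       *d = (unsigned char *)dst;
    const unsigned char *s = (const unsigned char *)src;

    if( n & 16 )  /* odd 16-byte chunk: copy the last 16 bytes */
    {
        n -= 16;
        _mm_store_si128( (__m128i*)(d+n), _mm_load_si128( (const __m128i*)(s+n) ) );
    }
    if( n & 32 )  /* odd 32-byte chunk: copy the last 32 bytes */
    {
        n -= 32;
        _mm_store_si128( (__m128i*)(d+n),    _mm_load_si128( (const __m128i*)(s+n) ) );
        _mm_store_si128( (__m128i*)(d+n+16), _mm_load_si128( (const __m128i*)(s+n+16) ) );
    }
    while( n )    /* main loop: 64 bytes per iteration, from the end downward */
    {
        n -= 64;
        _mm_store_si128( (__m128i*)(d+n),    _mm_load_si128( (const __m128i*)(s+n) ) );
        _mm_store_si128( (__m128i*)(d+n+16), _mm_load_si128( (const __m128i*)(s+n+16) ) );
        _mm_store_si128( (__m128i*)(d+n+32), _mm_load_si128( (const __m128i*)(s+n+32) ) );
        _mm_store_si128( (__m128i*)(d+n+48), _mm_load_si128( (const __m128i*)(s+n+48) ) );
    }
    return dst;
}

Copying from the end toward the start mirrors the asm, which reuses the shrinking byte count in r2 as both the loop counter and the load/store offset; the sketch keeps that order so the two versions read side by side.
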