[x264-devel] Faster chroma weight cost calculation
Henrik Gramner
git at videolan.org
Wed Apr 25 00:17:10 CEST 2012
x264 | branch: master | Henrik Gramner <hengar-6 at student.ltu.se> | Sat Apr 7 00:40:09 2012 +0200| [df6252cfed7c23fbe883456f4e0607a7f8e91ad8] | committer: Jason Garrett-Glaser
Faster chroma weight cost calculation
New assembly function with SSE2, SSSE3 and XOP implementations for calculating the absolute value of a sum of differences (the signed differences are summed first, then the absolute value is taken once).
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=df6252cfed7c23fbe883456f4e0607a7f8e91ad8
---
common/pixel.c | 15 ++++++++++
common/pixel.h | 1 +
common/x86/pixel-a.asm | 67 ++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/pixel.h | 3 ++
encoder/slicetype.c | 15 +++--------
tools/checkasm.c | 15 ++++++++++
6 files changed, 105 insertions(+), 11 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 9f52c79..d160921 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -703,6 +703,15 @@ int x264_field_vsad( x264_t *h, int mb_x, int mb_y )
return (score_field < score_frame);
}
+static int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height )
+{
+ int sum = 0; /* signed running total of pix1 - pix2 over an 8-wide, height-tall block */
+ for( int y = 0; y < height; y++, pix1 += stride1, pix2 += stride2 ) /* one row per step; each source advances by its own stride */
+ for( int x = 0; x < 8; x++ )
+ sum += pix1[x] - pix2[x]; /* signs are kept, so opposite differences cancel (this is not a SAD) */
+ return abs( sum ); /* absolute value is applied once, to the total */
+}
+
/****************************************************************************
* successive elimination
****************************************************************************/
@@ -814,6 +823,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->ssim_4x4x2_core = ssim_4x4x2_core;
pixf->ssim_end4 = ssim_end4;
pixf->vsad = pixel_vsad;
+ pixf->asd8 = pixel_asd8;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4;
pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4;
@@ -888,6 +898,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _sse2 );
}
pixf->vsad = x264_pixel_vsad_sse2;
+ pixf->asd8 = x264_pixel_asd8_sse2;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_sse2;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_sse2;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_sse2;
@@ -915,6 +926,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT4( hadamard_ac, _ssse3 );
}
pixf->vsad = x264_pixel_vsad_ssse3;
+ pixf->asd8 = x264_pixel_asd8_ssse3;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
@@ -951,6 +963,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
if( cpu&X264_CPU_XOP )
{
pixf->vsad = x264_pixel_vsad_xop;
+ pixf->asd8 = x264_pixel_asd8_xop;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
@@ -1035,6 +1048,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_sse2;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_sse2;
pixf->vsad = x264_pixel_vsad_sse2;
+ pixf->asd8 = x264_pixel_asd8_sse2;
}
if( (cpu&X264_CPU_SSE2) && !(cpu&X264_CPU_SSE2_IS_SLOW) )
@@ -1126,6 +1140,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_ssse3;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_ssse3;
+ pixf->asd8 = x264_pixel_asd8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
diff --git a/common/pixel.h b/common/pixel.h
index 1375b90..90d1b4b 100644
--- a/common/pixel.h
+++ b/common/pixel.h
@@ -89,6 +89,7 @@ typedef struct
x264_pixel_cmp_x4_t fpelcmp_x4[7];
x264_pixel_cmp_t sad_aligned[8]; /* Aligned SAD for mbcmp */
int (*vsad)( pixel *, intptr_t, int );
+ int (*asd8)( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
uint64_t (*var[4])( pixel *pix, intptr_t stride );
int (*var2[4])( pixel *pix1, intptr_t stride1,
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index b880323..40872d2 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -4001,6 +4001,73 @@ SSIM
INIT_XMM avx
SSIM
+;-----------------------------------------------------------------------------
+; int pixel_asd8( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+;-----------------------------------------------------------------------------
+%macro ASD8 0
+cglobal pixel_asd8, 5,5
+ pxor m0, m0 ; m0 = running accumulator, starts at zero
+ pxor m1, m1 ; m1 = second accumulator (high bit depth) or constant zero for psadbw (8-bit); NOTE(review): r0/r2 presumably pix1/pix2, r1/r3 their strides, r4d height, per the prototype's argument order - confirm against cglobal
+.loop:
+%if HIGH_BIT_DEPTH
+ paddw m0, [r0] ; high bit depth: m0 sums the rows read via r0, m1 the rows read via r2, as packed words
+ paddw m1, [r2]
+ paddw m0, [r0+2*r1] ; next row; the stride is scaled by 2 here (presumably 2-byte pixels - confirm)
+ paddw m1, [r2+2*r3]
+ lea r0, [r0+4*r1] ; advance r0 by two rows
+ paddw m0, [r0]
+ paddw m1, [r2+4*r3] ; r2 has not been advanced yet, so address two rows ahead
+ lea r2, [r2+4*r3]
+ paddw m0, [r0+2*r1]
+ paddw m1, [r2+2*r3]
+ lea r0, [r0+4*r1]
+ lea r2, [r2+4*r3]
+%else
+ movq m2, [r0] ; 8-bit: pack two 8-byte rows into one register (low half, then high half)
+ movq m3, [r2]
+ movhps m2, [r0+r1]
+ movhps m3, [r2+r3]
+ lea r0, [r0+2*r1]
+ psadbw m2, m1 ; m1 is zero in this branch, so the SAD is the plain byte sum of each 8-byte half
+ psadbw m3, m1
+ movq m4, [r0]
+ movq m5, [r2+2*r3] ; r2 has not been advanced yet, so address two rows ahead
+ lea r2, [r2+2*r3]
+ movhps m4, [r0+r1]
+ movhps m5, [r2+r3]
+ lea r0, [r0+2*r1]
+ paddw m0, m2 ; add the row sums read via r0 ...
+ psubw m0, m3 ; ... and subtract the row sums read via r2 (signed 16-bit words)
+ psadbw m4, m1
+ psadbw m5, m1
+ lea r2, [r2+2*r3]
+ paddw m0, m4
+ psubw m0, m5
+%endif
+ sub r4d, 4 ; each iteration consumes four rows from both sources
+ jg .loop
+%if HIGH_BIT_DEPTH
+ psubw m0, m1 ; word-wise difference of the two accumulators; the HADDW/ABSD macros that follow presumably reduce it horizontally and take the absolute value - confirm
+ HADDW m0, m1
+ ABSD m1, m0
+%else
+ movhlps m1, m0 ; copy the high-half partial sum down ...
+ paddw m0, m1 ; ... and add it to the low-half partial sum
+ ABSW m1, m0
+%endif
+ movd eax, m1 ; return value is taken from the low dword of m1
+ RET
+%endmacro
+
+INIT_XMM sse2
+ASD8
+INIT_XMM ssse3
+ASD8
+%if HIGH_BIT_DEPTH
+INIT_XMM xop
+ASD8
+%endif
+
;=============================================================================
; Successive Elimination ADS
;=============================================================================
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index 5f71c2a..c59e357 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -159,6 +159,9 @@ int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_xop ( pixel *src, intptr_t stride, int height );
+int x264_pixel_asd8_sse2 ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+int x264_pixel_asd8_ssse3( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
+int x264_pixel_asd8_xop ( pixel *pix1, intptr_t stride1, pixel *pix2, intptr_t stride2, int height );
#define DECL_ADS( size, suffix ) \
int x264_pixel_ads##size##_##suffix( int enc_dc[size], uint16_t *sums, int delta,\
diff --git a/encoder/slicetype.c b/encoder/slicetype.c
index 9bd862d..1aa4891 100644
--- a/encoder/slicetype.c
+++ b/encoder/slicetype.c
@@ -220,15 +220,12 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
{
unsigned int cost = 0;
int i_stride = fenc->i_stride[1];
- int i_offset = i_stride / 2;
int i_lines = fenc->i_lines[1];
int i_width = fenc->i_width[1];
- pixel *src = ref + i_offset;
+ pixel *src = ref + (i_stride >> 1);
ALIGNED_ARRAY_16( pixel, buf, [8*16] );
int pixoff = 0;
- int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int height = 16 >> CHROMA_V_SHIFT;
- ALIGNED_16( static pixel flat[8] ) = {0};
if( w )
{
for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
@@ -239,19 +236,15 @@ static NOINLINE unsigned int x264_weight_cost_chroma( x264_t *h, x264_frame_t *f
* But testing shows that for chroma the DC coefficient is by far the most
* important part of the coding cost. Thus a more useful chroma weight is
* obtained by comparing each block's DC coefficient instead of the actual
- * pixels.
- *
- * FIXME: add a (faster) asm sum function to replace sad. */
- cost += abs( h->pixf.sad_aligned[chromapix]( buf, 8, flat, 0 ) -
- h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
+ * pixels. */
+ cost += h->pixf.asd8( buf, 8, &src[pixoff], i_stride, height );
}
cost += x264_weight_slice_header_cost( h, w, 1 );
}
else
for( int y = 0; y < i_lines; y += height, pixoff = y*i_stride )
for( int x = 0; x < i_width; x += 8, pixoff += 8 )
- cost += abs( h->pixf.sad_aligned[chromapix]( &ref[pixoff], i_stride, flat, 0 ) -
- h->pixf.sad_aligned[chromapix]( &src[pixoff], i_stride, flat, 0 ) );
+ cost += h->pixf.asd8( &ref[pixoff], i_stride, &src[pixoff], i_stride, height );
x264_emms();
return cost;
}
diff --git a/tools/checkasm.c b/tools/checkasm.c
index 630a01d..01e0dd3 100644
--- a/tools/checkasm.c
+++ b/tools/checkasm.c
@@ -475,6 +475,21 @@ static int check_pixel( int cpu_ref, int cpu_new )
}
report( "pixel vsad :" );
+ ok = 1; used_asm = 0;
+ if( pixel_asm.asd8 != pixel_ref.asd8 )
+ {
+ set_func_name( "asd8" );
+ used_asm = 1;
+ int res_c = call_c( pixel_c.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
+ int res_a = call_a( pixel_asm.asd8, pbuf1, (intptr_t)8, pbuf2, (intptr_t)8, 16 );
+ if( res_c != res_a )
+ {
+ ok = 0;
+ fprintf( stderr, "asd: %d != %d\n", res_c, res_a );
+ }
+ }
+ report( "pixel asd :" );
+
#define TEST_INTRA_X3( name, i8x8, ... ) \
if( pixel_asm.name && pixel_asm.name != pixel_ref.name ) \
{ \
More information about the x264-devel
mailing list