[x264-devel] Enable some existing asm functions that were missing function pointers
Loren Merritt
git at videolan.org
Wed Aug 10 05:15:14 CEST 2011
x264 | branch: master | Loren Merritt <pengvado at akuvian.org> | Wed Aug 3 14:58:50 2011 +0000| [a718aad0045b2930d871fd7b6bf33fc0192d526f] | committer: Jason Garrett-Glaser
Enable some existing asm functions that were missing function pointers
pixel_ads1_avx, predict_8x8_hd_avx
High bit depth mc_copy_w8_sse2, denoise_dct_avx, prefetch_fenc/ref, and several pixel*sse4.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=a718aad0045b2930d871fd7b6bf33fc0192d526f
---
common/pixel.c | 33 +++++++++++++++++++++++++++++----
common/quant.c | 4 ++++
common/x86/mc-a.asm | 23 ++++++++++++++---------
common/x86/mc-c.c | 13 +++++++++----
common/x86/predict-c.c | 1 +
5 files changed, 57 insertions(+), 17 deletions(-)
diff --git a/common/pixel.c b/common/pixel.c
index 40395b2..57b5d58 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -546,6 +546,7 @@ INTRA_MBCMP( sad, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP(satd, 16, v, h, dc, , _mmx2 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _sse2 )
INTRA_MBCMP( sad, 16, v, h, dc, , _sse2 )
+INTRA_MBCMP( sad, 4, v, h, dc, , _ssse3 )
INTRA_MBCMP( sad, 8, dc, h, v, c, _ssse3 )
INTRA_MBCMP( sad, 16, v, h, dc, , _ssse3 )
#endif
@@ -873,10 +874,35 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_ssse3;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_ssse3;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_ssse3;
pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_ssse3;
pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_ssse3;
}
+ if( cpu&X264_CPU_SSE4 )
+ {
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _sse4 );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
+ }
+ if( cpu&X264_CPU_AVX )
+ {
+ INIT_ADS( _avx );
+ if( !(cpu&X264_CPU_STACK_MOD4) )
+ {
+ INIT4( hadamard_ac, _avx );
+ }
+ pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_avx;
+ pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_avx;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx;
+ pixf->ssd_nv12_core = x264_pixel_ssd_nv12_core_avx;
+ pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_avx;
+ pixf->ssim_end4 = x264_pixel_ssim_end4_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
@@ -1038,6 +1064,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_ssse3;
#endif
pixf->var2_8x8 = x264_pixel_var2_8x8_ssse3;
+ if( cpu&X264_CPU_SHUFFLE_IS_FAST )
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
if( cpu&X264_CPU_CACHELINE_64 )
{
INIT2( sad, _cache64_ssse3 );
@@ -1062,8 +1090,6 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16]= x264_pixel_sa8d_16x16_sse4;
pixf->sa8d[PIXEL_8x8] = x264_pixel_sa8d_8x8_sse4;
pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_sse4;
- /* Slower on Conroe, so only enable under SSE4 */
- pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_ssse3;
}
if( cpu&X264_CPU_AVX )
@@ -1071,8 +1097,7 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
INIT7( satd, _avx );
INIT7( satd_x3, _avx );
INIT7( satd_x4, _avx );
- pixf->ads[PIXEL_16x16] = x264_pixel_ads4_avx;
- pixf->ads[PIXEL_16x8] = x264_pixel_ads2_avx;
+ INIT_ADS( _avx );
if( !(cpu&X264_CPU_STACK_MOD4) )
{
INIT4( hadamard_ac, _avx );
diff --git a/common/quant.c b/common/quant.c
index b0537ce..40ac1f1 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -428,6 +428,10 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->quant_4x4 = x264_quant_4x4_sse4;
pf->quant_8x8 = x264_quant_8x8_sse4;
}
+ if( cpu&X264_CPU_AVX )
+ {
+ pf->denoise_dct = x264_denoise_dct_avx;
+ }
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
#if HAVE_MMX
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 403059b..93c5687 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -1291,19 +1291,21 @@ MC_COPY 16
;=============================================================================
; prefetch
;=============================================================================
-; FIXME assumes 64 byte cachelines
+; assumes 64 byte cachelines
+; FIXME doesn't cover all pixels in high depth and/or 4:4:4
;-----------------------------------------------------------------------------
-; void prefetch_fenc( uint8_t *pix_y, int stride_y,
-; uint8_t *pix_uv, int stride_uv, int mb_x )
+; void prefetch_fenc( pixel *pix_y, int stride_y,
+; pixel *pix_uv, int stride_uv, int mb_x )
;-----------------------------------------------------------------------------
INIT_MMX
%ifdef ARCH_X86_64
cglobal prefetch_fenc_mmx2, 5,5
+ FIX_STRIDES r1d, r3d
and r4d, 3
mov eax, r4d
imul r4d, r1d
- lea r0, [r0+r4*4+64]
+ lea r0, [r0+r4*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
@@ -1311,7 +1313,7 @@ cglobal prefetch_fenc_mmx2, 5,5
prefetcht0 [r0+r1]
imul eax, r3d
- lea r2, [r2+rax*2+64]
+ lea r2, [r2+rax*2+64*SIZEOF_PIXEL]
prefetcht0 [r2]
prefetcht0 [r2+r3]
RET
@@ -1321,9 +1323,10 @@ cglobal prefetch_fenc_mmx2, 0,3
mov r2, r4m
mov r1, r1m
mov r0, r0m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*4+64]
+ lea r0, [r0+r2*4+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
lea r0, [r0+r1*2]
@@ -1333,21 +1336,23 @@ cglobal prefetch_fenc_mmx2, 0,3
mov r2, r4m
mov r1, r3m
mov r0, r2m
+ FIX_STRIDES r1
and r2, 3
imul r2, r1
- lea r0, [r0+r2*2+64]
+ lea r0, [r0+r2*2+64*SIZEOF_PIXEL]
prefetcht0 [r0]
prefetcht0 [r0+r1]
ret
%endif ; ARCH_X86_64
;-----------------------------------------------------------------------------
-; void prefetch_ref( uint8_t *pix, int stride, int parity )
+; void prefetch_ref( pixel *pix, int stride, int parity )
;-----------------------------------------------------------------------------
cglobal prefetch_ref_mmx2, 3,3
+ FIX_STRIDES r1d
dec r2d
and r2d, r1d
- lea r0, [r0+r2*8+64]
+ lea r0, [r0+r2*8+64*SIZEOF_PIXEL]
lea r2, [r1*3]
prefetcht0 [r0]
prefetcht0 [r0+r1]
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 81dd1ca..2c028e7 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -85,8 +85,8 @@ void x264_mc_copy_w8_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_mmx( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_sse2( pixel *, int, pixel *, int, int );
void x264_mc_copy_w16_aligned_sse2( pixel *, int, pixel *, int, int );
-void x264_prefetch_fenc_mmx2( uint8_t *, int, uint8_t *, int, int );
-void x264_prefetch_ref_mmx2( uint8_t *, int, int );
+void x264_prefetch_fenc_mmx2( pixel *, int, pixel *, int, int );
+void x264_prefetch_ref_mmx2( pixel *, int, int );
void x264_plane_copy_core_mmx2( pixel *, int, pixel *, int, int w, int h);
void x264_plane_copy_c( pixel *, int, pixel *, int, int w, int h );
void x264_plane_copy_interleave_core_mmx2( pixel *dst, int i_dst,
@@ -225,7 +225,11 @@ static void (* const x264_mc_copy_wtab_##instr[5])( pixel *, int, pixel *, int,
};
MC_COPY_WTAB(mmx,mmx,mmx,mmx)
+#if HIGH_BIT_DEPTH
+MC_COPY_WTAB(sse2,mmx,sse2,sse2)
+#else
MC_COPY_WTAB(sse2,mmx,mmx,sse2)
+#endif
#define MC_WEIGHT_WTAB(function, instr, name1, name2, w12version)\
static void (* x264_mc_##function##_wtab_##instr[6])( pixel *, int, pixel *, int, const x264_weight_t *, int ) =\
@@ -510,6 +514,9 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_MMX2) )
return;
+ pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
+ pf->prefetch_ref = x264_prefetch_ref_mmx2;
+
pf->plane_copy = x264_plane_copy_mmx2;
pf->plane_copy_interleave = x264_plane_copy_interleave_mmx2;
pf->store_interleave_8x8x2 = x264_store_interleave_8x8x2_mmx2;
@@ -605,8 +612,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( !(cpu&X264_CPU_STACK_MOD4) )
pf->mc_chroma = x264_mc_chroma_avx;
#else // !HIGH_BIT_DEPTH
- pf->prefetch_fenc = x264_prefetch_fenc_mmx2;
- pf->prefetch_ref = x264_prefetch_ref_mmx2;
#if ARCH_X86 // all x86_64 cpus with cacheline split issues use sse2 instead
if( cpu&X264_CPU_CACHELINE_32 )
diff --git a/common/x86/predict-c.c b/common/x86/predict-c.c
index 1db0925..96d0e23 100644
--- a/common/x86/predict-c.c
+++ b/common/x86/predict-c.c
@@ -566,6 +566,7 @@ void x264_predict_8x8_init_mmx( int cpu, x264_predict8x8_t pf[12], x264_predict_
pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_avx;
pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_avx;
pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_avx;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_avx;
#endif // HIGH_BIT_DEPTH
}
More information about the x264-devel
mailing list