[x264-devel] x86: AVX2 high bit-depth load_deinterleave_chroma
Henrik Gramner
git at videolan.org
Tue Jan 24 21:14:12 CET 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Jan 18 21:57:14 2017 +0100| [2524fc3164d9f00b393d4254d2c5ea8f3b9d43b0] | committer: Henrik Gramner
x86: AVX2 high bit-depth load_deinterleave_chroma
load_deinterleave_chroma_fenc: 50% faster than AVX
load_deinterleave_chroma_fdec: 25% faster than AVX
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2524fc3164d9f00b393d4254d2c5ea8f3b9d43b0
---
common/x86/mc-a2.asm | 5 +++++
common/x86/mc-c.c | 6 ++++--
2 files changed, 9 insertions(+), 2 deletions(-)
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index c4aff28..2e72b61 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1049,8 +1049,12 @@ PLANE_COPY_CORE 1
%if mmsize == 32
pshufb m0, %5
vpermq m0, m0, q3120
+%if %4
+ mova [%1], m0
+%else
mov%6 [%1], xm0
vextracti128 [%2], m0, 1
+%endif
%elif HIGH_BIT_DEPTH
mova m1, [%3+mmsize]
psrld m2, m0, 16
@@ -1455,6 +1459,7 @@ PLANE_DEINTERLEAVE
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
INIT_YMM avx2
+LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_V210
%else
INIT_XMM sse2
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 8f1bca5..3258381 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -138,10 +138,11 @@ void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu,
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_avx2( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
void x264_memzero_aligned_mmx( void *dst, size_t n );
@@ -673,6 +674,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
if( cpu&X264_CPU_AVX2 )
{
pf->mc_luma = mc_luma_avx2;
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
}
#else // !HIGH_BIT_DEPTH
@@ -824,7 +826,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = x264_integral_init8h_avx2;
pf->integral_init4h = x264_integral_init4h_avx2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
- pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
#endif // HIGH_BIT_DEPTH
@@ -843,6 +844,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
return;
pf->plane_copy_swap = x264_plane_copy_swap_avx2;
pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->get_ref = get_ref_avx2;
pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;
More information about the x264-devel mailing list