[x264-devel] x86: AVX2 high bit-depth load_deinterleave_chroma

Henrik Gramner git at videolan.org
Tue Jan 24 21:14:12 CET 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Jan 18 21:57:14 2017 +0100| [2524fc3164d9f00b393d4254d2c5ea8f3b9d43b0] | committer: Henrik Gramner

x86: AVX2 high bit-depth load_deinterleave_chroma

load_deinterleave_chroma_fenc: 50% faster than AVX
load_deinterleave_chroma_fdec: 25% faster than AVX

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=2524fc3164d9f00b393d4254d2c5ea8f3b9d43b0
---

 common/x86/mc-a2.asm | 5 +++++
 common/x86/mc-c.c    | 6 ++++--
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index c4aff28..2e72b61 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1049,8 +1049,12 @@ PLANE_COPY_CORE 1
 %if mmsize == 32
     pshufb   m0, %5
     vpermq   m0, m0, q3120
+%if %4
+    mova   [%1], m0
+%else
     mov%6  [%1], xm0
     vextracti128 [%2], m0, 1
+%endif
 %elif HIGH_BIT_DEPTH
     mova     m1, [%3+mmsize]
     psrld    m2, m0, 16
@@ -1455,6 +1459,7 @@ PLANE_DEINTERLEAVE
 LOAD_DEINTERLEAVE_CHROMA
 PLANE_DEINTERLEAVE_V210
 INIT_YMM avx2
+LOAD_DEINTERLEAVE_CHROMA
 PLANE_DEINTERLEAVE_V210
 %else
 INIT_XMM sse2
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 8f1bca5..3258381 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -138,10 +138,11 @@ void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu,
 void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
-void x264_load_deinterleave_chroma_fenc_avx2( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fdec_avx2( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
 void *x264_memcpy_aligned_mmx( void *dst, const void *src, size_t n );
 void *x264_memcpy_aligned_sse( void *dst, const void *src, size_t n );
 void x264_memzero_aligned_mmx( void *dst, size_t n );
@@ -673,6 +674,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     if( cpu&X264_CPU_AVX2 )
     {
         pf->mc_luma = mc_luma_avx2;
+        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
     }
 #else // !HIGH_BIT_DEPTH
@@ -824,7 +826,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->integral_init8h = x264_integral_init8h_avx2;
         pf->integral_init4h = x264_integral_init4h_avx2;
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
-        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
     }
 #endif // HIGH_BIT_DEPTH
@@ -843,6 +844,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         return;
     pf->plane_copy_swap = x264_plane_copy_swap_avx2;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
+    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
     pf->get_ref = get_ref_avx2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
     pf->mbtree_propagate_list = x264_mbtree_propagate_list_avx2;



More information about the x264-devel mailing list