[x264-devel] x86: AVX-512 load_deinterleave_chroma_fenc

Henrik Gramner git at videolan.org
Mon Dec 25 20:39:57 CET 2017


x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sun Oct  8 21:23:12 2017 +0200| [d93851ec282eb069f91a6eddab3284f7766cd5bd] | committer: Anton Mitrofanov

x86: AVX-512 load_deinterleave_chroma_fenc

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d93851ec282eb069f91a6eddab3284f7766cd5bd
---

 common/x86/mc-a2.asm | 30 +++++++++++++++++++++---------
 common/x86/mc-c.c    |  6 +++++-
 2 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index a4e11616..69ed4cd4 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1264,17 +1264,27 @@ cglobal load_deinterleave_chroma_fenc, 4,5
     vbroadcasti128 m0, [deinterleave_shuf]
     lea            r4, [r2*3]
 .loop:
-    mova          xm1, [r1]
-    vinserti128    m1, m1, [r1+r2], 1
-    mova          xm2, [r1+r2*2]
-    vinserti128    m2, m2, [r1+r4], 1
+    mova          xm1, [r1]         ; 0
+    vinserti128   ym1, [r1+r2], 1   ; 1
+%if mmsize == 64
+    mova          xm2, [r1+r2*4]    ; 4
+    vinserti32x4   m1, [r1+r2*2], 2 ; 2
+    vinserti32x4   m2, [r1+r4*2], 2 ; 6
+    vinserti32x4   m1, [r1+r4], 3   ; 3
+    lea            r1, [r1+r2*4]
+    vinserti32x4   m2, [r1+r2], 1   ; 5
+    vinserti32x4   m2, [r1+r4], 3   ; 7
+%else
+    mova          xm2, [r1+r2*2]    ; 2
+    vinserti128    m2, [r1+r4], 1   ; 3
+%endif
+    lea            r1, [r1+r2*4]
     pshufb         m1, m0
     pshufb         m2, m0
-    mova [r0+0*FENC_STRIDE], m1
-    mova [r0+2*FENC_STRIDE], m2
-    lea            r1, [r1+r2*4]
-    add            r0, 4*FENC_STRIDE
-    sub           r3d, 4
+    mova         [r0], m1
+    mova  [r0+mmsize], m2
+    add            r0, 2*mmsize
+    sub           r3d, mmsize/8
     jg .loop
     RET
 %endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
@@ -1499,6 +1509,8 @@ PLANE_DEINTERLEAVE_RGB
 INIT_YMM avx2
 LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 PLANE_DEINTERLEAVE_RGB
+INIT_ZMM avx512
+LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
 %endif
 
 ; These functions are not general-use; not only do they require aligned input, but memcpy
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 51764811..0deb1387 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -245,6 +245,8 @@ void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intpt
 void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fenc_avx2 x264_template(load_deinterleave_chroma_fenc_avx2)
 void x264_load_deinterleave_chroma_fenc_avx2( pixel *dst, pixel *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fenc_avx512 x264_template(load_deinterleave_chroma_fenc_avx512)
+void x264_load_deinterleave_chroma_fenc_avx512( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fdec_sse2 x264_template(load_deinterleave_chroma_fdec_sse2)
 void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
 #define x264_load_deinterleave_chroma_fdec_ssse3 x264_template(load_deinterleave_chroma_fdec_ssse3)
@@ -909,6 +911,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     {
         pf->mc_luma = mc_luma_avx2;
         pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_avx2;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
         pf->plane_copy_deinterleave_v210 = x264_plane_copy_deinterleave_v210_avx2;
     }
 
@@ -1068,6 +1071,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->integral_init4h = x264_integral_init4h_avx2;
         pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
         pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
     }
 
     if( cpu&X264_CPU_AVX512 )
@@ -1077,6 +1081,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
         pf->avg[PIXEL_8x16]  = x264_pixel_avg_8x16_avx512;
         pf->avg[PIXEL_8x8]   = x264_pixel_avg_8x8_avx512;
         pf->avg[PIXEL_8x4]   = x264_pixel_avg_8x4_avx512;
+        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx512;
     }
 #endif // HIGH_BIT_DEPTH
 
@@ -1096,7 +1101,6 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
     pf->plane_copy_swap = plane_copy_swap_avx2;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_avx2;
     pf->plane_copy_deinterleave_yuyv = plane_copy_deinterleave_yuyv_avx2;
-    pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
     pf->get_ref = get_ref_avx2;
     pf->mbtree_propagate_cost = x264_mbtree_propagate_cost_avx2;
     pf->mbtree_propagate_list = mbtree_propagate_list_avx2;



More information about the x264-devel mailing list