[x264-devel] x86: AVX2 load_deinterleave_chroma_fenc
Henrik Gramner
git at videolan.org
Tue Jan 24 21:14:12 CET 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Wed Jan 18 21:46:55 2017 +0100| [cce50082129d3c92bd41bc0afc5a8c8d93084c9c] | committer: Henrik Gramner
x86: AVX2 load_deinterleave_chroma_fenc
20% faster than SSSE3.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=cce50082129d3c92bd41bc0afc5a8c8d93084c9c
---
common/common.h | 2 +-
common/x86/mc-a2.asm | 21 +++++++++++++++++++++
common/x86/mc-c.c | 2 ++
encoder/analyse.c | 2 +-
encoder/ratecontrol.c | 2 +-
5 files changed, 26 insertions(+), 3 deletions(-)
diff --git a/common/common.h b/common/common.h
index f26868e..bce186d 100644
--- a/common/common.h
+++ b/common/common.h
@@ -781,7 +781,7 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- ALIGNED_16( pixel fenc_buf[48*FENC_STRIDE] );
+ ALIGNED_N( pixel fenc_buf[48*FENC_STRIDE] );
ALIGNED_N( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
diff --git a/common/x86/mc-a2.asm b/common/x86/mc-a2.asm
index f39645a..c4aff28 100644
--- a/common/x86/mc-a2.asm
+++ b/common/x86/mc-a2.asm
@@ -1250,6 +1250,26 @@ cglobal load_deinterleave_chroma_fdec, 4,4
RET
%endmacro ; LOAD_DEINTERLEAVE_CHROMA
+%macro LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2 0 ; no macro parameters; emits the AVX2 load_deinterleave_chroma_fenc, handling 4 source rows per loop iteration
+cglobal load_deinterleave_chroma_fenc, 4,5 ; per the C prototype: r0 = dst, r1 = src, r2 = i_src, r3 = height; r4 is a scratch register
+ vbroadcasti128 m0, [deinterleave_shuf] ; replicate the 16-byte shuffle mask into both 128-bit lanes
+ lea r4, [r2*3] ; r4 = 3 * i_src, used to address the fourth row of each group
+.loop:
+ mova xm1, [r1] ; low lane of m1 = 16 bytes of source row 0
+ vinserti128 m1, m1, [r1+r2], 1 ; high lane of m1 = 16 bytes of source row 1
+ mova xm2, [r1+r2*2] ; low lane of m2 = 16 bytes of source row 2
+ vinserti128 m2, m2, [r1+r4], 1 ; high lane of m2 = 16 bytes of source row 3
+ pshufb m1, m0 ; per-lane byte shuffle with the mask; presumably splits interleaved U/V bytes (mask is defined elsewhere - confirm)
+ pshufb m2, m0 ; same shuffle for rows 2 and 3
+ mova [r0+0*FENC_STRIDE], m1 ; one 32-byte store; spans dst rows 0-1 assuming FENC_STRIDE is 16 bytes as in common.h
+ mova [r0+2*FENC_STRIDE], m2 ; one 32-byte store for dst rows 2-3 under the same assumption
+ lea r1, [r1+r2*4] ; advance src by four rows
+ add r0, 4*FENC_STRIDE ; advance dst by four rows
+ sub r3d, 4 ; four rows consumed
+ jg .loop ; repeat while the remaining height is positive
+ RET
+%endmacro ; LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
+
%macro PLANE_DEINTERLEAVE_RGB_CORE 9 ; pw, i_dsta, i_dstb, i_dstc, i_src, w, h, tmp1, tmp2
%if mmsize == 32
vbroadcasti128 m3, [deinterleave_rgb_shuf+(%1-3)*16]
@@ -1444,6 +1464,7 @@ PLANE_DEINTERLEAVE
LOAD_DEINTERLEAVE_CHROMA
PLANE_DEINTERLEAVE_RGB
INIT_YMM avx2
+LOAD_DEINTERLEAVE_CHROMA_FENC_AVX2
PLANE_DEINTERLEAVE_RGB
%endif
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index d8fbafc..8f1bca5 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -138,6 +138,7 @@ void x264_store_interleave_chroma_avx ( pixel *dst, intptr_t i_dst, pixel *srcu,
void x264_load_deinterleave_chroma_fenc_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fenc_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_avx2( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_sse2( pixel *dst, pixel *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_ssse3( uint8_t *dst, uint8_t *src, intptr_t i_src, int height );
void x264_load_deinterleave_chroma_fdec_avx( uint16_t *dst, uint16_t *src, intptr_t i_src, int height );
@@ -823,6 +824,7 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->integral_init8h = x264_integral_init8h_avx2;
pf->integral_init4h = x264_integral_init4h_avx2;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
#endif // HIGH_BIT_DEPTH
diff --git a/encoder/analyse.c b/encoder/analyse.c
index ff65552..1941bf2 100644
--- a/encoder/analyse.c
+++ b/encoder/analyse.c
@@ -2147,7 +2147,7 @@ static void x264_mb_analyse_inter_b16x16( x264_t *h, x264_mb_analysis_t *a )
}
else
{
- ALIGNED_ARRAY_16( pixel, pixuv, [2],[16*FENC_STRIDE] );
+ ALIGNED_ARRAY_N( pixel, pixuv, [2],[16*FENC_STRIDE] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int v_shift = CHROMA_V_SHIFT;
diff --git a/encoder/ratecontrol.c b/encoder/ratecontrol.c
index 3c5357a..dbccb27 100644
--- a/encoder/ratecontrol.c
+++ b/encoder/ratecontrol.c
@@ -243,7 +243,7 @@ static ALWAYS_INLINE uint32_t ac_energy_plane( x264_t *h, int mb_x, int mb_y, x2
stride <<= b_field;
if( b_chroma )
{
- ALIGNED_ARRAY_16( pixel, pix,[FENC_STRIDE*16] );
+ ALIGNED_ARRAY_N( pixel, pix,[FENC_STRIDE*16] );
int chromapix = h->luma2chroma_pixel[PIXEL_16x16];
int shift = 7 - CHROMA_V_SHIFT;
More information about the x264-devel
mailing list