[x264-devel] x86: AVX-512 pixel_var2_8x8 and 8x16
Henrik Gramner
git at videolan.org
Mon May 22 00:04:12 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Mon May 1 14:55:45 2017 +0200| [49fb50a67cc41e4bed2dd66f7beed12797249cd9] | committer: Henrik Gramner
x86: AVX-512 pixel_var2_8x8 and 8x16
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=49fb50a67cc41e4bed2dd66f7beed12797249cd9
---
common/common.h | 4 ++--
common/macroblock.c | 5 +++--
common/pixel.c | 4 ++++
common/x86/pixel-a.asm | 45 +++++++++++++++++++++++++++++++++++++++++++++
common/x86/pixel.h | 2 ++
5 files changed, 56 insertions(+), 4 deletions(-)
diff --git a/common/common.h b/common/common.h
index 3bb814de..091f9ed5 100644
--- a/common/common.h
+++ b/common/common.h
@@ -778,8 +778,8 @@ struct x264_t
/* space for p_fenc and p_fdec */
#define FENC_STRIDE 16
#define FDEC_STRIDE 32
- ALIGNED_32( pixel fenc_buf[48*FENC_STRIDE] );
- ALIGNED_32( pixel fdec_buf[52*FDEC_STRIDE] );
+ ALIGNED_64( pixel fenc_buf[48*FENC_STRIDE] );
+ ALIGNED_64( pixel fdec_buf[52*FDEC_STRIDE] );
/* i4x4 and i8x8 backup data, for skipping the encode stage when possible */
ALIGNED_16( pixel i4x4_fdec_buf[16*16] );
diff --git a/common/macroblock.c b/common/macroblock.c
index 0aa09de4..90a33da8 100644
--- a/common/macroblock.c
+++ b/common/macroblock.c
@@ -532,16 +532,17 @@ void x264_macroblock_thread_init( x264_t *h )
h->mb.pic.p_fenc[0] = h->mb.pic.fenc_buf;
h->mb.pic.p_fdec[0] = h->mb.pic.fdec_buf + 2*FDEC_STRIDE;
h->mb.pic.p_fenc[1] = h->mb.pic.fenc_buf + 16*FENC_STRIDE;
- h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
if( CHROMA444 )
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 32*FENC_STRIDE;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE;
h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 36*FDEC_STRIDE;
}
else
{
h->mb.pic.p_fenc[2] = h->mb.pic.fenc_buf + 16*FENC_STRIDE + 8;
- h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 19*FDEC_STRIDE + 16;
+ h->mb.pic.p_fdec[1] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE;
+ h->mb.pic.p_fdec[2] = h->mb.pic.fdec_buf + 20*FDEC_STRIDE + 16;
}
}
diff --git a/common/pixel.c b/common/pixel.c
index 164471c9..020b450b 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1049,6 +1049,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
{
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif // HAVE_MMX
#else // !HIGH_BIT_DEPTH
@@ -1351,6 +1353,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_avx512;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_avx512;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_avx512;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_avx512;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_avx512;
}
#endif //HAVE_MMX
diff --git a/common/x86/pixel-a.asm b/common/x86/pixel-a.asm
index ffb710fc..1ce26b9e 100644
--- a/common/x86/pixel-a.asm
+++ b/common/x86/pixel-a.asm
@@ -1128,10 +1128,17 @@ VAR2_8x8_SSSE3 16, 7
%macro VAR2_AVX2_LOAD 3 ; offset_reg, row1_offset, row2_offset
%if HIGH_BIT_DEPTH
+%if mmsize == 64
+ mova m2, [r1+2*%1+%2*FDEC_STRIDEB]
+ vshufi32x4 m2, [r1+2*%1+%2*FDEC_STRIDEB+64], q2020
+ mova m3, [r1+2*%1+%3*FDEC_STRIDEB]
+ vshufi32x4 m3, [r1+2*%1+%3*FDEC_STRIDEB+64], q2020
+%else
mova xm2, [r1+2*%1+%2*FDEC_STRIDEB]
vinserti128 m2, [r1+2*%1+%2*FDEC_STRIDEB+32], 1
mova xm3, [r1+2*%1+%3*FDEC_STRIDEB]
vinserti128 m3, [r1+2*%1+%3*FDEC_STRIDEB+32], 1
+%endif
psubw m2, [r0+1*%1+%2*FENC_STRIDEB]
psubw m3, [r0+1*%1+%3*FENC_STRIDEB]
%else
@@ -1174,6 +1181,44 @@ INIT_YMM avx2
VAR2_8x8_AVX2 8, 6
VAR2_8x8_AVX2 16, 7
+%macro VAR2_AVX512_END 1 ; shift
+ vbroadcasti32x4 m2, [pw_1]
+ pmaddwd m0, m2
+ SBUTTERFLY qdq, 0, 1, 2
+ paddd m0, m1
+ vextracti32x8 ym1, m0, 1
+ paddd ym0, ym1
+ psrlq ym1, ym0, 32
+ paddd ym0, ym1
+ vpmovqd xmm0, ym0 ; sum_u, sqr_u, sum_v, sqr_v
+ VAR2_END xmm0, xmm1, %1
+%endmacro
+
+INIT_ZMM avx512
+cglobal pixel_var2_8x8, 2,3
+%if HIGH_BIT_DEPTH == 0
+ pxor xm6, xm6
+%endif
+ VAR2_AVX2_LOAD 0, 0, 2
+ VAR2_CORE m2, m3, 0
+ VAR2_AVX2_LOAD 0, 4, 6
+ VAR2_CORE m2, m3, 1
+ VAR2_AVX512_END 6
+
+cglobal pixel_var2_8x16, 2,3
+%if HIGH_BIT_DEPTH == 0
+ pxor xm6, xm6
+%endif
+ mov t0d, 10*FENC_STRIDEB
+ VAR2_AVX2_LOAD 0, 14, 12
+ VAR2_CORE m2, m3, 0
+.loop:
+ VAR2_AVX2_LOAD t0, 0, -2
+ VAR2_CORE m2, m3, 1
+ sub t0d, 4*FENC_STRIDEB
+ jg .loop
+ VAR2_AVX512_END 7
+
;=============================================================================
; SATD
;=============================================================================
diff --git a/common/x86/pixel.h b/common/x86/pixel.h
index b28f1981..57229986 100644
--- a/common/x86/pixel.h
+++ b/common/x86/pixel.h
@@ -169,9 +169,11 @@ float x264_pixel_ssim_end4_avx ( int sum0[5][4], int sum1[5][4], int width );
int x264_pixel_var2_8x8_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x8_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x8_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x8_avx512 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_sse2 ( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_var2_8x16_ssse3 ( uint8_t *fenc, uint8_t *fdec, int ssd[2] );
int x264_pixel_var2_8x16_avx2 ( pixel *fenc, pixel *fdec, int ssd[2] );
+int x264_pixel_var2_8x16_avx512( pixel *fenc, pixel *fdec, int ssd[2] );
int x264_pixel_vsad_mmx2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_sse2 ( pixel *src, intptr_t stride, int height );
int x264_pixel_vsad_ssse3( pixel *src, intptr_t stride, int height );
More information about the x264-devel mailing list