[x264-devel] x86: AVX-512 pixel_avg_weight_w16
Henrik Gramner
git at videolan.org
Mon Jun 26 21:59:10 CEST 2017
x264 | branch: master | Henrik Gramner <henrik at gramner.com> | Sat Jun 24 14:26:25 2017 +0200| [d3214e6b102701911fc9d5fc92435e79e8b49100] | committer: Henrik Gramner
x86: AVX-512 pixel_avg_weight_w16
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=d3214e6b102701911fc9d5fc92435e79e8b49100
---
common/x86/mc-a.asm | 62 ++++++++++++++++++++++++++++++++++++++++++-----------
common/x86/mc-c.c | 9 +++++++-
2 files changed, 58 insertions(+), 13 deletions(-)
diff --git a/common/x86/mc-a.asm b/common/x86/mc-a.asm
index 92029ade..2dbdee5d 100644
--- a/common/x86/mc-a.asm
+++ b/common/x86/mc-a.asm
@@ -83,11 +83,11 @@ cextern deinterleave_shufd
%endmacro
%endif
-%macro AVG_END 0
- lea t4, [t4+t5*2*SIZEOF_PIXEL]
+%macro AVG_END 0-1 2 ; rows
lea t2, [t2+t3*2*SIZEOF_PIXEL]
+ lea t4, [t4+t5*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
- sub eax, 2
+ sub eax, %1
jg .height_loop
RET
%endmacro
@@ -147,17 +147,24 @@ cextern deinterleave_shufd
%endmacro
%macro BIWEIGHT_START_SSSE3 0
- movzx t6d, byte r6m ; FIXME x86_64
- mov t7d, 64
- sub t7d, t6d
- shl t7d, 8
- add t6d, t7d
- mova m4, [pw_512]
- movd xm3, t6d
+ movzx t6d, byte r6m ; FIXME x86_64
+%if mmsize > 16
+ vbroadcasti128 m4, [pw_512]
+%else
+ mova m4, [pw_512]
+%endif
+ lea t7d, [t6+(64<<8)]
+ shl t6d, 8
+ sub t7d, t6d
+%if cpuflag(avx512)
+ vpbroadcastw m3, t7d
+%else
+ movd xm3, t7d
%if cpuflag(avx2)
- vpbroadcastw m3, xm3
+ vpbroadcastw m3, xm3
%else
- SPLATW m3, m3 ; weight_dst,src
+ SPLATW m3, m3 ; weight_dst,src
+%endif
%endif
%endmacro
@@ -268,6 +275,34 @@ cglobal pixel_avg_weight_w16
mova [t0], xm0
vextracti128 [t0+t1], m0, 1
AVG_END
+
+INIT_ZMM avx512
+cglobal pixel_avg_weight_w16
+ BIWEIGHT_START
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2]
+ movu xm1, [t4]
+ vinserti128 ym0, [t2+t3], 1
+ vinserti128 ym1, [t4+t5], 1
+ lea t2, [t2+t3*2]
+ lea t4, [t4+t5*2]
+ vinserti32x4 m0, [t2], 2
+ vinserti32x4 m1, [t4], 2
+ vinserti32x4 m0, [t2+t3], 3
+ vinserti32x4 m1, [t4+t5], 3
+ SBUTTERFLY bw, 0, 1, 2
+ pmaddubsw m0, m3
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4
+ pmulhrsw m1, m4
+ packuswb m0, m1
+ mova [t0], xm0
+ vextracti128 [t0+t1], ym0, 1
+ lea t0, [t0+t1*2]
+ vextracti32x4 [t0], m0, 2
+ vextracti32x4 [t0+t1], m0, 3
+ AVG_END 4
%endif ;HIGH_BIT_DEPTH
;=============================================================================
@@ -738,6 +773,9 @@ INIT_XMM avx2
AVG_FUNC 16, movdqu, movdqa
AVGH 16, 16
AVGH 16, 8
+INIT_XMM avx512
+AVGH 16, 16
+AVGH 16, 8
%endif ;HIGH_BIT_DEPTH
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index 74cfffec..0a7e414c 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -32,7 +32,8 @@
void func##_mmx2 args;\
void func##_sse2 args;\
void func##_ssse3 args;\
- void func##_avx2 args;
+ void func##_avx2 args;\
+ void func##_avx512 args;
DECL_SUF( x264_pixel_avg_16x16, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
DECL_SUF( x264_pixel_avg_16x8, ( pixel *, intptr_t, pixel *, intptr_t, pixel *, intptr_t, int ))
@@ -865,6 +866,12 @@ void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
pf->frame_init_lowres_core = x264_frame_init_lowres_core_avx2;
pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_avx2;
}
+
+ if( cpu&X264_CPU_AVX512 )
+ {
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_avx512;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_avx512;
+ }
#endif // HIGH_BIT_DEPTH
if( !(cpu&X264_CPU_AVX) )
More information about the x264-devel mailing list