[x265] [PATCH 005 of 307] x86: AVX-512 pixel_avg_weight_w16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:03 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1498474278 -19800
# Mon Jun 26 16:21:18 2017 +0530
# Node ID 5309fe76c442d720d2d3419eefab11f2a1f9731a
# Parent 2e5128235d577806f16e5cf93266dcd7f4155a63
x86: AVX-512 pixel_avg_weight_w16
diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jun 23 17:25:27 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Jun 26 16:21:18 2017 +0530
@@ -3754,6 +3754,8 @@
p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+ p.pu[LUMA_16x16].pixelavg_pp = PFX(pixel_avg_16x16_avx512);
+ p.pu[LUMA_16x8].pixelavg_pp = PFX(pixel_avg_16x8_avx512);
}
#endif
}
diff -r 2e5128235d57 -r 5309fe76c442 source/common/x86/mc-a.asm
--- a/source/common/x86/mc-a.asm Fri Jun 23 17:25:27 2017 +0530
+++ b/source/common/x86/mc-a.asm Mon Jun 26 16:21:18 2017 +0530
@@ -3367,11 +3367,11 @@
%endmacro
%endif
-%macro AVG_END 0
+%macro AVG_END 0-1 2;rows
+ lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t4, [t4+t5*2*SIZEOF_PIXEL]
- lea t2, [t2+t3*2*SIZEOF_PIXEL]
lea t0, [t0+t1*2*SIZEOF_PIXEL]
- sub eax, 2
+ sub eax, %1
jg .height_loop
%ifidn movu,movq ; detect MMX
EMMS
@@ -3434,17 +3434,24 @@
%endmacro
%macro BIWEIGHT_START_SSSE3 0
- movzx t6d, byte r6m ; FIXME x86_64
- mov t7d, 64
- sub t7d, t6d
- shl t7d, 8
- add t6d, t7d
- mova m4, [pw_512]
- movd xm3, t6d
+ movzx t6d, byte r6m ; FIXME x86_64
+%if mmsize > 16
+ vbroadcasti128 m4, [pw_512] ; wide registers: replicate the 16-byte pw_512 constant into every 128-bit lane of m4
+%else
+ mova m4, [pw_512] ; 16-byte registers: a plain load fills all of m4
+%endif
+ lea t7d, [t6+(64<<8)] ; t7 = w + 64*256, where w is the byte just loaded from r6m (presumably the bi-pred weight; confirm against caller)
+ shl t6d, 8 ; t6 = w*256
+ sub t7d, t6d ; t7 = w + (64-w)*256, i.e. w in the low byte and 64-w in the high byte when 0 <= w <= 64
+%if cpuflag(avx512)
+ vpbroadcastw m3, t7d ; AVX-512: broadcast the low word of t7 to every word of m3 straight from the GPR
+%else
+ movd xm3, t7d ; otherwise stage the packed weight pair in the low dword of xm3 first
%if cpuflag(avx2)
- vpbroadcastw m3, xm3
+ vpbroadcastw m3, xm3 ; AVX2: broadcast word 0 of xm3 across all of m3
%else
- SPLATW m3, m3 ; weight_dst,src
+ SPLATW m3, m3 ; weight_dst,src
+%endif
%endif
%endmacro
@@ -3586,6 +3593,34 @@
vextracti128 [t0+t1], m0, 1
AVG_END
+INIT_ZMM avx512
+ cglobal pixel_avg_weight_w16 ; weighted average of two 16-byte-wide sources (t2, t4) into t0, four rows per loop iteration
+ BIWEIGHT_START ; NOTE(review): presumably resolves to BIWEIGHT_START_SSSE3, leaving the weight pairs in m3 and pw_512 in m4 - confirm
+ AVG_START 5
+.height_loop:
+ movu xm0, [t2] ; lane 0 of m0 = row 0 of the first source
+ movu xm1, [t4] ; lane 0 of m1 = row 0 of the second source
+ vinserti128 ym0, [t2+t3], 1 ; lane 1 = row 1 (t3 is used as the first source's row offset)
+ vinserti128 ym1, [t4+t5], 1 ; lane 1 = row 1 (t5 is used as the second source's row offset)
+ lea t2, [t2+t3*2] ; advance both source pointers by two rows
+ lea t4, [t4+t5*2]
+ vinserti32x4 m0, [t2], 2 ; lane 2 = row 2
+ vinserti32x4 m1, [t4], 2
+ vinserti32x4 m0, [t2+t3], 3 ; lane 3 = row 3
+ vinserti32x4 m1, [t4+t5], 3
+ SBUTTERFLY bw, 0, 1, 2 ; NOTE(review): presumably a per-lane byte interleave of m0/m1 (low halves to m0, high halves to m1, m2 as scratch) - confirm against the macro
+ pmaddubsw m0, m3 ; multiply each interleaved byte pair by the weight pair in m3 and add, giving one 16-bit sum per pixel
+ pmaddubsw m1, m3
+ pmulhrsw m0, m4 ; rounded scale by m4 (pw_512; presumably 512 per word, which makes this a rounded shift right by 6 - confirm)
+ pmulhrsw m1, m4
+ packuswb m0, m1 ; saturate to unsigned bytes; per-lane packing restores one 16-pixel row per 128-bit lane
+ mova [t0], xm0 ; store row 0 (mova: aligned 16-byte store to the destination)
+ vextracti128 [t0+t1], ym0, 1 ; store row 1 (t1 is used as the destination row offset)
+ lea t0, [t0+t1*2] ; advance the destination by two rows
+ vextracti32x4 [t0], m0, 2 ; store row 2
+ vextracti32x4 [t0+t1], m0, 3 ; store row 3
+ AVG_END 4 ; steps t2/t4/t0 two more rows, subtracts 4 from the counter in eax and loops while it stays positive
+
cglobal pixel_avg_weight_w32
BIWEIGHT_START
AVG_START 5
@@ -4345,6 +4380,10 @@
AVGH 16, 8
AVGH 16, 4
+INIT_XMM avx512
+AVGH 16, 16 ; NOTE(review): presumably emits the pixel_avg_16x16 avx512 entry point that asm-primitives.cpp assigns to pixelavg_pp - confirm against the AVGH macro
+AVGH 16, 8 ; NOTE(review): likewise for pixel_avg_16x8 - confirm
+
%endif ;HIGH_BIT_DEPTH
;-------------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list