[x265] [PATCH 004 of 307] x86: AVX-512 pixel_var_8x8 and 16x16
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:30:02 CEST 2018
# HG changeset patch
# User Vignesh Vijayakumar
# Date 1498218927 -19800
# Fri Jun 23 17:25:27 2017 +0530
# Node ID 2e5128235d577806f16e5cf93266dcd7f4155a63
# Parent 6ce366c4e4919a4f1641234824f6bf4f128df400
x86: AVX-512 pixel_var_8x8 and 16x16
diff -r 6ce366c4e491 -r 2e5128235d57 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 21 12:43:06 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Jun 23 17:25:27 2017 +0530
@@ -2188,6 +2188,10 @@
p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
}
}
+ if (cpuMask & X265_CPU_AVX512)
+ {
+ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
+ }
}
#else // if HIGH_BIT_DEPTH
@@ -3748,6 +3752,8 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_avx512);
+ p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_avx512);
+ p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_avx512);
}
#endif
}
diff -r 6ce366c4e491 -r 2e5128235d57 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jun 21 12:43:06 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Fri Jun 23 17:25:27 2017 +0530
@@ -28,6 +28,8 @@
SECTION_RODATA 32
+var_shuf_avx512: db 0,-1, 1,-1, 2,-1, 3,-1, 4,-1, 5,-1, 6,-1, 7,-1 ; pshufb control: source bytes 0-7, each followed by a -1 index (pshufb writes zero for it) -> 8 zero-extended words
+ db 8,-1, 9,-1,10,-1,11,-1,12,-1,13,-1,14,-1,15,-1 ; second 16 control bytes: same pattern for source bytes 8-15
%if BIT_DEPTH == 12
ssim_c1: times 4 dd 107321.76 ; .01*.01*4095*4095*64
ssim_c2: times 4 dd 60851437.92 ; .03*.03*4095*4095*64*63
@@ -5757,7 +5759,7 @@
%if HIGH_BIT_DEPTH == 0
%if %1
mova m7, [pw_00ff]
-%elif mmsize < 32
+%elif mmsize == 16
pxor m7, m7 ; zero
%endif
%endif ; !HIGH_BIT_DEPTH
@@ -6476,6 +6478,118 @@
RET
%endif ; !HIGH_BIT_DEPTH
+%macro VAR_AVX512_CORE 1 ; accum: %1 != 0 adds into m0/m1, %1 == 0 initialises them; inputs m2/m3 hold 16-bit words and are clobbered
+%if %1
+ paddw m0, m2 ; m0 (per-word running sums) += m2
+ pmaddwd m2, m2 ; m2 = squares of its words, adjacent pairs summed into dwords
+ paddw m0, m3 ; m0 (per-word running sums) += m3
+ pmaddwd m3, m3 ; same pairwise sum of squares for m3
+ paddd m1, m2 ; m1 (per-dword running sums of squares) += m2
+ paddd m1, m3 ; ... and += m3
+%else
+ paddw m0, m2, m3 ; first call: m0 = m2 + m3, the previous content of m0 is not read
+ pmaddwd m2, m2 ; squares, pairwise summed into dwords
+ pmaddwd m3, m3
+ paddd m1, m2, m3 ; first call: m1 = m2 + m3 (sums of squares), the previous content of m1 is not read
+%endif
+%endmacro
+
+%macro VAR_AVX512_CORE_16x16 1 ; accum: loads 4 rows at r0 (stride r1, r3 = 3*r1) as words into m2/m3, then runs VAR_AVX512_CORE %1; does not advance r0
+%if HIGH_BIT_DEPTH
+ mova ym2, [r0] ; row 0: 32 bytes into the low half of m2
+ vinserti64x4 m2, [r0+r1], 1 ; row 1: 32 bytes into the high half of m2
+ mova ym3, [r0+2*r1] ; row 2 into the low half of m3
+ vinserti64x4 m3, [r0+r3], 1 ; row 3 into the high half of m3
+%else
+ vbroadcasti64x2 ym2, [r0] ; row 0: 16 bytes copied into both 128-bit lanes of ym2
+ vbroadcasti64x2 m2 {k1}, [r0+r1] ; row 1: merge-masked broadcast; only the elements enabled in k1 are overwritten
+ vbroadcasti64x2 ym3, [r0+2*r1] ; row 2, same layout in m3
+ vbroadcasti64x2 m3 {k1}, [r0+r3] ; row 3, masked by k1
+ pshufb m2, m4 ; m4 is the shuffle control loaded by the caller; zero-extends the selected bytes of each lane to words
+ pshufb m3, m4
+%endif
+ VAR_AVX512_CORE %1 ; accumulate sums (m0) and sums of squares (m1)
+%endmacro
+
+%macro VAR_AVX512_CORE_8x8 1 ; accum: loads 8 rows starting at r0 (stride r1, r2 = 3*r1) into m2/m3, then runs VAR_AVX512_CORE %1; leaves r0 advanced by 4*r1
+%if HIGH_BIT_DEPTH
+ mova xm2, [r0] ; row 0: 16 bytes
+ mova xm3, [r0+r1] ; row 1: 16 bytes
+%else
+ movq xm2, [r0] ; row 0: 8 bytes
+ movq xm3, [r0+r1] ; row 1: 8 bytes
+%endif
+ vinserti128 ym2, [r0+2*r1], 1 ; row 2 -> lane 1 of m2. NOTE(review): 128-bit inserts read 16 bytes per row even in the 8-bit build, where punpcklbw keeps only the low 8 of each lane - assumes that over-read is safe; confirm
+ vinserti128 ym3, [r0+r2], 1 ; row 3 -> lane 1 of m3
+ lea r0, [r0+4*r1] ; r0 now points at row 4
+ vinserti32x4 m2, [r0], 2 ; row 4 -> lane 2 of m2
+ vinserti32x4 m3, [r0+r1], 2 ; row 5 -> lane 2 of m3
+ vinserti32x4 m2, [r0+2*r1], 3 ; row 6 -> lane 3 of m2
+ vinserti32x4 m3, [r0+r2], 3 ; row 7 -> lane 3 of m3
+%if HIGH_BIT_DEPTH == 0
+ punpcklbw m2, m4 ; callers zero m4 first, so this zero-extends the low 8 bytes of every lane to words
+ punpcklbw m3, m4
+%endif
+ VAR_AVX512_CORE %1 ; accumulate sums (m0) and sums of squares (m1)
+%endmacro
+
+INIT_ZMM avx512 ; the functions below are assembled for 512-bit registers; asm-primitives.cpp refers to them with an _avx512 suffix
+cglobal pixel_var_16x16, 2,4 ; 2 arguments (r0 = pixel pointer, r1 = stride), 4 GPRs; result is the pixel sum and sum of squares, packed as documented at the end
+ FIX_STRIDES r1 ; macro defined elsewhere; presumably converts the stride to bytes for high bit depth - confirm
+ mov r2d, 0xf0 ; doubles as the k1 mask value and as the loop counter
+ lea r3, [3*r1] ; r3 = 3 * stride, used to address the fourth row of each group
+%if HIGH_BIT_DEPTH == 0
+ vbroadcasti64x4 m4, [var_shuf_avx512] ; replicate the 32-byte shuffle table into both 256-bit halves of m4
+ kmovb k1, r2d ; k1 = 0xf0: for vbroadcasti64x2 this enables qword elements 4-7, i.e. the upper 256 bits
+%endif
+ VAR_AVX512_CORE_16x16 0 ; rows 0-3 initialise the accumulators
+.loop: ; each iteration handles the next four rows
+ lea r0, [r0+4*r1] ; advance four rows
+ VAR_AVX512_CORE_16x16 1
+ sub r2d, 0x50 ; 0xf0 -> 0xa0 -> 0x50 -> 0
+ jg .loop ; taken twice, so the loop body runs three times (rows 4-15)
+%if ARCH_X86_64 == 0
+ pop r3d ; restore r3 here and tell x86inc only 3 registers remain in use; presumably so the shared RET below matches the 3-GPR entry points that jump to var_avx512_end - confirm
+ %assign regs_used 3
+%endif
+var_avx512_end: ; shared tail: reduces m0 (word sums) and m1 (dword sums of squares) to the return value; also the jmp target of pixel_var_8x8 and pixel_var_8x16
+ vbroadcasti32x4 m2, [pw_1] ; pw_1 is defined elsewhere; presumably words of 1 - confirm
+ pmaddwd m0, m2 ; with words of 1 this adds adjacent word sums into dwords
+ SBUTTERFLY dq, 0, 1, 2 ; external macro; presumably interleaves the dwords of m0 and m1 using m2 as scratch - confirm
+ paddd m0, m1 ; assuming the interleave above: dwords alternate sum, sum of squares
+ vextracti32x8 ym1, m0, 1 ; fold 512 -> 256 bits
+ paddd ym0, ym1
+ vextracti128 xm1, ym0, 1 ; fold 256 -> 128 bits
+ paddd xmm0, xm0, xm1 ; NOTE(review): xmm0/xmm1 are spelled as raw register names from here on, unlike the xmN aliases above - presumably deliberate; confirm against the x86inc register mapping
+ punpckhqdq xmm1, xmm0, xmm0 ; copy the high qword down
+ paddd xmm0, xmm1 ; fold 128 -> 64 bits: dword 0 and dword 1 hold the two totals
+%if ARCH_X86_64
+ movq rax, xmm0 ; return both dwords packed in rax (sum in the low half, sum of squares in the high half, given the layout assumed above)
+%else
+ movd eax, xmm0 ; 32-bit build: dword 0 in eax
+ pextrd edx, xmm0, 1 ; and dword 1 in edx
 %endif
+ RET
+
+%if HIGH_BIT_DEPTH == 0 ; 8x8 doesn't benefit from AVX-512 in high bit-depth
+cglobal pixel_var_8x8, 2,3 ; 2 arguments (r0 = pixel pointer, r1 = stride), 3 GPRs; only built for 8-bit pixels, so there is no FIX_STRIDES here
+ lea r2, [3*r1] ; r2 = 3 * stride, as VAR_AVX512_CORE_8x8 expects
+ pxor xm4, xm4 ; zero for punpcklbw; a VEX/EVEX-encoded 128-bit write also clears the upper bits of m4
+ VAR_AVX512_CORE_8x8 0 ; all 8 rows in one pass, initialising m0/m1
+ jmp var_avx512_end ; shared reduction and RET inside pixel_var_16x16
+%endif
+
+cglobal pixel_var_8x16, 2,3 ; 2 arguments (r0 = pixel pointer, r1 = stride), 3 GPRs; 8 pixels wide, 16 rows
+ FIX_STRIDES r1 ; macro defined elsewhere; presumably converts the stride to bytes for high bit depth - confirm
+ lea r2, [3*r1] ; r2 = 3 * stride, as VAR_AVX512_CORE_8x8 expects
+%if HIGH_BIT_DEPTH == 0
+ pxor xm4, xm4 ; zero for punpcklbw; a VEX/EVEX-encoded 128-bit write also clears the upper bits of m4
+%endif
+ VAR_AVX512_CORE_8x8 0 ; rows 0-7 initialise m0/m1; the macro leaves r0 at row 4
+ lea r0, [r0+4*r1] ; move r0 from row 4 to row 8
+ VAR_AVX512_CORE_8x8 1 ; rows 8-15 are added to the accumulators
+ jmp var_avx512_end ; shared reduction and RET inside pixel_var_16x16
+
%macro VAR2_END 3
HADDW %2, xm1
movd r1d, %2
More information about the x265-devel
mailing list