[x265] [PATCH 300 of 307] x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:58 CEST 2018
# HG changeset patch
# User Jayashree
# Date 1517283539 28800
# Mon Jan 29 19:38:59 2018 -0800
# Node ID 3c6e5ce07dbca7f967e4b5b62fe450979da3bf81
# Parent 624c83571d1df840e1206c46e589044fbf87ff32
x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2
count_nonzero[16x16] 18.88x -> 23.04x
diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Jan 12 12:40:16 2018 -0800
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 29 19:38:59 2018 -0800
@@ -5375,6 +5375,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
p.planecopy_sp_shl = PFX(upShift_16_avx512);
+ p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512);
}
#endif
diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Fri Jan 12 12:40:16 2018 -0800
+++ b/source/common/x86/pixel-util.h Mon Jan 29 19:38:59 2018 -0800
@@ -61,4 +61,6 @@
uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
+int PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
+
#endif // ifndef X265_PIXEL_UTIL_H
diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Jan 12 12:40:16 2018 -0800
+++ b/source/common/x86/pixel-util8.asm Mon Jan 29 19:38:59 2018 -0800
@@ -4,6 +4,7 @@
;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
;* Nabajit Deka <nabajit at multicorewareinc.com>
;* Rajesh Paulraj <rajesh at multicorewareinc.com>
+;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
@@ -1857,6 +1858,30 @@
movd eax, xm0
RET
+;-----------------------------------------------------------------------------
+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff);
+; Returns in eax how many of the 256 int16 coefficients at quantCoeff are != 0.
+; Each of the 4 unrolled steps packs 64 words to bytes and counts nonzero bytes.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal count_nonzero_16x16, 1,4,2
+    xor             r3d, r3d                    ; r3d = running count (32-bit write clears r3)
+    pxor            m0, m0                      ; m0 = all-zero reference for the compare
+
+%assign x 0
+%rep 4
+    movu            m1, [r0 + x]
+    vpacksswb       m1, [r0 + x + 64]           ; signed saturation keeps nonzero words nonzero
+%assign x x+128
+    vpcmpb          k1, m1, m0, 4               ; predicate 4 = NEQ; unmasked, so all 64 bytes
+    kmovq           r1, k1
+    popcnt          r2, r1                      ; nonzero coefficients in this 128-byte step
+    add             r3d, r2d
+%endrep
+    mov             eax, r3d
+
+    RET
+
;-----------------------------------------------------------------------------
; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);
More information about the x265-devel mailing list.