[x265] [PATCH 301 of 307] x86: AVX512 'count_nonzero_32x32' avx-512 kernel
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:34:59 CEST 2018
# HG changeset patch
# User Jayashree
# Date 1517285149 28800
# Mon Jan 29 20:05:49 2018 -0800
# Node ID 3a08a957d4cd2bf0eb57524651a824513378e0a3
# Parent 3c6e5ce07dbca7f967e4b5b62fe450979da3bf81
x86: AVX512 'count_nonzero_32x32' avx-512 kernel
diff -r 3c6e5ce07dbc -r 3a08a957d4cd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 29 19:38:59 2018 -0800
+++ b/source/common/x86/asm-primitives.cpp Mon Jan 29 20:05:49 2018 -0800
@@ -5376,6 +5376,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
p.planecopy_sp_shl = PFX(upShift_16_avx512);
p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512);
+ p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32x32_avx512);
}
#endif
diff -r 3c6e5ce07dbc -r 3a08a957d4cd source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Mon Jan 29 19:38:59 2018 -0800
+++ b/source/common/x86/pixel-util.h Mon Jan 29 20:05:49 2018 -0800
@@ -62,5 +62,6 @@
uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
int PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
+int PFX(count_nonzero_32x32_avx512(const int16_t* quantCoeff)); // AVX-512 count of nonzero coefficients in a 32x32 block; implemented in pixel-util8.asm
#endif // ifndef X265_PIXEL_UTIL_H
diff -r 3c6e5ce07dbc -r 3a08a957d4cd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Jan 29 19:38:59 2018 -0800
+++ b/source/common/x86/pixel-util8.asm Mon Jan 29 20:05:49 2018 -0800
@@ -1932,6 +1932,30 @@
RET
+;-----------------------------------------------------------------------------
+; int x265_count_nonzero_32x32_avx512(const int16_t *quantCoeff); returns how many of the 1024 (32x32) int16 values read from quantCoeff are nonzero
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal count_nonzero_32x32, 1,4,2
+ mov r1, 0xFFFFFFFFFFFFFFFF ; 64 set bits: one per byte lane of a ZMM register
+ kmovq k2, r1 ; k2 = write mask enabling all 64 byte lanes in the compare below
+ xor r3, r3 ; r3 = running count of nonzero coefficients
+ pxor m0, m0 ; m0 = all zeros, the reference operand for the compare
+; Fully unrolled: each of the 16 iterations covers 128 bytes (64 int16 coefficients), 16 * 64 = 1024 in total.
+%assign x 0
+%rep 16
+ movu m1, [r0 + x] ; unaligned load of 64 bytes (32 int16 coefficients)
+ vpacksswb m1, [r0 + x + 64] ; pack with the next 32 words into 64 bytes; signed saturation keeps every nonzero word nonzero
+%assign x x+128
+ vpcmpb k1 {k2}, m1, m0, 00000100b ; predicate 4 = not-equal: k1 gets one bit per byte that differs from zero
+ kmovq r1, k1 ; move the 64-bit result mask to a general-purpose register
+ popcnt r2, r1 ; r2 = number of nonzero coefficients in this 128-byte group
+ add r3d, r2d ; accumulate into the running total
+%endrep
+ mov eax, r3d ; return value = total nonzero count
+; (packing permutes element order within the register, which does not affect a population count)
+ RET
+
;-----------------------------------------------------------------------------------------------------------------------------------------------
;void weight_pp(pixel *src, pixel *dst, intptr_t stride, int width, int height, int w0, int round, int shift, int offset)
;-----------------------------------------------------------------------------------------------------------------------------------------------
More information about the x265-devel
mailing list