[x265] [PATCH 300 of 307] x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:34:58 CEST 2018


# HG changeset patch
# User Jayashree
# Date 1517283539 28800
#      Mon Jan 29 19:38:59 2018 -0800
# Node ID 3c6e5ce07dbca7f967e4b5b62fe450979da3bf81
# Parent  624c83571d1df840e1206c46e589044fbf87ff32
x86: AVX512 'count_nonzero_16x16' avx-512 kernel, 22% speedup over avx2

count_nonzero[16x16]   18.88x ->  23.04x

diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Jan 12 12:40:16 2018 -0800
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 29 19:38:59 2018 -0800
@@ -5375,6 +5375,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_avx512);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx512);
         p.planecopy_sp_shl = PFX(upShift_16_avx512);
+        p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16x16_avx512);
 
     }
 #endif
diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Fri Jan 12 12:40:16 2018 -0800
+++ b/source/common/x86/pixel-util.h	Mon Jan 29 19:38:59 2018 -0800
@@ -61,4 +61,6 @@
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 
+int  PFX(count_nonzero_16x16_avx512(const int16_t* quantCoeff));
+
 #endif // ifndef X265_PIXEL_UTIL_H
diff -r 624c83571d1d -r 3c6e5ce07dbc source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Fri Jan 12 12:40:16 2018 -0800
+++ b/source/common/x86/pixel-util8.asm	Mon Jan 29 19:38:59 2018 -0800
@@ -4,6 +4,7 @@
 ;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
 ;*          Nabajit Deka <nabajit at multicorewareinc.com>
 ;*          Rajesh Paulraj <rajesh at multicorewareinc.com>
+;*          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -1857,6 +1858,30 @@
     movd            eax, xm0
     RET
 
+;-----------------------------------------------------------------------------
+; int x265_count_nonzero_16x16_avx512(const int16_t *quantCoeff); returns how many of the 256 int16 values (512 bytes) read from quantCoeff are nonzero
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal count_nonzero_16x16, 1,4,2
+    mov             r1, 0xFFFFFFFFFFFFFFFF      ; r1 = all ones
+    kmovq           k2, r1                      ; k2 = all 64 mask bits set; NOTE(review): this makes the {k2} predicate below a no-op - confirm it is needed
+    xor             r3, r3                      ; r3 = running nonzero count; NOTE(review): xor r3d, r3d would zero it equally - confirm
+    pxor            m0, m0                      ; m0 = 0, the reference value for the byte compare
+
+%assign x 0
+%rep 4
+    movu            m1, [r0 + x]                ; load 64 bytes = 32 coefficients
+    vpacksswb       m1, [r0 + x + 64]           ; pack with the next 32 coefficients (signed-saturating word->byte): 64 bytes, a nonzero word stays a nonzero byte; byte order is irrelevant since only the count is used
+%assign x x+128
+    vpcmpb          k1 {k2}, m1, m0, 00000100b  ; predicate 4 = not-equal: k1 gets one bit set per byte != 0
+    kmovq           r1, k1                      ; move the 64-bit mask to a GPR
+    popcnt          r2, r1                      ; r2 = number of nonzero coefficients in this group of 64
+    add             r3d, r2d                    ; accumulate (at most 4 * 64 = 256 in total)
+%endrep
+    mov             eax, r3d                    ; return the total in eax
+
+    RET
+
 
 ;-----------------------------------------------------------------------------
 ; int x265_count_nonzero_32x32_sse2(const int16_t *quantCoeff);


More information about the x265-devel mailing list