[x265] [PATCH 219 of 307] [x265-avx512]x86: AVX512 nquant
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:37 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1511947290 -19800
# Wed Nov 29 14:51:30 2017 +0530
# Node ID 3e2058cec6c6f4ad49d92f9df7fbc110a54f4b4b
# Parent d7af8d747bffacafa5dfe8f4d513bbd09314ad63
[x265-avx512]x86: AVX512 nquant
AVX2 Performance : 21.42x
AVX512 Performance : 25.60x
diff -r d7af8d747bff -r 3e2058cec6c6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 30 15:29:18 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Nov 29 14:51:30 2017 +0530
@@ -2887,8 +2887,7 @@
p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
p.quant = PFX(quant_avx512);
-
-
+ p.nquant = PFX(nquant_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
@@ -5015,7 +5014,7 @@
p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
p.quant = PFX(quant_avx512);
-
+ p.nquant = PFX(nquant_avx512);
}
#endif
}
diff -r d7af8d747bff -r 3e2058cec6c6 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Nov 30 15:29:18 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Nov 29 14:51:30 2017 +0530
@@ -1277,7 +1277,101 @@
paddd xm5, xm0
movd eax, xm5
RET
-
+%if ARCH_X86_64 == 1 ; only assembled for x86-64 builds; the routine uses vector registers up to m21
+INIT_ZMM avx512 ; 512-bit vectors: the mmsize-based offsets and pointer steps below assume 64 bytes
+cglobal nquant, 3,5,22 ; per coefficient: out = |((|in| * mul + r4m) >> r3m) with the sign of in|, saturated to 16 bits; r0 = 16-bit inputs, r1 = 32-bit multipliers, r2 = 16-bit outputs, r5m = coefficient count; eax returns how many outputs are non-zero (assuming pw_1 holds 16-bit ones)
+%if UNIX64 == 0 ; broadcast the r4m addend to every 32-bit lane of m4
+ vpbroadcastd m4, r4m
+%else ; Mac (UNIX64 != 0): stage r4m in xm4 first, then broadcast from the register
+ movd xm4, r4m
+ vpbroadcastd m4, xm4
+%endif
+
+ vbroadcasti32x8 m6, [pw_1] ; 32 bytes at pw_1 replicated to both 256-bit halves (presumably 16-bit ones - confirm against the constant table)
+ mov r4d, r5m ; r4d becomes the coefficient count; r4m was already consumed into m4 above
+ pxor m5, m5 ; m5 = per-word running count of non-zero outputs
+ movd xm3, r3m ; shift count for psrad
+ sub r4d, 16 ; exactly 16 coefficients take the single-vector path
+ je .coeff16
+ add r4d, 16 ; restore the count
+ shr r4d, 5 ; loop iterations = count / 32; NOTE: any count other than 16 must be a non-zero multiple of 32, a remainder is dropped and a zero count would wrap the dec/jnz loop
+ jmp .loop
+
+.coeff16: ; handles one group of 16 coefficients and returns
+ pmovsxwd m16, [r0] ; sign-extend 16 input words to dwords
+ pabsd m17, m16 ; |in|
+ pmulld m17, [r1] ; multiply by the 16 dwords at r1
+ paddd m17, m4 ; add the broadcast r4m value
+ psrad m17, xm3 ; arithmetic shift right by r3m
+
+ vextracti64x4 ym19, m17, 1 ; upper 8 shifted results
+ vextracti64x4 ym20, m16, 1 ; upper 8 sign-extended inputs
+ psignd ym17, ym16 ; give the low 8 results the sign of their inputs (zero where the input is zero)
+ psignd ym19, ym20 ; same for the upper 8
+ packssdw ym17, ym19 ; saturate to 16 bits; the pack works per 128-bit lane, so the qwords come out interleaved
+ vpermq ym17, ym17, q3120 ; reorder qwords to restore coefficient order
+ pabsw ym17, ym17 ; the stored value is the magnitude
+ movu [r2], ym17 ; write 16 output words
+ pminuw ym17, ym6 ; clamp each word to at most the pw_1 value, i.e. 1 per non-zero output if pw_1 is ones
+ paddw ym5, ym17 ; accumulate the flags
+ pxor m0, m0
+ psadbw ym5, ym0 ; sum the bytes of each 64-bit group against zero: one partial count per qword
+ vextracti128 xm0, ym5, 1 ; fold the upper 128 bits onto the lower
+ paddd xm5, xm0
+ pshufd xm0, xm5, 2 ; bring dword 2 (the upper qword's partial count) down to dword 0
+ paddd xm5, xm0
+ movd eax, xm5 ; return the total
+ RET
+
+.loop: ; each iteration processes 32 coefficients as two groups of 16
+ pmovsxwd m16, [r0] ; first 16 inputs, sign-extended to dwords
+ pabsd m17, m16 ; |in|
+ pmulld m17, [r1] ; multiply by the first 16 dwords at r1
+ paddd m17, m4 ; add the broadcast r4m value
+ psrad m17, xm3 ; arithmetic shift right by r3m
+ vextracti64x4 ym19, m17, 1 ; upper 8 shifted results
+ vextracti64x4 ym20, m16, 1 ; upper 8 sign-extended inputs
+ psignd ym17, ym16 ; reapply the input signs, low 8
+ psignd ym19, ym20 ; reapply the input signs, upper 8
+ packssdw ym17, ym19 ; saturate to 16 bits (qwords interleaved per 128-bit lane)
+
+ pmovsxwd m16, [r0 + mmsize/2] ; second 16 inputs, 32 bytes further on
+ pabsd m18, m16 ; |in|
+ pmulld m18, [r1 + mmsize] ; matching 16 dwords, 64 bytes further on
+ paddd m18, m4 ; add the broadcast r4m value
+ psrad m18, xm3 ; arithmetic shift right by r3m
+ vextracti64x4 ym21, m18, 1 ; upper 8 shifted results
+ vextracti64x4 ym20, m16, 1 ; upper 8 sign-extended inputs
+ psignd ym18, ym16 ; reapply the input signs, low 8
+ psignd ym21, ym20 ; reapply the input signs, upper 8
+ packssdw ym18, ym21 ; saturate to 16 bits (qwords interleaved per 128-bit lane)
+ vinserti64x4 m17, m17, ym18, 1 ; second group goes into the upper 256 bits next to the first
+ vpermq m17, m17, q3120 ; reorder qwords within each 256-bit half to restore coefficient order
+
+ pabsw m17, m17 ; the stored value is the magnitude
+ movu [r2], m17 ; write 32 output words
+
+ add r0, mmsize ; advance past 32 input words
+ add r1, mmsize * 2 ; advance past 32 multiplier dwords
+ add r2, mmsize ; advance past 32 output words
+
+ pminuw m17, m6 ; clamp each word to at most the pw_1 value, i.e. 1 per non-zero output if pw_1 is ones
+ paddw m5, m17 ; accumulate per word lane; the byte-wise sum below is only exact while each lane stays under 256
+
+ dec r4d
+ jnz .loop
+
+ pxor m0, m0
+ psadbw m5, m0 ; sum the bytes of each 64-bit group against zero: one partial count per qword
+ vextracti32x8 ym1, m5, 1 ; fold the upper 256 bits onto the lower
+ paddd ym5, ym1
+ vextracti64x2 xm1, m5, 1 ; fold bits 128-255 onto the low 128
+ paddd xm5, xm1
+ pshufd xm1, xm5, 2 ; bring dword 2 (the upper qword's partial count) down to dword 0
+ paddd xm5, xm1
+ movd eax, xm5 ; return the total
+ RET
+%endif ; ARCH_X86_64 == 1
;-----------------------------------------------------------------------------
; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
diff -r d7af8d747bff -r 3e2058cec6c6 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Thu Nov 30 15:29:18 2017 +0530
+++ b/source/test/mbdstharness.cpp Wed Nov 29 14:51:30 2017 +0530
@@ -252,12 +252,10 @@
bool MBDstHarness::check_nquant_primitive(nquant_t ref, nquant_t opt)
{
int j = 0;
-
for (int i = 0; i < ITERS; i++)
{
- int width = (rand() % 4 + 1) * 4;
+ int width = 1 << (rand() % 4 + 2);
int height = width;
-
uint32_t optReturnValue = 0;
uint32_t refReturnValue = 0;
More information about the x265-devel
mailing list