[x265] [PATCH 213 of 307] [x265-avx512]x86: AVX512 Quant
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:31 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1511857266 -19800
# Tue Nov 28 13:51:06 2017 +0530
# Node ID 0c4c63916d6e5af144bad8280706da6f3a613e75
# Parent 8eeff916ebe608526f167177a19c4516266ba513
[x265-avx512]x86: AVX512 Quant
AVX2 Performance : 18.77x
AVX512 Performance : 23.83x
diff -r 8eeff916ebe6 -r 0c4c63916d6e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Nov 29 12:01:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Nov 28 13:51:06 2017 +0530
@@ -2886,6 +2886,7 @@
p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
+ p.quant = PFX(quant_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
@@ -5007,6 +5008,7 @@
p.cu[BLOCK_8x8].idct = PFX(idct8_avx512);
p.cu[BLOCK_16x16].idct = PFX(idct16_avx512);
p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
+ p.quant = PFX(quant_avx512);
}
#endif
diff -r 8eeff916ebe6 -r 0c4c63916d6e source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Nov 29 12:01:16 2017 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Nov 28 13:51:06 2017 +0530
@@ -1044,6 +1044,133 @@
%endif ; ARCH_X86_64 == 1
+%if ARCH_X86_64 == 1
+INIT_ZMM avx512
+cglobal quant, 5, 6, 22
+ ; Quantize int16 levels at [r0] by int32 scales at [r1]: deltaU dwords -> [r2], signed int16 results -> [r3], eax = nonzero-result count
+ movd xm4, r4d ; xm4 = qbits (r4d), the shift count used to form level1
+
+ ; qbits - 8 is the shift applied to the raw product when forming deltaU
+ sub r4d, 8
+ movd xm6, r4d ; xm6 = qbits8 = qbits - 8
+
+ ; broadcast the rounding offset "add" (r5m) to every dword lane of m5
+%if UNIX64 == 0
+ vpbroadcastd m5, r5m ; m5 = add
+%else ; UNIX64 != 0
+ movd xm5, r5m ; xm5 = add, moved into a vector register before broadcasting
+ vpbroadcastd m5, xm5 ; m5 = add
+%endif
+
+ vbroadcasti32x8 m9, [pw_1] ; m9 = 256 bits at pw_1 copied to both halves (pw_1 is defined outside this hunk; presumably words of 1)
+
+ mov r4d, r6m ; r4d = r6m, the coefficient count (presumably numCoeff, as in nquant's prototype)
+ pxor m7, m7 ; m7 = 0: per-word accumulator of nonzero results
+ sub r4d, 32
+ jl .coeff16 ; count < 32 (signed) -> single 16-coefficient pass
+ add r4d, 32 ; undo the subtraction that was only used for the compare
+ shr r4d, 5 ; r4d = count / 32 loop iterations; a remainder (count % 32) would be left unprocessed
+ jmp .loop
+
+.coeff16:
+ ; count < 32: process exactly 16 coefficients, then return
+ pxor m7, m7 ; NOTE(review): m7 was already zeroed before the branch; this looks redundant
+ pmovsxwd m16, [r0] ; m16 = level: 16 int16 sign-extended to dwords
+ pabsd m1, m16 ; m1 = |level|
+ pmulld m1, [r1] ; m1 = |level| * scale (low 32 bits of each product)
+ paddd m17, m1, m5 ; m17 = product + add
+ psrad m17, xm4 ; m17 = level1 = (product + add) >> qbits
+
+ pslld m3, m17, 8 ; m3 = level1 << 8
+ psrad m1, xm6 ; m1 = product >> qbits8
+ psubd m1, m3 ; m1 = deltaU1 = (product >> qbits8) - (level1 << 8)
+ movu [r2], m1 ; store 16 deltaU dwords
+ vextracti64x4 ym19, m17, 1 ; ym19 = upper 8 dwords of level1
+ vextracti64x4 ym20, m16, 1 ; ym20 = upper 8 dwords of level
+ psignd ym17, ym16 ; give level1 the sign of level (low 8 dwords). NOTE(review): psignd has no EVEX form, so ym16+ presumably relies on x86inc register renumbering - confirm
+ psignd ym19, ym20 ; same sign transfer for the high 8 dwords
+ packssdw ym17, ym19 ; saturate to int16; packs per 128-bit lane, so the qwords come out interleaved
+ vpermq ym17, ym17, q3120 ; undo the lane interleave (swap the two middle qwords) to restore coefficient order
+ movu [r3], ym17 ; store 16 quantized int16 results
+
+ pminuw ym17, ym9 ; unsigned min with m9 -> per-word nonzero flag (assuming pw_1 holds 1s)
+ paddw ym7, ym17 ; accumulate the flags
+
+ ; horizontal sum of the per-word flags into eax
+ xorpd m0, m0 ; m0 = 0
+ psadbw ym7, ym0 ; each qword = sum of its 8 bytes
+ vextracti128 xm1, ym7, 1 ; xm1 = upper 128 bits of ym7
+ paddd xm7, xm1 ; fold 4 partial sums into 2
+ movhlps xm0, xm7 ; xm0 low qword = xm7 high qword
+ paddd xm7, xm0 ; fold 2 partial sums into 1
+ movd eax, xm7 ; return value = number of nonzero results
+ RET
+
+.loop:
+ ; first 16 of the 32 coefficients handled per iteration
+ pmovsxwd m16, [r0] ; m16 = level: 16 int16 sign-extended to dwords
+ pabsd m1, m16 ; m1 = |level|
+ pmulld m1, [r1] ; m1 = |level| * scale (low 32 bits of each product)
+ paddd m17, m1, m5 ; m17 = product + add
+ psrad m17, xm4 ; m17 = level1 = (product + add) >> qbits
+
+ pslld m3, m17, 8 ; m3 = level1 << 8
+ psrad m1, xm6 ; m1 = product >> qbits8
+ psubd m1, m3 ; m1 = deltaU1 = (product >> qbits8) - (level1 << 8)
+ movu [r2], m1 ; store 16 deltaU dwords
+ vextracti64x4 ym19, m17, 1 ; ym19 = upper 8 dwords of level1
+ vextracti64x4 ym20, m16, 1 ; ym20 = upper 8 dwords of level
+ psignd ym17, ym16 ; give level1 the sign of level (low 8 dwords)
+ psignd ym19, ym20 ; same for the high 8 dwords
+ packssdw ym17, ym19 ; ym17 = 16 saturated int16 results, qwords lane-interleaved
+
+ ; second 16 of the 32 coefficients handled per iteration
+ pmovsxwd m16, [r0 + mmsize/2] ; m16 = level, next 16 coefficients
+ pabsd m1, m16 ; m1 = |level|
+ pmulld m1, [r1 + mmsize] ; m1 = |level| * scale for the next 16 dword scales
+ paddd m18, m1, m5 ; m18 = product + add
+ psrad m18, xm4 ; m18 = level1 for the second group
+
+ pslld m8, m18, 8 ; m8 = level1 << 8
+ psrad m1, xm6 ; m1 = product >> qbits8
+ psubd m1, m8 ; m1 = deltaU1 = (product >> qbits8) - (level1 << 8)
+ movu [r2 + mmsize], m1 ; store the next 16 deltaU dwords
+ vextracti64x4 ym21, m18, 1 ; ym21 = upper 8 dwords of level1
+ vextracti64x4 ym20, m16, 1 ; ym20 = upper 8 dwords of level
+ psignd ym18, ym16 ; give level1 the sign of level (low 8 dwords)
+ psignd ym21, ym20 ; same for the high 8 dwords
+ packssdw ym18, ym21 ; ym18 = 16 saturated int16 results, qwords lane-interleaved
+ vinserti64x4 m17, m17, ym18, 1 ; m17 = first group in the low half, second group in the high half
+ vpermq m17, m17, q3120 ; undo the lane interleave within each 256-bit half
+
+ movu [r3], m17 ; store 32 quantized int16 results
+
+ pminuw m17, m9 ; unsigned min with m9 -> per-word nonzero flag (assuming pw_1 holds 1s)
+ paddw m7, m17 ; accumulate the flags across iterations
+
+ add r0, mmsize ; advance past the int16 inputs consumed this iteration
+ add r1, mmsize * 2 ; advance past the dword scales consumed this iteration
+ add r2, mmsize * 2 ; advance past the deltaU dwords written this iteration
+ add r3, mmsize ; advance past the int16 results written this iteration
+
+ dec r4d ; one fewer 32-coefficient iteration remaining
+ jnz .loop
+
+ ; horizontal sum of the per-word flags into eax
+ xorpd m0, m0 ; m0 = 0
+ psadbw m7, m0 ; each qword = sum of its 8 bytes
+ vextracti32x8 ym1, m7, 1 ; ym1 = upper 256 bits of m7
+ paddd ym7, ym1 ; fold 8 partial sums into 4
+ vextracti64x2 xm1, m7, 1 ; xm1 = bits 128..255 of m7
+ paddd xm7, xm1 ; fold 4 partial sums into 2
+ pshufd xm1, xm7, 2 ; xm1 dword 0 = xm7 dword 2 (low dword of the high qword)
+ paddd xm7, xm1 ; fold 2 partial sums into 1
+ movd eax, xm7 ; return value = number of nonzero results
+ RET
+%endif ; ARCH_X86_64 == 1
+
+
+
;-----------------------------------------------------------------------------
; uint32_t nquant(int16_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list