[x265] [PATCH 3 of 3] asm: avx2 version of nquant(), improve 11k cycles -> 7k cycles
Min Chen
chenm003 at 163.com
Thu Sep 4 01:37:49 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409787455 25200
# Node ID 8d109ec524a7767925084049a77668d004a5b319
# Parent 4ca9e972f48cb4530ca7181ad7cec351568a99b3
asm: avx2 version of nquant(), improve 11k cycles -> 7k cycles
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/dct.cpp
--- a/source/common/dct.cpp Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/dct.cpp Wed Sep 03 16:37:35 2014 -0700
@@ -803,7 +803,7 @@
X265_CHECK((numCoeff % 16) == 0, "number of quant coeff is not multiple of 4x4\n");
X265_CHECK((uint32_t)add < ((uint32_t)1 << qBits), "2 ^ qBits less than add\n");
- X265_CHECK(((intptr_t)quantCoeff & 15) == 0, "quantCoeff buffer not aligned\n");
+ X265_CHECK(((intptr_t)quantCoeff & 31) == 0, "quantCoeff buffer not aligned\n");
for (int blockpos = 0; blockpos < numCoeff; blockpos++)
{
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp Wed Sep 03 16:37:35 2014 -0700
@@ -1432,6 +1432,7 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.nquant = x265_nquant_avx2;
}
/* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
@@ -1715,6 +1716,7 @@
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
p.denoiseDct = x265_denoise_dct_avx2;
+ p.nquant = x265_nquant_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/const-a.asm Wed Sep 03 16:37:35 2014 -0700
@@ -86,7 +86,8 @@
const pd_1024, times 4 dd 1024
const pd_2048, times 4 dd 2048
const pd_ffff, times 4 dd 0xffff
-const pd_n32768, times 4 dd 0xffff8000
+const pd_32767, times 8 dd 32767 ; 8 dwords (32 bytes) so it can be a full ymm memory operand; upper clip bound in nquant avx2
+const pd_n32768, times 8 dd 0xffff8000 ; -32768 per dword; widened from 4 to 8 dwords for ymm-sized reads (lower clip bound)
const pw_ff00, times 8 dw 0xff00
const multi_2Row, dw 1, 2, 3, 4, 1, 2, 3, 4
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/pixel-util.h Wed Sep 03 16:37:35 2014 -0700
@@ -46,6 +46,7 @@
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff);
uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff); // AVX2 variant; same parameter list as x265_nquant_sse4
void x265_dequant_normal_sse4(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift);
int x265_count_nonzero_ssse3(const int32_t *quantCoeff, int numCoeff);
diff -r 4ca9e972f48c -r 8d109ec524a7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Sep 03 16:36:59 2014 -0700
+++ b/source/common/x86/pixel-util8.asm Wed Sep 03 16:37:35 2014 -0700
@@ -54,6 +54,8 @@
cextern pw_00ff
cextern pw_2000
cextern pw_pixel_max
+cextern pd_32767 ; dword 32767 table, used as the pminsd operand in nquant avx2
+cextern pd_n32768 ; dword -32768 table, used as the pmaxsd operand in nquant avx2
;-----------------------------------------------------------------------------
; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
@@ -991,6 +993,61 @@
sub eax, r4d ; numSig
RET
+
+;-----------------------------------------------------------------------------
+; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int32_t *qCoef, int qBits, int add, int numCoeff);
+;
+; For every coefficient:
+;     level        = (abs(coef[i]) * quantCoeff[i] + add) >> qBits
+;     qCoef[i]     = clip(-32768, 32767, level with the sign of coef[i])
+; Returns numCoeff minus the number of zero results (numSig).
+;
+; Register roles:
+;     r0 = coef, r1 = quantCoeff, r2 = qCoef (all advanced by 64 bytes per pass)
+;     r3 = numCoeff, r4 = remaining passes (numCoeff / 16)
+;     m4 = 0, m5 = qBits (shift count), m6 = add in every dword,
+;     m7 = per-word counters of zero results
+;
+; Processes 16 coefficients per pass; the loop is entered unconditionally,
+; so numCoeff must be a non-zero multiple of 16.
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal nquant, 3,5,8
+    ; r4m may be a general-purpose register (UNIX64) or a stack slot. The AVX2
+    ; form of vpbroadcastd only accepts an xmm or m32 source, so stage the
+    ; value through xm6 first; this is valid for either kind of r4m.
+    movd            xm6, r4m
+    vpbroadcastd    m6, xm6             ; m6 = add (all 8 dwords)
+    mov             r4d, r5m            ; read numCoeff only after r4m was consumed
+    movd            xm5, r3m            ; m5 = qbits (must precede the r3 overwrite)
+    mov             r3d, r4d            ; r3 = numCoeff
+    shr             r4d, 4              ; r4 = number of 16-coefficient passes
+    pxor            m7, m7              ; m7 = numZero
+    pxor            m4, m4              ; m4 = 0, loop-invariant compare operand
+
+.loop:
+    ; first 8 coefficients
+    movu            m0, [r0]            ; m0 = level
+    pabsd           m1, m0
+    pmulld          m1, [r1]            ; m1 = tmpLevel1
+    paddd           m1, m6
+    psrad           m1, xm5             ; m1 = level1
+    psignd          m1, m0              ; restore sign
+
+    ; next 8 coefficients
+    movu            m2, [r0 + mmsize]   ; m2 = level
+    pabsd           m3, m2
+    pmulld          m3, [r1 + mmsize]   ; m3 = tmpLevel1
+    paddd           m3, m6
+    psrad           m3, xm5             ; m3 = level1
+    psignd          m3, m2              ; restore sign
+
+    add             r0, 2 * mmsize
+    add             r1, 2 * mmsize
+
+    ; Saturating pack keeps non-zero dwords non-zero, so the packed words are
+    ; only used to test for zero; their order within the register is irrelevant.
+    packssdw        m0, m1, m3
+
+    ; clip to [-32768, 32767] and store as 32-bit values
+    pminsd          m2, m1, [pd_32767]
+    pmaxsd          m2, m2, [pd_n32768]
+    movu            [r2], m2
+
+    pminsd          m2, m3, [pd_32767]
+    pmaxsd          m2, m2, [pd_n32768]
+    movu            [r2 + mmsize], m2
+    add             r2, 2 * mmsize
+
+    pcmpeqw         m0, m4              ; 0xFFFF (-1) in every word that is zero
+    psubw           m7, m0              ; subtracting -1 adds one per zero result
+
+    dec             r4d
+    jnz            .loop
+
+    ; Horizontal sum of the 16 word counters in m7.
+    ; NOTE(review): packuswb saturates at 255, so this relies on each summed
+    ; word counter (at most numCoeff / 8) staying below 256 - confirm against
+    ; the largest numCoeff callers pass.
+    vextracti128    xm0, m7, 1
+    paddw           xm7, xm0            ; fold high 128-bit lane onto low lane
+    packuswb        xm7, xm7            ; 8 word counters -> 8 bytes (duplicated)
+    psadbw          xm7, xm4            ; sum of the low 8 bytes lands in the low qword
+    mov             eax, r3d
+    movd            r4d, xm7
+    sub             eax, r4d            ; numSig
+    RET
+
;-----------------------------------------------------------------------------
; void dequant_normal(const int32_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list