[x265] [PATCH 3 of 3] asm: avx2 version of nquant(), improve 9.8k cycles -> 5.3k cycles
Min Chen
chenm003 at 163.com
Fri Sep 5 03:59:49 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409882374 25200
# Node ID ae1602e8571d9baf2287b1df77a4cf1230da9804
# Parent c6ba4d4834a73dfbc57d14f74936ab7f49a0c24d
asm: avx2 version of nquant(), improve 9.8k cycles -> 5.3k cycles
diff -r c6ba4d4834a7 -r ae1602e8571d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Sep 04 18:59:07 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp Thu Sep 04 18:59:34 2014 -0700
@@ -1441,6 +1441,7 @@
if (cpuMask & X265_CPU_AVX2)
{
p.dct[DCT_4x4] = x265_dct4_avx2;
+ p.nquant = x265_nquant_avx2;
}
/* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
@@ -1736,6 +1737,7 @@
p.denoiseDct = x265_denoise_dct_avx2;
p.dct[DCT_4x4] = x265_dct4_avx2;
+ p.nquant = x265_nquant_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r c6ba4d4834a7 -r ae1602e8571d source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Sep 04 18:59:07 2014 -0700
+++ b/source/common/x86/pixel-util.h Thu Sep 04 18:59:34 2014 -0700
@@ -46,6 +46,7 @@
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
diff -r c6ba4d4834a7 -r ae1602e8571d source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Sep 04 18:59:07 2014 -0700
+++ b/source/common/x86/pixel-util8.asm Thu Sep 04 18:59:34 2014 -0700
@@ -986,6 +986,55 @@
RET
+INIT_YMM avx2
+cglobal nquant, 3,5,7                  ; r0 = coef, r1 = quantCoeff, r2 = qCoef; qBits/add/numCoeff are read via r3m/r4m/r5m
+    movd            xm4, r4m            ; on UNIX64 'add' arrives in a GPR and VEX vpbroadcastd has no GPR-source form,
+    vpbroadcastd    m4, xm4             ; so stage it in xm4 first; m4 = add in every dword
+    vpbroadcastd    m6, [pw_1]          ; m6 = 1 in every word
+    mov             r4d, r5m            ; r4 = numCoeff
+    pxor            m5, m5              ; m5 = per-word counters of nonzero outputs
+    movd            xm3, r3m            ; xm3 = qBits (shift count)
+    shr             r4d, 4              ; r4 = loop iterations, 16 coefficients each
+
+.loop:
+    movu            m0, [r0]            ; m0 = level[0..7]
+    pabsd           m1, m0              ; m1 = abs(level)
+    pmulld          m1, [r1]            ; m1 = abs(level) * quantCoeff
+    paddd           m1, m4              ; m1 += add
+    psrad           m1, xm3             ; m1 >>= qBits
+    psignd          m1, m0              ; m1 = quantized level with the sign of the input restored
+
+    movu            m0, [r0 + mmsize]   ; m0 = level[8..15]
+    pabsd           m2, m0              ; m2 = abs(level)
+    pmulld          m2, [r1 + mmsize]   ; m2 = abs(level) * quantCoeff
+    paddd           m2, m4              ; m2 += add
+    psrad           m2, xm3             ; m2 >>= qBits
+    psignd          m2, m0              ; m2 = quantized level with the sign of the input restored
+
+    packssdw        m1, m2              ; saturate to int16; packs per 128-bit lane, so qwords are interleaved
+    vpermq          m2, m1, q3120       ; m2 = the 16 words restored to memory order
+
+    movu            [r2], m2            ; store 16 quantized coefficients to qCoef
+    add             r0, mmsize * 2      ; advance coef by 16 dwords
+    add             r1, mmsize * 2      ; advance quantCoeff by 16 dwords
+    add             r2, mmsize          ; advance qCoef by 16 words
+
+    pminuw          m1, m6              ; unsigned min with 1: every nonzero word (negatives included) becomes 1
+    paddw           m5, m1              ; accumulate nonzero counts (lane order is irrelevant for the sum)
+
+    dec             r4d
+    jnz             .loop
+
+    pxor            m0, m0
+    psadbw          m5, m0              ; sum counter bytes per qword; exact while each counter < 256 (numCoeff < 4096)
+    vextracti128    xm0, m5, 1
+    paddd           xm5, xm0            ; fold the upper 128-bit lane into the lower one
+    pshufd          xm0, xm5, 2         ; xm0[0] = sum held in the upper qword
+    paddd           xm5, xm0            ; xm5[0] = total number of nonzero coefficients
+    movd            eax, xm5            ; return value
+    RET
+
+
;-----------------------------------------------------------------------------
; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list