[x265] [PATCH 4 of 4] asm: avx2 version of quant, improve 16.6k cycles -> 8.4k cycles
Min Chen
chenm003 at 163.com
Tue Sep 9 04:39:45 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1410230374 25200
# Node ID 2a36ebe3ee8c48f6ef3d87f3770a64c5bae70953
# Parent 88dd9f5b8b82cc6cc6689979034c4a0635fef473
asm: avx2 version of quant, improve 16.6k cycles -> 8.4k cycles
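
For context: per coefficient, the scalar reference computes roughly the following (a sketch reconstructed from quant_c() in dct.cpp and the register comments in the asm below, not the verbatim source). The AVX2 path produces the same tmpLevel1/level1/deltaU1 values, 16 coefficients per loop iteration:

    #include <cstdlib>
    #include <cstdint>

    uint32_t quant_ref(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU,
                       int16_t* qCoef, int qBits, int add, int numCoeff)
    {
        int qBits8 = qBits - 8;
        uint32_t numSig = 0;
        for (int i = 0; i < numCoeff; i++)
        {
            int level = coef[i];
            int sign  = (level < 0) ? -1 : 1;
            int tmpLevel = abs(level) * quantCoeff[i];           // tmpLevel1
            level = (tmpLevel + add) >> qBits;                   // level1
            deltaU[i] = (tmpLevel - (level << qBits)) >> qBits8; // deltaU1
            numSig += (level != 0);
            level *= sign;
            // packssdw in the asm applies the same signed 16-bit saturation
            qCoef[i] = (int16_t)(level < -32768 ? -32768 :
                                 (level > 32767 ? 32767 : level));
        }
        return numSig;
    }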
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/dct.cpp
--- a/source/common/dct.cpp Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/dct.cpp Mon Sep 08 19:39:34 2014 -0700
@@ -773,6 +773,7 @@
uint32_t quant_c(int32_t* coef, int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff)
{
X265_CHECK(qBits >= 8, "qBits less than 8\n");
+ X265_CHECK((numCoeff % 16) == 0, "numCoeff must be a multiple of 16\n");
int qBits8 = qBits - 8;
uint32_t numSig = 0;
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp Mon Sep 08 19:39:34 2014 -0700
@@ -1441,6 +1441,7 @@
if (cpuMask & X265_CPU_AVX2)
{
p.dct[DCT_4x4] = x265_dct4_avx2;
+ p.quant = x265_quant_avx2;
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
}
@@ -1739,6 +1740,7 @@
p.denoiseDct = x265_denoise_dct_avx2;
p.dct[DCT_4x4] = x265_dct4_avx2;
+ p.quant = x265_quant_avx2;
p.nquant = x265_nquant_avx2;
p.dequant_normal = x265_dequant_normal_avx2;
}
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/const-a.asm Mon Sep 08 19:39:34 2014 -0700
@@ -76,7 +76,7 @@
const pw_ppmmppmm, dw 1,1,-1,-1,1,1,-1,-1
const pw_pmpmpmpm, dw 1,-1,1,-1,1,-1,1,-1
const pw_pmmpzzzz, dw 1,-1,-1,1,0,0,0,0
-const pd_1, times 4 dd 1
+const pd_1, times 8 dd 1
const pd_2, times 4 dd 2
const pd_4, times 4 dd 4
const pd_8, times 4 dd 8
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/pixel-util.h Mon Sep 08 19:39:34 2014 -0700
@@ -45,6 +45,7 @@
void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
+uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);
uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
diff -r 88dd9f5b8b82 -r 2a36ebe3ee8c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Sep 08 19:39:14 2014 -0700
+++ b/source/common/x86/pixel-util8.asm Mon Sep 08 19:39:34 2014 -0700
@@ -929,6 +929,156 @@
RET
+IACA_START
+%if ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal quant, 5,5,10
+ ; fill qbits
+ movd xm4, r4d ; m4 = qbits
+
+ ; fill qbits-8
+ sub r4d, 8
+ movd xm6, r4d ; m6 = qbits8
+
+ ; fill offset
+ vpbroadcastd m5, r5m ; m5 = add
+
+ vpbroadcastw m9, [pw_1] ; m9 = 16 words of 1
+
+ mov r4d, r6m
+ shr r4d, 4
+ pxor m7, m7 ; m7 = numSig accumulator (non-zero count)
+.loop:
+ ; 8 coeff
+ movu m0, [r0] ; m0 = level
+ pabsd m1, m0
+ pmulld m1, [r1] ; m1 = tmpLevel1
+ paddd m2, m1, m5
+ psrad m2, xm4 ; m2 = level1
+
+ pslld m3, m2, 8
+ psrad m1, xm6
+ psubd m1, m3 ; m1 = deltaU1
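+ ; equals the C form (tmpLevel1 - (level1 << qBits)) >> qBits8, since
+ ; level1 << qBits has zero low qBits8 bits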
+ movu [r2], m1
+ psignd m2, m0
+
+ ; 8 coeff
+ movu m0, [r0 + mmsize] ; m0 = level
+ pabsd m1, m0
+ pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1
+ paddd m3, m1, m5
+ psrad m3, xm4 ; m3 = level1
+
+ pslld m8, m3, 8
+ psrad m1, xm6
+ psubd m1, m8 ; m1 = deltaU1
+ movu [r2 + mmsize], m1
+ psignd m3, m0
+
+ packssdw m2, m3
+ vpermq m2, m2, q3120
+ movu [r3], m2
+
+ ; count non-zero coeff
+ ; TODO: popcnt is faster, but some CPUs don't support it
+ pminuw m2, m9
+ paddw m7, m2
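+ ; pminuw clamps each |level| word to 0/1, so m7 accumulates a per-word
+ ; non-zero count; numCoeff <= 32*32 means at most 64 loop iterations,
+ ; so each count fits in the low byte and psadbw below can sum as bytes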
+
+ add r0, mmsize*2
+ add r1, mmsize*2
+ add r2, mmsize*2
+ add r3, mmsize
+
+ dec r4d
+ jnz .loop
+
+ ; sum count
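+ ; (psadbw against zero folds each group of 8 bytes into a qword; the
+ ; extract and movhlps adds then combine the four qword partial sums)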
+ pxor m0, m0
+ psadbw m7, m0
+ vextracti128 xm1, m7, 1
+ paddd xm7, xm1
+ movhlps xm0, xm7
+ paddd xm7, xm0
+ movd eax, xm7
+ RET
+
+%else ; ARCH_X86_64 == 1
+INIT_YMM avx2
+cglobal quant, 5,6,8
+ ; fill qbits
+ movd xm4, r4d ; m4 = qbits
+
+ ; fill qbits-8
+ sub r4d, 8
+ movd xm6, r4d ; m6 = qbits8
+
+ ; fill offset
+ vpbroadcastd m5, r5m ; m5 = add
+
+ lea r5, [pd_1]
+
+ mov r4d, r6m
+ shr r4d, 4
+ pxor m7, m7 ; m7 = numSig accumulator (non-zero count)
+.loop:
+ ; 8 coeff
+ movu m0, [r0] ; m0 = level
+ pabsd m1, m0
+ pmulld m1, [r1] ; m1 = tmpLevel1
+ paddd m2, m1, m5
+ psrad m2, xm4 ; m2 = level1
+
+ pslld m3, m2, 8
+ psrad m1, xm6
+ psubd m1, m3 ; m1 = deltaU1
+
+ movu [r2], m1
+ psignd m3, m2, m0
+ pminud m2, [r5]
+ paddd m7, m2
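+ ; 32-bit variant of the non-zero count: pminud clamps each dword of
+ ; level1 to 0/1; the same psadbw reduction below sums the totals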
+ packssdw m3, m3
+ vpermq m3, m3, q0020
+ movu [r3], xm3
+
+ ; 8 coeff
+ movu m0, [r0 + mmsize] ; m0 = level
+ pabsd m1, m0
+ pmulld m1, [r1 + mmsize] ; m1 = tmpLevel1
+ paddd m2, m1, m5
+ psrad m2, xm4 ; m2 = level1
+
+ pslld m3, m2, 8
+ psrad m1, xm6
+ psubd m1, m3 ; m1 = deltaU1
+
+ movu [r2 + mmsize], m1
+ psignd m3, m2, m0
+ pminud m2, [r5]
+ paddd m7, m2
+ packssdw m3, m3
+ vpermq m3, m3, q0020
+ movu [r3 + mmsize/2], xm3
+
+ add r0, mmsize*2
+ add r1, mmsize*2
+ add r2, mmsize*2
+ add r3, mmsize
+
+ dec r4d
+ jnz .loop
+
+ pxor m0, m0
+ psadbw m7, m0
+ vextracti128 xm1, m7, 1
+ paddd xm7, xm1
+ movhlps xm0, xm7
+ paddd xm7, xm0
+ movd eax, xm7
+ RET
+%endif ; ARCH_X86_64 == 1
+IACA_END
+
+
;-----------------------------------------------------------------------------
; uint32_t nquant(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
;-----------------------------------------------------------------------------