[x265] [PATCH 2 of 2] asm: AVX2 version of dequant_normal, improve 9.3k Cycles -> 4.2k Cycles
Min Chen
chenm003 at 163.com
Sat Sep 6 02:36:28 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1409963778 25200
# Node ID fa6c8db15be326900905d3a36e59b4411de1af11
# Parent c4dd39c9ad0b96fbf520f399de41e1e9b4b77c72
asm: AVX2 version of dequant_normal, improve 9.3k Cycles -> 4.2k Cycles
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp Fri Sep 05 17:36:18 2014 -0700
@@ -1442,6 +1442,7 @@
{
p.dct[DCT_4x4] = x265_dct4_avx2;
p.nquant = x265_nquant_avx2;
+ p.dequant_normal = x265_dequant_normal_avx2;
}
/* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
@@ -1739,6 +1740,7 @@
p.dct[DCT_4x4] = x265_dct4_avx2;
p.nquant = x265_nquant_avx2;
+ p.dequant_normal = x265_dequant_normal_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/const-a.asm Fri Sep 05 17:36:18 2014 -0700
@@ -89,6 +89,7 @@
const pd_1024, times 4 dd 1024
const pd_2048, times 4 dd 2048
const pd_ffff, times 4 dd 0xffff
+const pd_32767, times 4 dd 32767 ; dword [32767], upper clip bound loaded by dequant_normal (avx2)
const pd_n32768, times 4 dd 0xffff8000
const pw_ff00, times 8 dw 0xff00
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/pixel-util.h Fri Sep 05 17:36:18 2014 -0700
@@ -48,6 +48,7 @@
uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
uint32_t x265_nquant_avx2(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);
void x265_dequant_normal_sse4(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
+void x265_dequant_normal_avx2(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift);
int x265_count_nonzero_ssse3(const int16_t *quantCoeff, int numCoeff);
void x265_weight_pp_sse4(pixel *src, pixel *dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff -r c4dd39c9ad0b -r fa6c8db15be3 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Fri Sep 05 17:36:01 2014 -0700
+++ b/source/common/x86/pixel-util8.asm Fri Sep 05 17:36:18 2014 -0700
@@ -54,6 +54,8 @@
cextern pw_00ff
cextern pw_2000
cextern pw_pixel_max
+cextern pd_32767
+cextern pd_n32768
;-----------------------------------------------------------------------------
; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
@@ -1079,6 +1081,52 @@
RET
+;-----------------------------------------------------------------------------
+; void dequant_normal(const int16_t* quantCoef, int32_t* coef, int num, int scale, int shift)
+;-----------------------------------------------------------------------------
+INIT_YMM avx2
+cglobal dequant_normal, 5,5,7
+    vpbroadcastd    m2, [pw_1]          ; m2 = word [1], multiplier that carries the rounding term
+    vpbroadcastd    m5, [pd_32767]      ; m5 = dword [32767], upper clip bound
+    vpbroadcastd    m6, [pd_n32768]     ; m6 = dword [-32768], lower clip bound
+%if HIGH_BIT_DEPTH
+    cmp             r3d, 32767          ; pmaddwd reads scale as a signed word
+    jle            .skip
+    shr             r3d, 2              ; too large: drop 2 bits from scale ...
+    sub             r4d, 2              ; ... and 2 bits from shift to compensate
+.skip:
+%endif
+    movd            xm0, r4d            ; xm0 = shift (count for psrad)
+    add             r4d, -1+16          ; r4d = bit index of (1 << (shift - 1)) inside the high word
+    bts             r3d, r4d            ; r3d = high word: add = 1 << (shift - 1), low word: scale
+    movd            xm1, r3d            ; AVX2 vpbroadcastd has no GPR source form, go through xmm1
+    vpbroadcastd    m1, xm1             ; m1 = dword [add scale]
+
+    shr             r2d, 4              ; r2d = num / 16, one iteration handles 16 coefficients
+.loop:
+    movu            m3, [r0]            ; m3 = 16 x int16 quantCoef
+    punpckhwd       m4, m3, m2          ; per 128-bit lane: m4 = (coef, 1) pairs of coef 4-7 | 12-15
+    punpcklwd       m3, m2              ; per 128-bit lane: m3 = (coef, 1) pairs of coef 0-3 | 8-11
+    pmaddwd         m3, m1              ; m3 = dword (clipQCoef * scale + add)
+    pmaddwd         m4, m1
+    psrad           m3, xm0
+    psrad           m4, xm0
+    pminsd          m3, m5              ; clip to [-32768, 32767]
+    pmaxsd          m3, m6
+    pminsd          m4, m5
+    pmaxsd          m4, m6
+    mova            [r1 + 0 * mmsize/2], xm3    ; coef 0-3
+    mova            [r1 + 1 * mmsize/2], xm4    ; coef 4-7
+    vextracti128    [r1 + 2 * mmsize/2], m3, 1  ; coef 8-11
+    vextracti128    [r1 + 3 * mmsize/2], m4, 1  ; coef 12-15
+
+    add             r0, mmsize          ; 16 x int16 consumed
+    add             r1, mmsize * 2      ; 16 x int32 produced
+    dec             r2d
+    jnz            .loop
+    RET
+
+
;-----------------------------------------------------------------------------
; int count_nonzero(const int16_t *quantCoeff, int numCoeff);
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list