[x265] [PATCH 2 of 3] asm: AVX2 version of sa8d[32x32]
Min Chen
chenm003 at 163.com
Tue Apr 12 19:31:01 CEST 2016
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1460482248 18000
# Node ID 37e80d50caf51a74e85c83f24317935171a5d375
# Parent 40afead3177d7c128066334bfe075042388e86b0
asm: AVX2 version of sa8d[32x32]
                 speedup  optimized cycles   C cycles
AVX:
sa8d[32x32]        5.47x          7403.68   40490.18
AVX2:
sa8d[32x32]       10.57x          3783.80   40001.89
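The asm keeps four separate 16x16 accumulators because the C model
rounds each 16x16 partial sum before adding them up (hence the TODO
below about register pressure). A minimal sketch of that reference
behaviour, assuming the usual x265 primitive shapes (names and exact
code are illustrative; the real implementation lives in
source/common/pixel.cpp):

    /* assumed reference shape: each 16x16 result is rounded on its
     * own, so a bit-exact 32x32 asm version must do the same */
    static int sa8d_16x16_ref(const pixel* pix1, intptr_t stride1,
                              const pixel* pix2, intptr_t stride2)
    {
        int cost = sa8d_8x8(pix1, stride1, pix2, stride2)
                 + sa8d_8x8(pix1 + 8, stride1, pix2 + 8, stride2)
                 + sa8d_8x8(pix1 + 8 * stride1, stride1,
                            pix2 + 8 * stride2, stride2)
                 + sa8d_8x8(pix1 + 8 * stride1 + 8, stride1,
                            pix2 + 8 * stride2 + 8, stride2);
        return (cost + 1) >> 1;              /* per-16x16 rounding */
    }

    static int sa8d_32x32_ref(const pixel* pix1, intptr_t stride1,
                              const pixel* pix2, intptr_t stride2)
    {
        int cost = 0;
        for (int y = 0; y < 32; y += 16)     /* four 16x16 partials */
            for (int x = 0; x < 32; x += 16)
                cost += sa8d_16x16_ref(pix1 + y * stride1 + x, stride1,
                                       pix2 + y * stride2 + x, stride2);
        return cost;
    }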
---
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/pixel-a.asm | 369 ++++++++++++++++++++++++++++++++++
2 files changed, 370 insertions(+), 0 deletions(-)
diff -r 40afead3177d -r 37e80d50caf5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sat Apr 09 19:32:28 2016 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Apr 12 12:30:48 2016 -0500
@@ -2161,6 +2161,7 @@
p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_avx2);
p.cu[LUMA_8x8].sa8d = PFX(pixel_sa8d_8x8_avx2);
p.cu[LUMA_16x16].sa8d = PFX(pixel_sa8d_16x16_avx2);
+ p.cu[LUMA_32x32].sa8d = PFX(pixel_sa8d_32x32_avx2);
#endif
if (cpuMask & X265_CPU_BMI2)
diff -r 40afead3177d -r 37e80d50caf5 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Sat Apr 09 19:32:28 2016 +0530
+++ b/source/common/x86/pixel-a.asm Tue Apr 12 12:30:48 2016 -0500
@@ -13995,4 +13995,373 @@
shr eax, 1
RET
+
+; TODO: optimize me; this needs 2 more YMM registers because the C model produces a partial result for every 16x16 block
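+; The 32x32 block is processed as eight 16x8 passes (two side-by-side
+; 8x8 Hadamard blocks per pass, one in each 128-bit lane); m10, m12 and
+; m13 accumulate the four 16x16 partial sums that the C model rounds
+; individually.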
+INIT_YMM avx2
+cglobal pixel_sa8d_32x32, 4,8,14
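+ ; args: r0 = pix1, r1 = stride1, r2 = pix2, r3 = stride2
+ ; 10-bit build (see %endif below): pixels are 16-bit words, so
+ ; FIX_STRIDES converts the pixel strides to byte strides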
+ FIX_STRIDES r1, r3
+ lea r4, [3*r1]
+ lea r5, [3*r3]
+ lea r6, [r0+4*r1]
+ lea r7, [r2+4*r3]
+ vbroadcasti128 m7, [pw_1]
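+ ; m7 = sixteen words of 1; pmaddwd against it folds adjacent word
+ ; sums into dword partial sums without overflow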
+
+
+ ; SA8D[16x8] ; pix[0]
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
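+ ; macro expanded by hand: each movu fetches one row of 16 pixels (a
+ ; full ymm of words) and psubw forms the 16-bit residual; m0/m1/m2/m8
+ ; receive rows 0-3, the next block fills rows 4-7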
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m10, m0, m2
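+ ; HADAMARD8_2D with 'amax' leaves absolute 2D Hadamard coefficients
+ ; for the two 8x8 blocks (one per 128-bit lane); paddw, pmaddwd(pw_1)
+ ; and paddd collapse them into the sum[0] accumulator in m10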
+
+
+ ; SA8D[16x8] ; pix[16]
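+ ; mmsize == 32 bytes == 16 pixels at 10 bits, i.e. step 16 columns right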
+ add r0, mmsize
+ add r2, mmsize
+ add r6, mmsize
+ add r7, mmsize
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m12, m0, m2
+
+
+ ; SA8D[16x8] ; pix[8*stride+16]
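+ ; step down 8 rows; the pointers are still offset 16 columns right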
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ lea r6, [r6+8*r1]
+ lea r7, [r7+8*r3]
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m12, m0
+ paddd m12, m2
+
+ ; sum[1]
+ HADDD m12, m0
+
+
+ ; SA8D[16x8] ; pix[8*stride]
+ sub r0, mmsize
+ sub r2, mmsize
+ sub r6, mmsize
+ sub r7, mmsize
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m10, m0
+ paddd m10, m2
+
+ ; sum[0]
+ HADDD m10, m0
+ punpckldq xm10, xm12
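+ ; pack sum[0] and sum[1] into adjacent dwords of xm10 for the final
+ ; combine below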
+
+
+ ; SA8D[16x8] ; pix[16*stride]
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ lea r6, [r6+8*r1]
+ lea r7, [r7+8*r3]
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m12, m0, m2
+
+
+ ; SA8D[16x8] ; pix[16*stride+16]
+ add r0, mmsize
+ add r2, mmsize
+ add r6, mmsize
+ add r7, mmsize
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m13, m0, m2
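+ ; m13 starts the sum[3] accumulator (bottom-right 16x16)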
+
+
+ ; SA8D[16x8] ; pix[24*stride+16]
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ lea r6, [r6+8*r1]
+ lea r7, [r7+8*r3]
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m13, m0
+ paddd m13, m2
+
+ ; sum[3]
+ HADDD m13, m0
+
+
+ ; SA8D[16x8] ; pix[24*stride]
+ sub r0, mmsize
+ sub r2, mmsize
+ sub r6, mmsize
+ sub r7, mmsize
+
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0]
+ movu m5, [r2]
+ psubw m0, m5
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
+ paddw m0, m1
+ paddw m2, m8
+ pmaddwd m0, m7
+ pmaddwd m2, m7
+ paddd m12, m0
+ paddd m12, m2
+
+ ; sum[2]
+ HADDD m12, m0
+ punpckldq xm12, xm13
+
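+ ; gather the four 16x16 partial sums, round each as (sum + 1) >> 1 to
+ ; match the C model's per-16x16 rounding, then add them up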
+ ; SA8D
+ punpcklqdq xm0, xm10, xm12
+ paddd xm0, [pd_1]
+ psrld xm0, 1
+ HADDD xm0, xm1
+
+ movd eax, xm0
+ RET
+
%endif ; HIGH_BIT_DEPTH == 1 && BIT_DEPTH == 10