[x265] [PATCH] asm: avx2 code for satd_48x64 and 64xN, over 100% faster than SSE
dnyaneshwar at multicorewareinc.com
Thu Apr 16 12:13:13 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1429173485 -19800
# Thu Apr 16 14:08:05 2015 +0530
# Node ID 04e7526a8bde9e46867f5c4cfb63b98409c7fb44
# Parent ebca2a0d3ab905b62c346d5d0b23d50c618d5827
asm: avx2 code for satd_48x64 and 64xN, over 100% faster than SSE
AVX2 (speedup over C / optimized time / C time):
satd[48x64] 12.52x 7696.91 96366.03
satd[64x48] 12.16x 8103.43 98564.64
satd[64x16] 12.15x 2759.65 33537.19
satd[64x32] 12.12x 5372.52 65090.38
satd[64x64] 13.02x 10260.38 133615.69
SSE (speedup over C / optimized time / C time):
satd[48x64] 5.32x 18146.13 96505.38
satd[64x48] 5.33x 18201.03 96975.23
satd[64x16] 5.21x 6272.14 32651.24
satd[64x32] 5.42x 11910.58 64529.81
satd[64x64] 5.30x 26665.73 141387.59
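
For readers skimming the assembly below: each of these block sizes is computed as a sum of 16x8 sub-block SATDs (the chains of calc_satd_16x8 calls), walked one 16-pixel-wide column at a time, with the packed 16-bit accumulator widened to 32 bits once per column so it cannot wrap. The scalar sketch below only illustrates that tiling; satd_4x4/satd_16x8/satd_large are illustrative helpers, not code from this patch, and because they are built from 4x4 Hadamard transforms they are not meant to be bit-identical to the library's SATD primitives.

#include <stdint.h>
#include <stdlib.h>

typedef uint8_t pixel;

/* plain scalar 4x4 SATD: sum of absolute 2-D Hadamard coefficients, halved */
static int satd_4x4(const pixel *p1, intptr_t s1, const pixel *p2, intptr_t s2)
{
    int t[4][4], sum = 0;
    for (int i = 0; i < 4; i++, p1 += s1, p2 += s2)
    {
        int a0 = p1[0] - p2[0], a1 = p1[1] - p2[1];
        int a2 = p1[2] - p2[2], a3 = p1[3] - p2[3];
        int b0 = a0 + a1, b1 = a0 - a1, b2 = a2 + a3, b3 = a2 - a3;
        t[i][0] = b0 + b2; t[i][1] = b1 + b3;
        t[i][2] = b0 - b2; t[i][3] = b1 - b3;
    }
    for (int i = 0; i < 4; i++)
    {
        int a0 = t[0][i] + t[1][i], a1 = t[0][i] - t[1][i];
        int a2 = t[2][i] + t[3][i], a3 = t[2][i] - t[3][i];
        sum += abs(a0 + a2) + abs(a1 + a3) + abs(a0 - a2) + abs(a1 - a3);
    }
    return sum >> 1;
}

/* one 16x8 tile, standing in for the assembly's calc_satd_16x8 helper */
static int satd_16x8(const pixel *p1, intptr_t s1, const pixel *p2, intptr_t s2)
{
    int sum = 0;
    for (int y = 0; y < 8; y += 4)
        for (int x = 0; x < 16; x += 4)
            sum += satd_4x4(p1 + y * s1 + x, s1, p2 + y * s2 + x, s2);
    return sum;
}

/* 48x64 / 64xN SATD as a column-by-column sum of 16x8 tiles; this mirrors the
 * call order of the assembly: height/8 calc_satd_16x8 calls per 16-wide
 * column, then restart from the saved base pointers plus 16/32/48 */
static int satd_large(const pixel *p1, intptr_t s1, const pixel *p2, intptr_t s2,
                      int width, int height)
{
    int sum = 0;
    for (int x = 0; x < width; x += 16)
        for (int y = 0; y < height; y += 8)
            sum += satd_16x8(p1 + y * s1 + x, s1, p2 + y * s2 + x, s2);
    return sum;
}
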
diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Apr 16 12:22:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Apr 16 14:08:05 2015 +0530
@@ -1707,6 +1707,11 @@
p.pu[LUMA_32x24].satd = x265_pixel_satd_32x24_avx2;
p.pu[LUMA_32x32].satd = x265_pixel_satd_32x32_avx2;
p.pu[LUMA_32x64].satd = x265_pixel_satd_32x64_avx2;
+ p.pu[LUMA_48x64].satd = x265_pixel_satd_48x64_avx2;
+ p.pu[LUMA_64x16].satd = x265_pixel_satd_64x16_avx2;
+ p.pu[LUMA_64x32].satd = x265_pixel_satd_64x32_avx2;
+ p.pu[LUMA_64x48].satd = x265_pixel_satd_64x48_avx2;
+ p.pu[LUMA_64x64].satd = x265_pixel_satd_64x64_avx2;
p.pu[LUMA_32x8].sad = x265_pixel_sad_32x8_avx2;
p.pu[LUMA_32x16].sad = x265_pixel_sad_32x16_avx2;
diff -r ebca2a0d3ab9 -r 04e7526a8bde source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Thu Apr 16 12:22:53 2015 +0530
+++ b/source/common/x86/pixel-a.asm Thu Apr 16 14:08:05 2015 +0530
@@ -10903,4 +10903,279 @@
movd eax, xm0
RET
+cglobal pixel_satd_48x64, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p] ; constant used by calc_satd_16x8
+ lea r4, [3 * r1] ; r4 = stride1 * 3
+ lea r5, [3 * r3] ; r5 = stride2 * 3
+ pxor m6, m6 ; zero the packed 16-bit accumulator
+ mov r6, r0 ; save both base pointers so each
+ mov r7, r2 ; 16-wide column can restart from them
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16] ; advance to the next 16-pixel-wide column
+ lea r2, [r7 + 16]
+ mova m9, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m9, [pw_1] ; widen the 16-bit word sums to 32-bit dwords
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1] ; widen the last column's sums as well
+ vextracti128 xm2, m9, 1 ; fold the high 128 bits of each
+ vextracti128 xm1, m8, 1 ; accumulator onto the low 128 bits
+ vextracti128 xm0, m6, 1
+ paddd xm2, xm9
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm1 ; combine the three column totals
+ paddd xm0, xm2
+ movhlps xm7, xm0 ; horizontal add of the four remaining dwords
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0 ; final SATD returned in eax
+ RET
+
+cglobal pixel_satd_64x16, 4,8,8 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ vextracti128 xm0, m6, 1
+ paddw xm0, xm6
+ pmaddwd xm0, [pw_1]
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x32, 4,8,9 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x48, 4,8,10 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ mova m9, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m9, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm2, m9, 1
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm2, xm9
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm2
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
+cglobal pixel_satd_64x64, 4,8,11 ; if WIN64 && cpuflag(avx2)
+ mova m7, [hmul_16p]
+ lea r4, [3 * r1]
+ lea r5, [3 * r3]
+ pxor m6, m6
+ mov r6, r0
+ mov r7, r2
+
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 16]
+ lea r2, [r7 + 16]
+ mova m10, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m10, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 32]
+ lea r2, [r7 + 32]
+ mova m9, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m9, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ lea r0, [r6 + 48]
+ lea r2, [r7 + 48]
+ mova m8, m6 ; to avoid overflow, move to another register
+ pxor m6, m6
+ pmaddwd m8, [pw_1]
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+ call calc_satd_16x8
+
+ pmaddwd m6, [pw_1]
+ vextracti128 xm3, m10, 1
+ vextracti128 xm2, m9, 1
+ vextracti128 xm1, m8, 1
+ vextracti128 xm0, m6, 1
+ paddd xm3, xm10
+ paddd xm2, xm9
+ paddd xm1, xm8
+ paddd xm0, xm6
+ paddd xm0, xm3
+ paddd xm0, xm2
+ paddd xm0, xm1
+ movhlps xm7, xm0
+ paddd xm0, xm7
+ pshuflw xm7, xm0, q0032
+ paddd xm0, xm7
+ movd eax, xm0
+ RET
+
%endif ; if ARCH_X86_64 == 1
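
A note on the "; to avoid overflow, move to another register" lines above: the running accumulator m6 is copied to a spare register (m8/m9/m10), re-zeroed for the next 16-wide column, and the copy is widened with pmaddwd against the all-ones pw_1 constant, i.e. a pairwise widening add that turns sixteen 16-bit partial sums into eight 32-bit sums. A scalar equivalent of that one pmaddwd, illustrative only and not part of the patch:

#include <stdint.h>

/* what "pmaddwd mN, [pw_1]" does to one 256-bit accumulator: multiply each
 * 16-bit lane by 1 and add adjacent pairs into 32-bit lanes */
static void pmaddwd_pw1(const int16_t words[16], int32_t dwords[8])
{
    for (int i = 0; i < 8; i++)
        dwords[i] = (int32_t)words[2 * i] + (int32_t)words[2 * i + 1];
}

The widened per-column totals are then combined and reduced horizontally at the end of each kernel, right before the movd eax, xm0 that returns the result.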