[x265] [PATCH] asm: avx2 code for sad_x4_64xN, over 40% improvement over SSE
rajesh at multicorewareinc.com
Tue Sep 15 10:52:59 CEST 2015
# HG changeset patch
# User Rajesh Paulraj <rajesh at multicorewareinc.com>
# Date 1442296528 -19800
# Tue Sep 15 11:25:28 2015 +0530
# Node ID 5fd2ef7bbf09f771d479a11eec2256d02fadf1cf
# Parent 365f7ed4d89628d49cd6af8d81d4edc01f73ffad
asm: avx2 code for sad_x4_64xN, over 40% improvement over SSE
                  (speedup vs C, asm cycles, C cycles)
avx2:
sad_x4[64x16]      75.32x     1488.36    112105.17
sad_x4[64x32]      69.74x     2791.91    194701.75
sad_x4[64x48]      72.11x     4016.55    289624.06
sad_x4[64x64]      59.44x     6698.95    398170.66
ssse3:
sad_x4[64x16]      38.05x     2574.72     97978.73
sad_x4[64x32]      37.85x     5199.53    196795.09
sad_x4[64x48]      38.70x     7573.90    293083.88
sad_x4[64x64]      34.82x    11349.89    395213.00
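
For reference, a sad_x4 primitive computes the sum of absolute differences of
one encoder-side block against four candidate reference blocks in a single
call and writes the four results to a small output array. Below is a minimal
C sketch of that behaviour for the 64xN sizes added here; the function name
and signature are illustrative only (not the x265 reference code), and it
assumes x265's FENC_STRIDE of 64 for the encoder-side block.

    /* Illustrative C model of sad_x4_64xN: one fenc block vs. four refs. */
    #include <stdint.h>
    #include <stdlib.h>

    #define FENC_STRIDE 64   /* encoder-side block stride used by x265 */

    static void sad_x4_64xN_c(int N, const uint8_t *fenc,
                              const uint8_t *ref0, const uint8_t *ref1,
                              const uint8_t *ref2, const uint8_t *ref3,
                              intptr_t frefstride, int32_t res[4])
    {
        res[0] = res[1] = res[2] = res[3] = 0;
        for (int y = 0; y < N; y++)
        {
            for (int x = 0; x < 64; x++)
            {
                res[0] += abs(fenc[x] - ref0[x]);
                res[1] += abs(fenc[x] - ref1[x]);
                res[2] += abs(fenc[x] - ref2[x]);
                res[3] += abs(fenc[x] - ref3[x]);
            }
            fenc += FENC_STRIDE;
            ref0 += frefstride;
            ref1 += frefstride;
            ref2 += frefstride;
            ref3 += frefstride;
        }
    }

The AVX2 routines below produce the same four sums, but cover each 64-byte
row with two 32-byte loads and let psadbw/paddd accumulate per reference.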
diff -r 365f7ed4d896 -r 5fd2ef7bbf09 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Sep 15 11:25:28 2015 +0530
@@ -2866,6 +2866,10 @@
p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx2);
p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx2);
p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx2);
+ p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx2);
+ p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx2);
+ p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx2);
+ p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx2);
p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
diff -r 365f7ed4d896 -r 5fd2ef7bbf09 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/sad-a.asm Tue Sep 15 11:25:28 2015 +0530
@@ -3328,6 +3328,453 @@
SAD_X4_END_SSE2 1
%endmacro
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
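+; SAD_X4_64x8_AVX2: accumulate SADs for 8 rows of a 64-pixel-wide block
+; against four references.  Each row takes two 32-byte (mmsize) loads;
+; psadbw results are summed into the per-reference accumulators m0-m3.
+; The macro advances r0-r4 by 4 rows halfway through, so callers only
+; step the pointers by the remaining 4 rows between invocations.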
+%macro SAD_X4_64x8_AVX2 0
+ movu m4, [r0]
+ movu m5, [r1]
+ movu m6, [r2]
+ movu m7, [r3]
+ movu m8, [r4]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + mmsize]
+ movu m5, [r1 + mmsize]
+ movu m6, [r2 + mmsize]
+ movu m7, [r3 + mmsize]
+ movu m8, [r4 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE]
+ movu m5, [r1 + r5]
+ movu m6, [r2 + r5]
+ movu m7, [r3 + r5]
+ movu m8, [r4 + r5]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE + mmsize]
+ movu m5, [r1 + r5 + mmsize]
+ movu m6, [r2 + r5 + mmsize]
+ movu m7, [r3 + r5 + mmsize]
+ movu m8, [r4 + r5 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 2]
+ movu m5, [r1 + r5 * 2]
+ movu m6, [r2 + r5 * 2]
+ movu m7, [r3 + r5 * 2]
+ movu m8, [r4 + r5 * 2]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 2 + mmsize]
+ movu m5, [r1 + r5 * 2 + mmsize]
+ movu m6, [r2 + r5 * 2 + mmsize]
+ movu m7, [r3 + r5 * 2 + mmsize]
+ movu m8, [r4 + r5 * 2 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 3]
+ movu m5, [r1 + r7]
+ movu m6, [r2 + r7]
+ movu m7, [r3 + r7]
+ movu m8, [r4 + r7]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 3 + mmsize]
+ movu m5, [r1 + r7 + mmsize]
+ movu m6, [r2 + r7 + mmsize]
+ movu m7, [r3 + r7 + mmsize]
+ movu m8, [r4 + r7 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
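+ ; step to rows 4-7 of this 8-row block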
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ movu m4, [r0]
+ movu m5, [r1]
+ movu m6, [r2]
+ movu m7, [r3]
+ movu m8, [r4]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + mmsize]
+ movu m5, [r1 + mmsize]
+ movu m6, [r2 + mmsize]
+ movu m7, [r3 + mmsize]
+ movu m8, [r4 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE]
+ movu m5, [r1 + r5]
+ movu m6, [r2 + r5]
+ movu m7, [r3 + r5]
+ movu m8, [r4 + r5]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE + mmsize]
+ movu m5, [r1 + r5 + mmsize]
+ movu m6, [r2 + r5 + mmsize]
+ movu m7, [r3 + r5 + mmsize]
+ movu m8, [r4 + r5 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 2]
+ movu m5, [r1 + r5 * 2]
+ movu m6, [r2 + r5 * 2]
+ movu m7, [r3 + r5 * 2]
+ movu m8, [r4 + r5 * 2]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 2 + mmsize]
+ movu m5, [r1 + r5 * 2 + mmsize]
+ movu m6, [r2 + r5 * 2 + mmsize]
+ movu m7, [r3 + r5 * 2 + mmsize]
+ movu m8, [r4 + r5 * 2 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 3]
+ movu m5, [r1 + r7]
+ movu m6, [r2 + r7]
+ movu m7, [r3 + r7]
+ movu m8, [r4 + r7]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+
+ movu m4, [r0 + FENC_STRIDE * 3 + mmsize]
+ movu m5, [r1 + r7 + mmsize]
+ movu m6, [r2 + r7 + mmsize]
+ movu m7, [r3 + r7 + mmsize]
+ movu m8, [r4 + r7 + mmsize]
+
+ psadbw m9, m4, m5
+ paddd m0, m9
+ psadbw m5, m4, m6
+ paddd m1, m5
+ psadbw m6, m4, m7
+ paddd m2, m6
+ psadbw m4, m8
+ paddd m3, m4
+%endmacro
+
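+; Reduce the four ymm accumulators to scalar SADs and store them as four
+; consecutive dwords at r6.  psadbw/paddd leave the partial sums in dwords
+; 0 and 2 of each 128-bit lane, so one vextracti128 plus one pshufd is
+; enough to finish the horizontal reduction.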
+%macro PIXEL_SAD_X4_END_AVX2 0
+ vextracti128 xm4, m0, 1
+ vextracti128 xm5, m1, 1
+ vextracti128 xm6, m2, 1
+ vextracti128 xm7, m3, 1
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m7
+ pshufd xm4, xm0, 2
+ pshufd xm5, xm1, 2
+ pshufd xm6, xm2, 2
+ pshufd xm7, xm3, 2
+ paddd m0, m4
+ paddd m1, m5
+ paddd m2, m6
+ paddd m3, m7
+
+ movd [r6 + 0], xm0
+ movd [r6 + 4], xm1
+ movd [r6 + 8], xm2
+ movd [r6 + 12], xm3
+%endmacro
+
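+; sad_x4 entry points: r0 = encoder block (rows FENC_STRIDE apart),
+; r1-r4 = the four reference blocks (rows r5 = frefstride apart),
+; r6 = output array of four int32 SADs, r7 = frefstride * 3 for the
+; third-row addressing used inside SAD_X4_64x8_AVX2.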
+cglobal pixel_sad_x4_64x16, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+ PIXEL_SAD_X4_END_AVX2
+ RET
+
+cglobal pixel_sad_x4_64x32, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+ PIXEL_SAD_X4_END_AVX2
+ RET
+
+cglobal pixel_sad_x4_64x48, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+ PIXEL_SAD_X4_END_AVX2
+ RET
+
+cglobal pixel_sad_x4_64x64, 7,8,10
+ pxor m0, m0
+ pxor m1, m1
+ pxor m2, m2
+ pxor m3, m3
+ lea r7, [r5 * 3]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+
+ add r0, FENC_STRIDE * 4
+ lea r1, [r1 + r5 * 4]
+ lea r2, [r2 + r5 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r4, [r4 + r5 * 4]
+
+ SAD_X4_64x8_AVX2
+ PIXEL_SAD_X4_END_AVX2
+ RET
+%endif
+
INIT_XMM sse2
SAD_X_SSE2 3, 16, 16, 7
SAD_X_SSE2 3, 16, 8, 7