[x265] [PATCH] asm: avx2 code for sad_x4_64xN, improved by more than 40% over SSE

rajesh at multicorewareinc.com
Tue Sep 15 10:52:59 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1442296528 -19800
#      Tue Sep 15 11:25:28 2015 +0530
# Node ID 5fd2ef7bbf09f771d479a11eec2256d02fadf1cf
# Parent  365f7ed4d89628d49cd6af8d81d4edc01f73ffad
asm: avx2 code for sad_x4_64xN, improved by more than 40% over SSE

avx2:            speedup vs C   asm cycles    C cycles
sad_x4[64x16]    75.32x         1488.36       112105.17
sad_x4[64x32]    69.74x         2791.91       194701.75
sad_x4[64x48]    72.11x         4016.55       289624.06
sad_x4[64x64]    59.44x         6698.95       398170.66

ssse3:           speedup vs C   asm cycles    C cycles
sad_x4[64x16]    38.05x         2574.72       97978.73
sad_x4[64x32]    37.85x         5199.53       196795.09
sad_x4[64x48]    38.70x         7573.90       293083.88
sad_x4[64x64]    34.82x         11349.89      395213.00
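
For context, a sad_x4 primitive computes four SAD costs in a single call: the
same 64xN fenc block (stored at the encoder's fixed FENC_STRIDE) is compared
against four reference blocks that share one stride, and the four sums are
written to res[0..3]. The sketch below is a minimal scalar illustration of that
behaviour, assuming an x265-style signature (fenc, fref0..fref3, frefstride,
res), 8-bit pixels, and FENC_STRIDE == 64; the helper name sad_x4_c is
hypothetical and this is not the project's exact C reference implementation.

#include <cstdint>
#include <cstdlib>   // abs()

// Assumption: the encoder-side block lives in a fixed-stride fenc buffer.
static const intptr_t FENC_STRIDE = 64;

// Scalar sketch of sad_x4 for an lx-by-ly block of 8-bit pixels.
template<int lx, int ly>
void sad_x4_c(const uint8_t* fenc,
              const uint8_t* fref0, const uint8_t* fref1,
              const uint8_t* fref2, const uint8_t* fref3,
              intptr_t frefstride, int32_t* res)
{
    res[0] = res[1] = res[2] = res[3] = 0;
    for (int y = 0; y < ly; y++)
    {
        for (int x = 0; x < lx; x++)
        {
            res[0] += abs(fenc[x] - fref0[x]);
            res[1] += abs(fenc[x] - fref1[x]);
            res[2] += abs(fenc[x] - fref2[x]);
            res[3] += abs(fenc[x] - fref3[x]);
        }
        fenc  += FENC_STRIDE;
        fref0 += frefstride;
        fref1 += frefstride;
        fref2 += frefstride;
        fref3 += frefstride;
    }
}

// e.g. sad_x4_c<64, 16>(fenc, p0, p1, p2, p3, stride, res) mirrors the
// pixel_sad_x4_64x16 primitive wired up in this patch.

The AVX2 routines below implement the same computation with two 32-byte loads
per 64-pixel row from each buffer, a psadbw against each of the four references
accumulated into m0-m3, and a final horizontal reduction in
PIXEL_SAD_X4_END_AVX2 that stores the four 32-bit sums to the res pointer (r6).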

diff -r 365f7ed4d896 -r 5fd2ef7bbf09 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Sep 15 11:25:28 2015 +0530
@@ -2866,6 +2866,10 @@
         p.pu[LUMA_32x64].sad_x4 = PFX(pixel_sad_x4_32x64_avx2);
         p.pu[LUMA_32x24].sad_x4 = PFX(pixel_sad_x4_32x24_avx2);
         p.pu[LUMA_32x8].sad_x4 = PFX(pixel_sad_x4_32x8_avx2);
+        p.pu[LUMA_64x16].sad_x4 = PFX(pixel_sad_x4_64x16_avx2);
+        p.pu[LUMA_64x32].sad_x4 = PFX(pixel_sad_x4_64x32_avx2);
+        p.pu[LUMA_64x48].sad_x4 = PFX(pixel_sad_x4_64x48_avx2);
+        p.pu[LUMA_64x64].sad_x4 = PFX(pixel_sad_x4_64x64_avx2);
 
         p.cu[BLOCK_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
         p.cu[BLOCK_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
diff -r 365f7ed4d896 -r 5fd2ef7bbf09 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Tue Sep 08 16:38:01 2015 +0530
+++ b/source/common/x86/sad-a.asm	Tue Sep 15 11:25:28 2015 +0530
@@ -3328,6 +3328,453 @@
     SAD_X4_END_SSE2 1
 %endmacro
 
+%if ARCH_X86_64 == 1 && HIGH_BIT_DEPTH == 0
+INIT_YMM avx2
+%macro SAD_X4_64x8_AVX2 0
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
+    movu            m5, [r1 + r5 * 2 + mmsize]
+    movu            m6, [r2 + r5 * 2 + mmsize]
+    movu            m7, [r3 + r5 * 2 + mmsize]
+    movu            m8, [r4 + r5 * 2 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
+    movu            m5, [r1 + r7 + mmsize]
+    movu            m6, [r2 + r7 + mmsize]
+    movu            m7, [r3 + r7 + mmsize]
+    movu            m8, [r4 + r7 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    movu            m4, [r0]
+    movu            m5, [r1]
+    movu            m6, [r2]
+    movu            m7, [r3]
+    movu            m8, [r4]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + mmsize]
+    movu            m5, [r1 + mmsize]
+    movu            m6, [r2 + mmsize]
+    movu            m7, [r3 + mmsize]
+    movu            m8, [r4 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE]
+    movu            m5, [r1 + r5]
+    movu            m6, [r2 + r5]
+    movu            m7, [r3 + r5]
+    movu            m8, [r4 + r5]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE + mmsize]
+    movu            m5, [r1 + r5 + mmsize]
+    movu            m6, [r2 + r5 + mmsize]
+    movu            m7, [r3 + r5 + mmsize]
+    movu            m8, [r4 + r5 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2]
+    movu            m5, [r1 + r5 * 2]
+    movu            m6, [r2 + r5 * 2]
+    movu            m7, [r3 + r5 * 2]
+    movu            m8, [r4 + r5 * 2]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 2 + mmsize]
+    movu            m5, [r1 + r5 * 2 + mmsize]
+    movu            m6, [r2 + r5 * 2 + mmsize]
+    movu            m7, [r3 + r5 * 2 + mmsize]
+    movu            m8, [r4 + r5 * 2 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3]
+    movu            m5, [r1 + r7]
+    movu            m6, [r2 + r7]
+    movu            m7, [r3 + r7]
+    movu            m8, [r4 + r7]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+
+    movu            m4, [r0 + FENC_STRIDE * 3 + mmsize]
+    movu            m5, [r1 + r7 + mmsize]
+    movu            m6, [r2 + r7 + mmsize]
+    movu            m7, [r3 + r7 + mmsize]
+    movu            m8, [r4 + r7 + mmsize]
+
+    psadbw          m9, m4, m5
+    paddd           m0, m9
+    psadbw          m5, m4, m6
+    paddd           m1, m5
+    psadbw          m6, m4, m7
+    paddd           m2, m6
+    psadbw          m4, m8
+    paddd           m3, m4
+%endmacro
+
+%macro PIXEL_SAD_X4_END_AVX2 0
+    vextracti128   xm4, m0, 1
+    vextracti128   xm5, m1, 1
+    vextracti128   xm6, m2, 1
+    vextracti128   xm7, m3, 1
+    paddd           m0, m4
+    paddd           m1, m5
+    paddd           m2, m6
+    paddd           m3, m7
+    pshufd         xm4, xm0, 2
+    pshufd         xm5, xm1, 2
+    pshufd         xm6, xm2, 2
+    pshufd         xm7, xm3, 2
+    paddd           m0, m4
+    paddd           m1, m5
+    paddd           m2, m6
+    paddd           m3, m7
+
+    movd            [r6 + 0], xm0
+    movd            [r6 + 4], xm1
+    movd            [r6 + 8], xm2
+    movd            [r6 + 12], xm3
+%endmacro
+
+cglobal pixel_sad_x4_64x16, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+    PIXEL_SAD_X4_END_AVX2
+    RET
+
+cglobal pixel_sad_x4_64x32, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+    PIXEL_SAD_X4_END_AVX2
+    RET
+
+cglobal pixel_sad_x4_64x48, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+    PIXEL_SAD_X4_END_AVX2
+    RET
+
+cglobal pixel_sad_x4_64x64, 7,8,10
+    pxor            m0, m0
+    pxor            m1, m1
+    pxor            m2, m2
+    pxor            m3, m3
+    lea             r7, [r5 * 3]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+
+    add             r0, FENC_STRIDE * 4
+    lea             r1, [r1 + r5 * 4]
+    lea             r2, [r2 + r5 * 4]
+    lea             r3, [r3 + r5 * 4]
+    lea             r4, [r4 + r5 * 4]
+
+    SAD_X4_64x8_AVX2
+    PIXEL_SAD_X4_END_AVX2
+    RET
+%endif
+
 INIT_XMM sse2
 SAD_X_SSE2 3, 16, 16, 7
 SAD_X_SSE2 3, 16,  8, 7

