[x265] [PATCH 1 of 3] asm: SA8D avx2 asm code for main12

dnyaneshwar at multicorewareinc.com
Wed Dec 9 09:50:50 CET 2015


# HG changeset patch
# User Dnyaneshwar Gorade <goradedd at gmail.com>
# Date 1449647037 -19800
#      Wed Dec 09 13:13:57 2015 +0530
# Node ID e2b07541670331ab0cd94b5f312f8f7cac893f92
# Parent  b80087c9bf25697c3d354d732323fc895a2ca11f
asm: SA8D avx2 asm code for main12

             speedup   avx2 (cycles)   C ref (cycles)
sa8d[  8x8]  4.70x        564.58          2652.82
sa8d[ 8x16]  4.00x       1358.06          5429.52
sa8d[16x16]  5.57x       2013.70         11212.47
sa8d[16x32]  3.90x       5610.47         21883.35
sa8d[32x32]  5.36x       8274.18         44361.61
sa8d[32x64]  3.86x      23024.04         88901.80
sa8d[64x64]  4.35x      45509.79        198165.11
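
The new kernels compute the standard SA8D metric: the residual of an
8x8 block is run through horizontal and vertical 8-point Hadamard
transforms and the absolute values of the coefficients are summed.
For 12-bit input the intermediate sums no longer fit comfortably in
16-bit lanes, which is why these kernels widen each row to dwords
(pmovzxwd) up front. For orientation, here is a minimal scalar sketch
of the computation -- an illustration written for this note, not the
reference code in x265's pixel.cpp; the helper names and the uint16_t
pixel typedef are assumptions of the sketch:

    #include <cstdint>

    typedef uint16_t pixel;   // 12-bit samples stored in 16-bit words

    // Unnormalized 8-point Hadamard transform, in place (butterflies).
    static void hadamard8(int32_t v[8])
    {
        for (int step = 1; step < 8; step <<= 1)
            for (int j = 0; j < 8; j += step << 1)
                for (int k = j; k < j + step; k++)
                {
                    int32_t a = v[k], b = v[k + step];
                    v[k]        = a + b;
                    v[k + step] = a - b;
                }
    }

    static int sa8d_8x8_ref(const pixel* pix1, intptr_t stride1,
                            const pixel* pix2, intptr_t stride2)
    {
        int32_t d[8][8];
        for (int i = 0; i < 8; i++)        // residual, widened to dwords
            for (int j = 0; j < 8; j++)
                d[i][j] = (int32_t)pix1[i * stride1 + j]
                        - (int32_t)pix2[i * stride2 + j];

        for (int i = 0; i < 8; i++)        // transform rows
            hadamard8(d[i]);

        for (int j = 0; j < 8; j++)        // transform columns
        {
            int32_t c[8];
            for (int i = 0; i < 8; i++) c[i] = d[i][j];
            hadamard8(c);
            for (int i = 0; i < 8; i++) d[i][j] = c[i];
        }

        int sum = 0;                       // 64 * 64 * 4095 < 2^31: no overflow
        for (int i = 0; i < 8; i++)
            for (int j = 0; j < 8; j++)
                sum += d[i][j] < 0 ? -d[i][j] : d[i][j];

        return (sum + 2) >> 2;             // scalar-reference rounding
    }

The assembly reaches the same value by a different route: the last
(cross-lane) butterfly stage is folded into the absolute-value step via
max(|a|,|b|) = (|a+b| + |a-b|) / 2, so each accumulated 8x8 partial sum
is half the full coefficient sum and only a final (sum + 1) >> 1 is
applied per block.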

diff -r b80087c9bf25 -r e2b075416703 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 08 15:52:21 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 09 13:13:57 2015 +0530
@@ -1313,6 +1313,9 @@
     }
     if (cpuMask & X265_CPU_AVX2)
     {
+#if X265_DEPTH == 12
+        ASSIGN_SA8D(avx2);
+#endif
         p.cu[BLOCK_4x4].intra_filter = PFX(intra_filter_4x4_avx2);
 
         // TODO: the planecopy_sp is really planecopy_SC now, must be fix it
diff -r b80087c9bf25 -r e2b075416703 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Dec 08 15:52:21 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Wed Dec 09 13:13:57 2015 +0530
@@ -6499,6 +6499,1357 @@
 %endif ; !ARCH_X86_64
 %endmacro ; SA8D
 
+
+%if ARCH_X86_64 == 1 && BIT_DEPTH == 12
+INIT_YMM avx2
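+; Internal helper: computes one 8x8 SA8D block of 12-bit samples.
+; Inputs:  r0/r2 = pix1/pix2, r1/r3 = strides in bytes,
+;          r4/r5 = 3 * stride (set up by the public entry points).
+; Output:  m0 = packed dword partial sums; callers reduce them
+;          horizontally and apply the final (sum + 1) >> 1 rounding.
+; On return r0/r2 point at row 4 of the block.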
+cglobal sa8d_8x8_12bit
+    pmovzxwd        m0, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m0, m9
+
+    pmovzxwd        m1, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m1, m9
+
+    pmovzxwd        m2, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m2, m9
+
+    pmovzxwd        m8, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m8, m9
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+
+    pmovzxwd        m4, [r0]
+    pmovzxwd        m9, [r2]
+    psubd           m4, m9
+
+    pmovzxwd        m5, [r0 + r1]
+    pmovzxwd        m9, [r2 + r3]
+    psubd           m5, m9
+
+    pmovzxwd        m3, [r0 + r1 * 2]
+    pmovzxwd        m9, [r2 + r3 * 2]
+    psubd           m3, m9
+
+    pmovzxwd        m7, [r0 + r4]
+    pmovzxwd        m9, [r2 + r5]
+    psubd           m7, m9
+
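+    ; 8-point Hadamard butterflies (sum/difference pairs) over the eight
+    ; residual rows, interleaved with dword/qword unpacks that reorder
+    ; coefficients between the transform stages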
+    mova            m6, m0
+    paddd           m0, m1
+    psubd           m1, m6
+    mova            m6, m2
+    paddd           m2, m8
+    psubd           m8, m6
+    mova            m6, m0
+
+    punpckldq       m0, m1
+    punpckhdq       m6, m1
+
+    mova            m1, m0
+    paddd           m0, m6
+    psubd           m6, m1
+    mova            m1, m2
+
+    punpckldq       m2, m8
+    punpckhdq       m1, m8
+
+    mova            m8, m2
+    paddd           m2, m1
+    psubd           m1, m8
+    mova            m8, m4
+    paddd           m4, m5
+    psubd           m5, m8
+    mova            m8, m3
+    paddd           m3, m7
+    psubd           m7, m8
+    mova            m8, m4
+
+    punpckldq       m4, m5
+    punpckhdq       m8, m5
+
+    mova            m5, m4
+    paddd           m4, m8
+    psubd           m8, m5
+    mova            m5, m3
+    punpckldq       m3, m7
+    punpckhdq       m5, m7
+
+    mova            m7, m3
+    paddd           m3, m5
+    psubd           m5, m7
+    mova            m7, m0
+    paddd           m0, m2
+    psubd           m2, m7
+    mova            m7, m6
+    paddd           m6, m1
+    psubd           m1, m7
+    mova            m7, m0
+
+    punpcklqdq      m0, m2
+    punpckhqdq      m7, m2
+
+    mova            m2, m0
+    paddd           m0, m7
+    psubd           m7, m2
+    mova            m2, m6
+
+    punpcklqdq      m6, m1
+    punpckhqdq      m2, m1
+
+    mova            m1, m6
+    paddd           m6, m2
+    psubd           m2, m1
+    mova            m1, m4
+    paddd           m4, m3
+    psubd           m3, m1
+    mova            m1, m8
+    paddd           m8, m5
+    psubd           m5, m1
+    mova            m1, m4
+
+    punpcklqdq      m4, m3
+    punpckhqdq      m1, m3
+
+    mova            m3, m4
+    paddd           m4, m1
+    psubd           m1, m3
+    mova            m3, m8
+
+    punpcklqdq      m8, m5
+    punpckhqdq      m3, m5
+
+    mova            m5, m8
+    paddd           m8, m3
+    psubd           m3, m5
+    mova            m5, m0
+    paddd           m0, m4
+    psubd           m4, m5
+    mova            m5, m7
+    paddd           m7, m1
+    psubd           m1, m5
+    mova            m5, m0
+
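+    ; recombine the 128-bit lanes and take per-element absolute values as
+    ; max(x, -x); the pmaxsd of the two lane combinations effectively folds
+    ; the final cross-lane butterfly into the reduction, since
+    ; max(|a|,|b|) = (|a+b| + |a-b|) / 2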
+    vinserti128     m0, m0, xm4, 1
+    vperm2i128      m5, m5, m4, 00110001b
+
+    pxor            m4, m4
+    psubd           m4, m0
+    pmaxsd          m0, m4
+    pxor            m4, m4
+    psubd           m4, m5
+    pmaxsd          m5, m4
+    pmaxsd          m0, m5
+    mova            m4, m7
+
+    vinserti128     m7, m7, xm1, 1
+    vperm2i128      m4, m4, m1, 00110001b
+
+    pxor            m1, m1
+    psubd           m1, m7
+    pmaxsd          m7, m1
+    pxor            m1, m1
+    psubd           m1, m4
+    pmaxsd          m4, m1
+    pmaxsd          m7, m4
+    mova            m1, m6
+    paddd           m6, m8
+    psubd           m8, m1
+    mova            m1, m2
+    paddd           m2, m3
+    psubd           m3, m1
+    mova            m1, m6
+
+    vinserti128     m6, m6, xm8, 1
+    vperm2i128      m1, m1, m8, 00110001b
+
+    pxor            m8, m8
+    psubd           m8, m6
+    pmaxsd          m6, m8
+    pxor            m8, m8
+    psubd           m8, m1
+    pmaxsd          m1, m8
+    pmaxsd          m6, m1
+    mova            m8, m2
+
+    vinserti128     m2, m2, xm3, 1
+    vperm2i128      m8, m8, m3, 00110001b
+
+    pxor            m3, m3
+    psubd           m3, m2
+    pmaxsd          m2, m3
+    pxor            m3, m3
+    psubd           m3, m8
+    pmaxsd          m8, m3
+    pmaxsd          m2, m8
+    paddd           m0, m6
+    paddd           m0, m7
+    paddd           m0, m2
+    ret
+
+cglobal pixel_sa8d_8x8, 4,6,10
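+    ; strides arrive in pixel units; double them to bytes (16-bit samples)
+    ; and precompute 3 * stride for the fourth-row loads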
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+
+    call            sa8d_8x8_12bit
+
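+    ; horizontal reduction: fold the eight dword partial sums in m0 into
+    ; one scalar, then round: sa8d = (sum + 1) >> 1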
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    movd            eax, xm0
+    add             eax, 1
+    shr             eax, 1
+    RET
+
+cglobal pixel_sa8d_8x16, 4,7,11
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+    pxor            m10, m10
+
+    call            sa8d_8x8_12bit
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm10, xm0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm0, xm10
+    movd            eax, xm0
+    RET
+
+cglobal pixel_sa8d_16x16, 4,8,11
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+    mov             r6, r0
+    mov             r7, r2
+    pxor            m10, m10
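+    ; m10 gathers the vector partial sums of all four 8x8 sub-blocks;
+    ; a single horizontal reduction and rounding follows at the end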
+
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    movd            eax, xm0
+    add             eax, 1
+    shr             eax, 1
+    RET
+
+cglobal pixel_sa8d_16x32, 4,8,12
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+    mov             r6, r0
+    mov             r7, r2
+    pxor            m10, m10
+    pxor            m11, m11
+
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
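+    ; advance r6/r7 by 16 rows; lea scales by at most 8, so add
+    ; 8 * stride twice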
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+    movd            eax, xm11
+    RET
+
+cglobal pixel_sa8d_32x32, 4,8,12
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+    mov             r6, r0
+    mov             r7, r2
+    pxor            m10, m10
+    pxor            m11, m11
+
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+    movd            eax, xm11
+    RET
+
+cglobal pixel_sa8d_32x64, 4,8,12
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+    mov             r6, r0
+    mov             r7, r2
+    pxor            m10, m10
+    pxor            m11, m11
+
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+    movd            eax, xm11
+    RET
+
+cglobal pixel_sa8d_64x64, 4,8,12
+    add             r1d, r1d
+    add             r3d, r3d
+    lea             r4, [r1 + r1 * 2]
+    lea             r5, [r3 + r3 * 2]
+    mov             r6, r0
+    mov             r7, r2
+    pxor            m10, m10
+    pxor            m11, m11
+
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 64]
+    lea             r2, [r7 + 64]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 80]
+    lea             r2, [r7 + 80]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 96]
+    lea             r2, [r7 + 96]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 112]
+    lea             r2, [r7 + 112]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 64]
+    lea             r2, [r7 + 64]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 80]
+    lea             r2, [r7 + 80]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 96]
+    lea             r2, [r7 + 96]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 112]
+    lea             r2, [r7 + 112]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 64]
+    lea             r2, [r7 + 64]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 80]
+    lea             r2, [r7 + 80]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 96]
+    lea             r2, [r7 + 96]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 112]
+    lea             r2, [r7 + 112]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    lea             r6, [r6 + r1 * 8]
+    lea             r6, [r6 + r1 * 8]
+    lea             r7, [r7 + r3 * 8]
+    lea             r7, [r7 + r3 * 8]
+    pxor            m10, m10
+    mov             r0, r6
+    mov             r2, r7
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 16]
+    lea             r2, [r7 + 16]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 32]
+    lea             r2, [r7 + 32]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 48]
+    lea             r2, [r7 + 48]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 64]
+    lea             r2, [r7 + 64]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 80]
+    lea             r2, [r7 + 80]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+
+    pxor            m10, m10
+    lea             r0, [r6 + 96]
+    lea             r2, [r7 + 96]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r6 + 112]
+    lea             r2, [r7 + 112]
+    call            sa8d_8x8_12bit
+    paddd           m10, m0
+
+    lea             r0, [r0 + r1 * 4]
+    lea             r2, [r2 + r3 * 4]
+    call            sa8d_8x8_12bit
+    paddd           m0, m10
+
+    vextracti128    xm6, m0, 1
+    paddd           xm0, xm6
+
+    movhlps         xm6, xm0
+    paddd           xm0, xm6
+
+    pshuflw         xm6, xm0, 0Eh
+    paddd           xm0, xm6
+    paddd           xm0, [pd_1]
+    psrld           xm0, 1
+    paddd           xm11, xm0
+    movd            eax, xm11
+    RET
+%endif
+
+
 ;=============================================================================
 ; INTRA SATD
 ;=============================================================================
@@ -6510,7 +7861,9 @@
 %define movdqu movups
 %define punpcklqdq movlhps
 INIT_XMM sse2
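+; for 12-bit builds the generic SA8D instantiations below are skipped;
+; main12 gets its SA8D from the dedicated AVX2 kernels added above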
+%if BIT_DEPTH <= 10
 SA8D
+%endif
 SATDS_SSE2
 
 %if HIGH_BIT_DEPTH == 0
@@ -6526,8 +7879,10 @@
 %define LOAD_SUMSUB_16P  LOAD_SUMSUB_16P_SSSE3
 %endif
 INIT_XMM ssse3
+%if BIT_DEPTH <= 10
+SA8D
+%endif
 SATDS_SSE2
-SA8D
 %undef movdqa ; nehalem doesn't like movaps
 %undef movdqu ; movups
 %undef punpcklqdq ; or movlhps
@@ -6535,21 +7890,24 @@
 %define TRANS TRANS_SSE4
 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_PENRYN
 INIT_XMM sse4
+%if BIT_DEPTH <= 10
+SA8D
+%endif
 SATDS_SSE2
-SA8D
 
 ; Sandy/Ivy Bridge and Bulldozer do movddup in the load unit, so
 ; it's effectively free.
 %define LOAD_DUP_4x8P LOAD_DUP_4x8P_CONROE
 INIT_XMM avx
+SA8D
 SATDS_SSE2
-SA8D
 
 %define TRANS TRANS_XOP
 INIT_XMM xop
+%if BIT_DEPTH <= 10
+SA8D
+%endif
 SATDS_SSE2
-SA8D
-
 
 %if HIGH_BIT_DEPTH == 0
 %define LOAD_SUMSUB_8x4P LOAD_SUMSUB8_16x4P_AVX2


