[x265] [PATCH] asm: psyCost_ss_8x8 in sse4: improve 7652c->2515c

Divya Manivannan divya at multicorewareinc.com
Tue Jan 13 10:29:35 CET 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1421141305 -19800
#      Tue Jan 13 14:58:25 2015 +0530
# Node ID 77aa3a2e457a6759a4385cf459c10615d0b5d1b9
# Parent  50a2071500dc4b813edb357c298867931bbf42a1
asm: psyCost_ss_8x8 in sse4: improve 7652c->2515c

diff -r 50a2071500dc -r 77aa3a2e457a source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 12 20:01:58 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jan 13 14:58:25 2015 +0530
@@ -1429,6 +1429,8 @@
         p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_sse4;
         p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_sse4;
         p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_sse4;
+
+        p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_sse4;
 #endif
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
     }
@@ -1716,6 +1718,8 @@
         p.cu[BLOCK_16x16].psy_cost_pp = x265_psyCost_pp_16x16_sse4;
         p.cu[BLOCK_32x32].psy_cost_pp = x265_psyCost_pp_32x32_sse4;
         p.cu[BLOCK_64x64].psy_cost_pp = x265_psyCost_pp_64x64_sse4;
+
+        p.cu[BLOCK_8x8].psy_cost_ss = x265_psyCost_ss_8x8_sse4;
 #endif
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_sse4;
     }
diff -r 50a2071500dc -r 77aa3a2e457a source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Mon Jan 12 20:01:58 2015 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Jan 13 14:58:25 2015 +0530
@@ -43,7 +43,8 @@
 mask_1100: times 2 dd 0, -1
 hmul_8w:   times 4 dw 1
            times 2 dw 1, -1
-
+ALIGN 32
+hmul_w:    dw 1, -1, 1, -1, 1, -1, 1, -1
 ALIGN 32
 transd_shuf1: SHUFFLE_MASK_W 0, 8, 2, 10, 4, 12, 6, 14
 transd_shuf2: SHUFFLE_MASK_W 1, 9, 3, 11, 5, 13, 7, 15
@@ -68,6 +69,7 @@
 cextern pw_pmmpzzzz
 cextern pd_1
 cextern popcnt_table
+cextern pd_2
 
 ;=============================================================================
 ; SATD
@@ -7723,3 +7725,583 @@
     pabsd           m0, m7
     movd            eax, m0
     RET
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal psyCost_ss_8x8, 4, 6, 15
+
+    mova            m13, [hmul_w]
+    mova            m14, [pw_1]
+    add             r1, r1
+    add             r3, r3
+    lea             r4, [3 * r1]
+    movu            m0, [r0]
+    movu            m1, [r0 + r1]
+    movu            m2, [r0 + r1 * 2]
+    movu            m3, [r0 + r4]
+    lea             r5, [r0 + r1 * 4]
+    movu            m4, [r5]
+    movu            m5, [r5 + r1]
+    movu            m6, [r5 + r1 * 2]
+    movu            m7, [r5 + r4]
+
+    pabsw           m8, m0
+    pabsw           m9, m1
+    paddw           m8, m9
+    pabsw           m10, m2
+    pabsw           m11, m3
+    paddw           m10, m11
+    paddw           m8, m10
+    pabsw           m9, m4
+    pabsw           m10, m5
+    paddw           m9, m10
+    pabsw           m11, m6
+    pabsw           m12, m7
+    paddw           m11, m12
+    paddw           m9, m11
+    paddw           m8, m9
+    movhlps         m9, m8
+    pmovzxwd        m8, m8
+    pmovzxwd        m9, m9
+    paddd           m8, m9
+    movhlps         m9, m8
+    paddd           m8, m9
+    psrldq          m9, m8, 4
+    paddd           m8, m9
+    psrld           m8, 2
+
+    pmaddwd         m0, m13
+    pmaddwd         m1, m13
+    pmaddwd         m2, m13
+    pmaddwd         m3, m13
+
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+
+    SUMSUB_BA d, 0, 1, 9
+    SUMSUB_BA d, 2, 3, 9
+    SUMSUB_BA d, 0, 2, 9
+    SUMSUB_BA d, 1, 3, 9
+
+    pmaddwd         m4, m13
+    pmaddwd         m5, m13
+    pmaddwd         m6, m13
+    pmaddwd         m7, m13
+
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+
+    SUMSUB_BA d, 4, 5, 9
+    SUMSUB_BA d, 6, 7, 9
+    SUMSUB_BA d, 4, 6, 9
+    SUMSUB_BA d, 5, 7, 9
+
+    SUMSUB_BA d, 0, 4, 9
+    SUMSUB_BA d, 1, 5, 9
+    SUMSUB_BA d, 2, 6, 9
+    SUMSUB_BA d, 3, 7, 9
+
+    pabsd           m0, m0
+    pabsd           m2, m2
+    pabsd           m1, m1
+    pabsd           m3, m3
+    pabsd           m4, m4
+    pabsd           m5, m5
+    pabsd           m6, m6
+    pabsd           m7, m7
+
+    paddd           m0, m2
+    paddd           m1, m3
+    paddd           m0, m1
+    paddd           m5, m4
+    paddd           m0, m5
+    paddd           m7, m6
+    paddd           m11, m0, m7
+
+    movu            m0, [r0]
+    movu            m1, [r0 + r1]
+    movu            m2, [r0 + r1 * 2]
+    movu            m3, [r0 + r4]
+
+    pmaddwd         m0, m14
+    pmaddwd         m1, m14
+    pmaddwd         m2, m14
+    pmaddwd         m3, m14
+
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+
+    SUMSUB_BA d, 0, 1, 9
+    SUMSUB_BA d, 2, 3, 9
+    SUMSUB_BA d, 0, 2, 9
+    SUMSUB_BA d, 1, 3, 9
+
+    movu            m4, [r5]
+    movu            m5, [r5 + r1]
+    movu            m6, [r5 + r1 * 2]
+    movu            m7, [r5 + r4]
+
+    pmaddwd         m4, m14
+    pmaddwd         m5, m14
+    pmaddwd         m6, m14
+    pmaddwd         m7, m14
+
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+
+    SUMSUB_BA d, 4, 5, 9
+    SUMSUB_BA d, 6, 7, 9
+    SUMSUB_BA d, 4, 6, 9
+    SUMSUB_BA d, 5, 7, 9
+
+    SUMSUB_BA d, 0, 4, 9
+    SUMSUB_BA d, 1, 5, 9
+    SUMSUB_BA d, 2, 6, 9
+    SUMSUB_BA d, 3, 7, 9
+
+    pabsd           m0, m0
+    pabsd           m2, m2
+    pabsd           m1, m1
+    pabsd           m3, m3
+    pabsd           m4, m4
+    pabsd           m5, m5
+    pabsd           m6, m6
+    pabsd           m7, m7
+
+    paddd           m0, m2
+    paddd           m1, m3
+    paddd           m0, m1
+    paddd           m5, m4
+    paddd           m0, m5
+    paddd           m7, m6
+    paddd           m0, m7
+    paddd           m0, m11
+
+    movhlps         m1, m0
+    paddd           m0, m1
+    psrldq          m1, m0, 4
+    paddd           m0, m1
+    paddd           m0, [pd_2]
+    psrld           m0, 2
+    psubd           m12, m0, m8
+
+    lea             r4, [3 * r3]
+    movu            m0, [r2]
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + r3 * 2]
+    movu            m3, [r2 + r4]
+    lea             r5, [r2 + r3 * 4]
+    movu            m4, [r5]
+    movu            m5, [r5 + r3]
+    movu            m6, [r5 + r3 * 2]
+    movu            m7, [r5 + r4]
+
+    pabsw           m8, m0
+    pabsw           m9, m1
+    paddw           m8, m9
+    pabsw           m10, m2
+    pabsw           m11, m3
+    paddw           m10, m11
+    paddw           m8, m10
+    pabsw           m9, m4
+    pabsw           m10, m5
+    paddw           m9, m10
+    pabsw           m11, m6
+    pabsw           m10, m7
+    paddw           m11, m10
+    paddw           m9, m11
+    paddw           m8, m9
+    movhlps         m9, m8
+    pmovzxwd        m8, m8
+    pmovzxwd        m9, m9
+    paddd           m8, m9
+    movhlps         m9, m8
+    paddd           m8, m9
+    psrldq          m9, m8, 4
+    paddd           m8, m9
+    psrld           m8, 2
+
+    pmaddwd         m0, m13
+    pmaddwd         m1, m13
+    pmaddwd         m2, m13
+    pmaddwd         m3, m13
+
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+
+    SUMSUB_BA d, 0, 1, 9
+    SUMSUB_BA d, 2, 3, 9
+    SUMSUB_BA d, 0, 2, 9
+    SUMSUB_BA d, 1, 3, 9
+
+    pmaddwd         m4, m13
+    pmaddwd         m5, m13
+    pmaddwd         m6, m13
+    pmaddwd         m7, m13
+
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+
+    SUMSUB_BA d, 4, 5, 9
+    SUMSUB_BA d, 6, 7, 9
+    SUMSUB_BA d, 4, 6, 9
+    SUMSUB_BA d, 5, 7, 9
+
+    SUMSUB_BA d, 0, 4, 9
+    SUMSUB_BA d, 1, 5, 9
+    SUMSUB_BA d, 2, 6, 9
+    SUMSUB_BA d, 3, 7, 9
+
+    pabsd           m0, m0
+    pabsd           m2, m2
+    pabsd           m1, m1
+    pabsd           m3, m3
+    pabsd           m4, m4
+    pabsd           m5, m5
+    pabsd           m6, m6
+    pabsd           m7, m7
+
+    paddd           m0, m2
+    paddd           m1, m3
+    paddd           m0, m1
+    paddd           m5, m4
+    paddd           m0, m5
+    paddd           m7, m6
+    paddd           m11, m0, m7
+
+    movu            m0, [r2]
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + r3 * 2]
+    movu            m3, [r2 + r4]
+
+    pmaddwd         m0, m14
+    pmaddwd         m1, m14
+    pmaddwd         m2, m14
+    pmaddwd         m3, m14
+
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+    psrldq          m9, m0, 4
+    psubd           m10, m0, m9
+    paddd           m0, m9
+    shufps          m0, m10, 10001000b
+
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+    psrldq          m9, m1, 4
+    psubd           m10, m1, m9
+    paddd           m1, m9
+    shufps          m1, m10, 10001000b
+
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+    psrldq          m9, m2, 4
+    psubd           m10, m2, m9
+    paddd           m2, m9
+    shufps          m2, m10, 10001000b
+
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+    psrldq          m9, m3, 4
+    psubd           m10, m3, m9
+    paddd           m3, m9
+    shufps          m3, m10, 10001000b
+
+    SUMSUB_BA d, 0, 1, 9
+    SUMSUB_BA d, 2, 3, 9
+    SUMSUB_BA d, 0, 2, 9
+    SUMSUB_BA d, 1, 3, 9
+
+    movu            m4, [r5]
+    movu            m5, [r5 + r3]
+    movu            m6, [r5 + r3 * 2]
+    movu            m7, [r5 + r4]
+
+    pmaddwd         m4, m14
+    pmaddwd         m5, m14
+    pmaddwd         m6, m14
+    pmaddwd         m7, m14
+
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+    psrldq          m9, m4, 4
+    psubd           m10, m4, m9
+    paddd           m4, m9
+    shufps          m4, m10, 10001000b
+
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+    psrldq          m9, m5, 4
+    psubd           m10, m5, m9
+    paddd           m5, m9
+    shufps          m5, m10, 10001000b
+
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+    psrldq          m9, m6, 4
+    psubd           m10, m6, m9
+    paddd           m6, m9
+    shufps          m6, m10, 10001000b
+
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+    psrldq          m9, m7, 4
+    psubd           m10, m7, m9
+    paddd           m7, m9
+    shufps          m7, m10, 10001000b
+
+    SUMSUB_BA d, 4, 5, 9
+    SUMSUB_BA d, 6, 7, 9
+    SUMSUB_BA d, 4, 6, 9
+    SUMSUB_BA d, 5, 7, 9
+
+    SUMSUB_BA d, 0, 4, 9
+    SUMSUB_BA d, 1, 5, 9
+    SUMSUB_BA d, 2, 6, 9
+    SUMSUB_BA d, 3, 7, 9
+
+    pabsd           m0, m0
+    pabsd           m2, m2
+    pabsd           m1, m1
+    pabsd           m3, m3
+    pabsd           m4, m4
+    pabsd           m5, m5
+    pabsd           m6, m6
+    pabsd           m7, m7
+
+    paddd           m0, m2
+    paddd           m1, m3
+    paddd           m0, m1
+    paddd           m5, m4
+    paddd           m0, m5
+    paddd           m7, m6
+    paddd           m0, m7
+    paddd           m0, m11
+
+    movhlps         m1, m0
+    paddd           m0, m1
+    psrldq          m1, m0, 4
+    paddd           m0, m1
+    paddd           m0, [pd_2]
+    psrld           m0, 2
+    psubd           m0, m8
+
+    psubd           m12, m0
+    pabsd           m0, m12
+    movd            eax, m0
+    RET
+%endif
diff -r 50a2071500dc -r 77aa3a2e457a source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Mon Jan 12 20:01:58 2015 +0530
+++ b/source/common/x86/pixel.h	Tue Jan 13 14:58:25 2015 +0530
@@ -224,6 +224,7 @@
 int x265_psyCost_pp_32x32_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 int x265_psyCost_pp_64x64_sse4(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 int x265_psyCost_ss_4x4_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
+int x265_psyCost_ss_8x8_sse4(const int16_t* source, intptr_t sstride, const int16_t* recon, intptr_t rstride);
 
 #undef DECL_PIXELS
 #undef DECL_HEVC_SSD


More information about the x265-devel mailing list