[x265] [PATCH] asm: psyCost_pp_8x8 for HIGH_BIT_DEPTH in sse4: improve 6995c->1070c

Divya Manivannan divya at multicorewareinc.com
Tue Dec 30 12:49:13 CET 2014


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1419940084 -19800
#      Tue Dec 30 17:18:04 2014 +0530
# Node ID 259bf78bae8c6bd9bfefa8e8575de8620d20971c
# Parent  f15a798c41f69a053b1694399230b38eec8cb1a5
asm: psyCost_pp_8x8 for HIGH_BIT_DEPTH in sse4: improve 6995c->1070c

diff -r f15a798c41f6 -r 259bf78bae8c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Dec 30 14:35:08 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Dec 30 17:18:04 2014 +0530
@@ -1436,6 +1436,9 @@
         INTRA_ANG_SSE4_HIGH(sse4);
 
         p.psy_cost_pp[BLOCK_4x4] = x265_psyCost_pp_4x4_sse4;
+#if X86_64
+        p.psy_cost_pp[BLOCK_8x8] = x265_psyCost_pp_8x8_sse4;
+#endif
     }
     if (cpuMask & X265_CPU_XOP)
     {
diff -r f15a798c41f6 -r 259bf78bae8c source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Dec 30 14:35:08 2014 +0530
+++ b/source/common/x86/pixel-a.asm	Tue Dec 30 17:18:04 2014 +0530
@@ -6748,7 +6748,83 @@
 INIT_XMM sse4
 cglobal psyCost_pp_8x8, 4, 6, 13
 
-    FIX_STRIDES r1, r3
+%if HIGH_BIT_DEPTH
+    FIX_STRIDES r1, r3
+    lea             r4, [3 * r1]
+    pxor            m10, m10
+    movu            m0, [r0]
+    movu            m1, [r0 + r1]
+    movu            m2, [r0 + r1 * 2]
+    movu            m3, [r0 + r4]
+    lea             r5, [r0 + r1 * 4]
+    movu            m4, [r5]
+    movu            m5, [r5 + r1]
+    movu            m6, [r5 + r1 * 2]
+    movu            m7, [r5 + r4]
+
+    paddw           m8, m0, m1
+    paddw           m8, m2
+    paddw           m8, m3
+    paddw           m8, m4
+    paddw           m8, m5
+    paddw           m8, m6
+    paddw           m8, m7
+    pmaddwd         m8, [pw_1]
+    movhlps         m9, m8
+    paddd           m8, m9
+    psrldq          m9, m8, 4
+    paddd           m8, m9
+    psrld           m8, 2
+
+    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+    paddd           m0, m1
+    paddd           m0, m2
+    paddd           m0, m3
+    HADDUW m0, m1
+    paddd           m0, [pd_1]
+    psrld           m0, 1
+    psubd           m10, m0, m8
+
+    lea             r4, [3 * r3]
+    movu            m0, [r2]
+    movu            m1, [r2 + r3]
+    movu            m2, [r2 + r3 * 2]
+    movu            m3, [r2 + r4]
+    lea             r5, [r2 + r3 * 4]
+    movu            m4, [r5]
+    movu            m5, [r5 + r3]
+    movu            m6, [r5 + r3 * 2]
+    movu            m7, [r5 + r4]
+
+    paddw           m8, m0, m1
+    paddw           m8, m2
+    paddw           m8, m3
+    paddw           m8, m4
+    paddw           m8, m5
+    paddw           m8, m6
+    paddw           m8, m7
+    pmaddwd         m8, [pw_1]
+    movhlps         m9, m8
+    paddd           m8, m9
+    psrldq          m9, m8, 4
+    paddd           m8, m9
+    psrld           m8, 2
+
+    HADAMARD8_2D 0, 1, 2, 3, 4, 5, 6, 7, 9, amax
+
+    paddd           m0, m1
+    paddd           m0, m2
+    paddd           m0, m3
+    HADDUW m0, m1
+    paddd           m0, [pd_1]
+    psrld           m0, 1
+    psubd           m0, m8
+    psubd           m10, m0
+    pabsd           m0, m10
+    movd            eax, m0
+
+%else ; !HIGH_BIT_DEPTH
     lea             r4, [3 * r1]
     mova            m8, [hmul_8p]
 
@@ -6842,6 +6918,6 @@
     psubd           m12, m0
     pabsd           m0, m12
     movd            eax, m0
-
+%endif ; HIGH_BIT_DEPTH
     RET
 %endif


More information about the x265-devel mailing list