[x265] [PATCH] asm: avx2 10bit code for sign primitive(356.91 -> 242.00)

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Thu Jun 25 13:15:34 CEST 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1435230598 -19800
#      Thu Jun 25 16:39:58 2015 +0530
# Node ID 1f24ff6471506c0ff5fd4addce149169976f845b
# Parent  a0de1e88f3b1a10d6f8cf656a95e6ec37e1bc134
asm: avx2 10bit code for sign primitive(356.91 -> 242.00)

avx2:
calSign  9.08x    242.00          2197.71

sse4:
calSign  6.16x    356.91          2197.63

diff -r a0de1e88f3b1 -r 1f24ff647150 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Jun 25 16:39:58 2015 +0530
@@ -1496,6 +1496,7 @@
         p.scale1D_128to64 = PFX(scale1D_128to64_avx2);
         p.scale2D_64to32 = PFX(scale2D_64to32_avx2);
         p.weight_pp = PFX(weight_pp_avx2);
+        p.sign = PFX(calSign_avx2);
 
         p.cu[BLOCK_16x16].calcresidual = PFX(getResidual16_avx2);
         p.cu[BLOCK_32x32].calcresidual = PFX(getResidual32_avx2);
diff -r a0de1e88f3b1 -r 1f24ff647150 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Jun 25 16:25:51 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Thu Jun 25 16:39:58 2015 +0530
@@ -1415,6 +1415,41 @@
 %endif
 
 INIT_YMM avx2
+%if HIGH_BIT_DEPTH
+cglobal calSign, 4, 7, 5                ; r0 = byte dst, r1/r2 = 16-bit sources, r3d = element count
+    mova            m0, [pw_1]          ; presumably words of 1 (defined elsewhere); mask for the +1 result
+    mov             r4d, r3d            ; r4 = count = byte offset of the first dst byte past the valid range
+    shr             r3d, 4
+    add             r3d, 1              ; r3d = count / 16 + 1 loop iterations, 16 elements each
+    mov             r5, r0              ; r5 = original dst, used below to measure bytes written
+    movu            m4, [r0 + r4]       ; save 32 dst bytes at dst + count; the loop may overwrite them

+.loop:
+    movu            m1, [r1]            ; m1 = pRec[x]
+    movu            m2, [r2]            ; m2 = pTmpU[x]

+    pcmpgtw         m3, m1, m2          ; m3 = 0xFFFF where m1 > m2 (signed word compare)
+    pcmpgtw         m2, m1              ; m2 = 0xFFFF (-1) where m2 > m1

+    pand            m3, m0              ; m3 = m0's word (presumably 1) where m1 > m2, else 0
+    por             m3, m2              ; merge in -1 where m2 > m1; equal words stay 0
+    packsswb        m3, m3              ; saturate words to bytes within each 128-bit lane
+    vpermq          m3, m3, q3220       ; bring the high lane's 8 bytes next to the low lane's 8 bytes
+    movu            [r0], xm3           ; store 16 result bytes

+    add             r0, 16
+    add             r1, 32
+    add             r2, 32
+    dec             r3d
+    jnz             .loop

+    mov             r6, r0
+    sub             r6, r5              ; r6 = bytes written by the loop
+    sub             r4, r6              ; r4 = count - bytes written (negative)
+    movu            [r0 + r4], m4       ; r0 + r4 = dst + count: restore the bytes saved before the loop
+    RET
+%else ; HIGH_BIT_DEPTH
+
 cglobal calSign, 4, 5, 6
     vbroadcasti128  m0,     [pb_128]
     mova            m1,     [pb_1]
@@ -1463,3 +1498,4 @@
 
 .end:
     RET
+%endif


More information about the x265-devel mailing list