[x265] [PATCH 1 of 6] asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c->974c, over C code

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Mon Jun 22 14:50:33 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1434712676 -19800
#      Fri Jun 19 16:47:56 2015 +0530
# Node ID a94e9a1f0fde08e060a9b52e3353ce2f242d9257
# Parent  83a7d824442455ba5e0a6b53ea68e6b7043845de
asm: 10bpp sse4 code for saoCuOrgE0, improved 8740c->974c, over C code

diff -r 83a7d8244424 -r a94e9a1f0fde source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/CMakeLists.txt	Fri Jun 19 16:47:56 2015 +0530
@@ -46,7 +46,7 @@
                mc-a2.asm pixel-util8.asm blockcopy8.asm
                pixeladd8.asm dct8.asm)
     if(HIGH_BIT_DEPTH)
-        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
+        set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm loopfilter.asm)
     else()
         set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm intrapred8_allangs.asm ipfilter8.asm loopfilter.asm)
     endif()
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 19 16:47:56 2015 +0530
@@ -1089,6 +1089,8 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        p.saoCuOrgE0 = PFX(saoCuOrgE0_sse4);
+
         LUMA_ADDAVG(sse4);
         CHROMA_420_ADDAVG(sse4);
         CHROMA_422_ADDAVG(sse4);
diff -r 83a7d8244424 -r a94e9a1f0fde source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Mon Jun 22 15:15:33 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Fri Jun 19 16:47:56 2015 +0530
@@ -38,6 +38,7 @@
 cextern pb_128
 cextern pb_2
 cextern pw_2
+cextern pw_1023
 cextern pb_movemask
 
 
@@ -45,6 +46,107 @@
 ; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
 ;============================================================================================================
 INIT_XMM sse4
+%if HIGH_BIT_DEPTH
+cglobal saoCuOrgE0, 4,5,9                   ; 16-bit-pixel path: filters row 0 (.loop) then row 1 (.loopH), 16 pixels per iteration; uses m8 (xmm8) - NOTE(review): presumably x86-64 only, confirm
+    mov         r4d, r4m                    ; r4d = stride (5th argument)
+    movh        m6,  [r1]                   ; m6 = low 8 bytes at offsetEo, used as the pshufb lookup table
+    movzx       r1d, byte [r3]              ; r1d = signLeft[0]
+    pxor        m5, m5                      ; m5 = 0: shift-in source for palignr and lower clamp bound
+    neg         r1b                         ; pre-negate, psignb below negates it back
+    movd        m0, r1d                     ; m0 byte 0 = -signLeft[0], all other bytes zero
+    lea         r1, [r0 + r4 * 2]           ; r1 = rec + stride pixels (2 bytes each): start of second row
+    mov         r4d, r2d                    ; r4d = copy of lcuWidth, counter for .loopH

+.loop:
+    movu        m7, [r0]                    ; m7 = rec[0..7]
+    movu        m8, [r0 + 16]               ; m8 = rec[8..15]
+    movu        m2, [r0 + 2]                ; m2 = rec[1..8]  (right neighbours)
+    movu        m1, [r0 + 18]               ; m1 = rec[9..16] (reads one pixel past this group of 16)

+    pcmpgtw     m3, m7, m2                  ; m3 = -1 where rec[x] > rec[x+1]
+    pcmpgtw     m2, m7                      ; m2 = -1 where rec[x+1] > rec[x]
+    pcmpgtw     m4, m8, m1                  ; same two compares for pixels 8..15
+    pcmpgtw     m1, m8 

+    packsswb    m3, m4                      ; narrow the "greater" word masks to 16 bytes
+    packsswb    m2, m1                      ; narrow the "less" word masks to 16 bytes

+    pand        m3, [pb_1]                  ; presumably pb_1 = bytes of 1: +1 where rec[x] > rec[x+1]
+    por         m3, m2                      ; m3 = per-byte sign(rec[x] - rec[x+1])

+    palignr     m2, m3, m5, 15              ; m2 = m3 moved up one byte, byte 0 = 0 (from m5)
+    por         m2, m0                      ; byte 0 = value carried in m0 (negated sign for pixel 0)

+    mova        m4, [pw_1023]               ; upper clamp bound - NOTE(review): hard-coded 10-bit maximum, confirm no other bit depth uses this path
+    psignb      m2, [pb_128]                ; negate every byte (pb_128 bytes presumably negative): m2 = signLeft
+    pxor        m0, m0
+    palignr     m0, m3, 15                  ; m0 byte 0 = m3 byte 15: carry for the next 16 pixels
+    paddb       m3, m2                      ; sign toward right neighbour + signLeft
+    paddb       m3, [pb_2]                  ; m3 = uiEdgeType
+    pshufb      m2, m6, m3                  ; m2 = offsetEo[uiEdgeType] per byte
+    pmovsxbw    m3, m2                      ; offsetEo for pixels 0..7, sign-extended to words
+    punpckhbw   m2, m2                      ; duplicate the high 8 bytes into words
+    psraw       m2, 8                       ; offsetEo for pixels 8..15, sign-extended to words
+    paddw       m7, m3                      ; apply offsets
+    paddw       m8, m2
+    pmaxsw      m7, m5                      ; clamp below at 0
+    pmaxsw      m8, m5
+    pminsw      m7, m4                      ; clamp above at the bound in m4
+    pminsw      m8, m4
+    movu        [r0], m7                    ; store 16 filtered pixels
+    movu        [r0 + 16], m8

+    add         r0q, 32                     ; advance 16 pixels (32 bytes)
+    sub         r2d, 16
+    jnz        .loop                        ; exits only when the count reaches exactly 0 (lcuWidth must be a non-zero multiple of 16)

+    movzx       r3d, byte [r3 + 1]          ; r3d = signLeft[1], for the second row
+    neg         r3b                         ; pre-negate, as for the first row
+    movd        m0, r3d                     ; m0 byte 0 = -signLeft[1], all other bytes zero
+.loopH:
+    movu        m7, [r1]                    ; m7 = second-row pixels 0..7
+    movu        m8, [r1 + 16]               ; m8 = second-row pixels 8..15
+    movu        m2, [r1 + 2]                ; m2 = pixels 1..8  (right neighbours)
+    movu        m1, [r1 + 18]               ; m1 = pixels 9..16 (reads one pixel past this group of 16)

+    pcmpgtw     m3, m7, m2                  ; m3 = -1 where rec[x] > rec[x+1]
+    pcmpgtw     m2, m7                      ; m2 = -1 where rec[x+1] > rec[x]
+    pcmpgtw     m4, m8, m1                  ; same two compares for pixels 8..15
+    pcmpgtw     m1, m8 

+    packsswb    m3, m4                      ; narrow the "greater" word masks to 16 bytes
+    packsswb    m2, m1                      ; narrow the "less" word masks to 16 bytes

+    pand        m3, [pb_1]                  ; presumably pb_1 = bytes of 1: +1 where rec[x] > rec[x+1]
+    por         m3, m2                      ; m3 = per-byte sign(rec[x] - rec[x+1])

+    palignr     m2, m3, m5, 15              ; m2 = m3 moved up one byte, byte 0 = 0 (from m5)
+    por         m2, m0                      ; byte 0 = value carried in m0 (negated sign for pixel 0)

+    mova        m4, [pw_1023]               ; upper clamp bound - NOTE(review): hard-coded 10-bit maximum, confirm no other bit depth uses this path
+    psignb      m2, [pb_128]                ; negate every byte (pb_128 bytes presumably negative): m2 = signLeft
+    pxor        m0, m0
+    palignr     m0, m3, 15                  ; m0 byte 0 = m3 byte 15: carry for the next 16 pixels
+    paddb       m3, m2                      ; sign toward right neighbour + signLeft
+    paddb       m3, [pb_2]                  ; m3 = uiEdgeType
+    pshufb      m2, m6, m3                  ; m2 = offsetEo[uiEdgeType] per byte
+    pmovsxbw    m3, m2                      ; offsetEo for pixels 0..7, sign-extended to words
+    punpckhbw   m2, m2                      ; duplicate the high 8 bytes into words
+    psraw       m2, 8                       ; offsetEo for pixels 8..15, sign-extended to words
+    paddw       m7, m3                      ; apply offsets
+    paddw       m8, m2
+    pmaxsw      m7, m5                      ; clamp below at 0
+    pmaxsw      m8, m5
+    pminsw      m7, m4                      ; clamp above at the bound in m4
+    pminsw      m8, m4
+    movu        [r1], m7                    ; store 16 filtered pixels
+    movu        [r1 + 16], m8

+    add         r1q, 32                     ; advance 16 pixels (32 bytes)
+    sub         r4d, 16
+    jnz        .loopH                       ; exits only when the count reaches exactly 0 (same width constraint as .loop)
+    RET
+%else ; HIGH_BIT_DEPTH
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov         r4d, r4m
@@ -130,6 +232,7 @@
     sub         r4d, 16
     jnz        .loopH
     RET
+%endif
 
 INIT_YMM avx2
 cglobal saoCuOrgE0, 5, 5, 7, rec, offsetEo, lcuWidth, signLeft, stride
diff -r 83a7d8244424 -r a94e9a1f0fde source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Jun 22 15:15:33 2015 +0530
+++ b/source/test/pixelharness.cpp	Fri Jun 19 16:47:56 2015 +0530
@@ -901,8 +901,8 @@
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
     ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
 
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
+    for (int i = 0; i < 64 * 64; i++)
+        ref_dest[i] = opt_dest[i] = rand() % (PIXEL_MAX);
 
     int j = 0;
 


More information about the x265-devel mailing list