[x265] [PATCH 1 of 3] asm: general calSign to accelerate sao

Min Chen chenm003 at 163.com
Fri Apr 3 13:10:44 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1428059407 -28800
# Node ID abf20efa2234fb7cd6a474d4dac6e3051a94b30c
# Parent  9a5fa67583feb6ffb7668f82632f7e93e5ec9415
asm: general calSign to accelerate sao
---
 source/common/x86/const-a.asm    |    3 ++
 source/common/x86/loopfilter.asm |   69 ++++++++++++++++++++++++++-----------
 source/encoder/sao.cpp           |   14 ++------
 source/test/pixelharness.cpp     |    8 ++--
 4 files changed, 58 insertions(+), 36 deletions(-)

diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Thu Apr 02 13:21:32 2015 -0500
+++ b/source/common/x86/const-a.asm	Fri Apr 03 19:10:07 2015 +0800
@@ -65,6 +65,9 @@
 const pb_32,       times 32 db 32
 const pb_128,      times 16 db 128
 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
+const pb_movemask, times 16 db 0x00
+                   times 16 db 0xFF
+                   
 
 const pw_0_15,     times 2 dw 0, 1, 2, 3, 4, 5, 6, 7
 const pw_2,        times 8 dw 2
diff -r 9a5fa67583fe -r abf20efa2234 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Thu Apr 02 13:21:32 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Fri Apr 03 19:10:07 2015 +0800
@@ -36,6 +36,7 @@
 cextern pb_128
 cextern pb_2
 cextern pw_2
+cextern pb_movemask
 
 
 ;============================================================================================================
@@ -321,29 +322,55 @@
     RET
 
 ;============================================================================================================
-; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int endX)
+; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int width)
 ;============================================================================================================
 INIT_XMM sse4
-cglobal calSign, 4, 4, 6
-    mova        m1,    [pb_128]
-    mova        m0,    [pb_1]
-    shr         r3d,   4
-.loop
-    movu        m2,    [r1]        ; m2 = pRec[x]
-    movu        m3,    [r2]        ; m3 = pTmpU[x]
+cglobal calSign, 4,5,6
+    mova        m0,     [pb_128]             ; m0 = 0x80 per byte; XOR with it flips the top bit so the signed pcmpgtb below orders the bytes as unsigned values
+    mova        m1,     [pb_1]               ; m1 = pb_1 (defined outside this patch; presumably 0x01 per byte, i.e. the "+1" result)
 
-    pxor        m4,    m2,    m1
-    pxor        m3,    m1
-    pcmpgtb     m5,    m4,    m3
-    pcmpgtb     m3,    m4
-    pand        m5,    m0
-    por         m5,    m3
+    sub         r1,     r0                   ; r1 = src1 - dst, so [r0 + r1] addresses src1 while only r0 is advanced
+    sub         r2,     r0                   ; r2 = src2 - dst, same trick for the second source
 
-    movu        [r0],  m5
+    mov         r4d,    r3d                  ; keep the full width in r4d for the tail handling
+    shr         r3d,    4                    ; r3d = number of complete 16-byte groups
+    jz         .next                         ; width < 16: no complete group, go straight to the tail
+.loop:
+    movu        m2,     [r0 + r1]            ; m2 = pRec[x]
+    movu        m3,     [r0 + r2]            ; m3 = pTmpU[x]
+    pxor        m4,     m2,     m0           ; m4 = src1 bytes with the top bit flipped
+    pxor        m3,     m0                   ; m3 = src2 bytes with the top bit flipped
+    pcmpgtb     m5,     m4,     m3           ; m5 = 0xFF in lanes where src1 > src2
+    pcmpgtb     m3,     m4                   ; m3 = 0xFF in lanes where src2 > src1
+    pand        m5,     m1                   ; m5 = m1's byte where src1 > src2, otherwise 0
+    por         m5,     m3                   ; merge in 0xFF (-1 as int8_t) where src2 > src1; equal lanes stay 0
+    movu        [r0],   m5                   ; store 16 sign bytes to dst
 
-    add         r0,    16
-    add         r1,    16
-    add         r2,    16
+    add         r0,     16                   ; advance dst; both source addresses follow through r1/r2
     dec         r3d
     jnz        .loop
+
+    ; process partial: the last (width & 15) bytes, if any
+.next:
+    and         r4d, 15                      ; r4d = number of leftover bytes (0..15)
+    jz         .end                          ; width was a multiple of 16: nothing left to do
+
+    movu        m2,     [r0 + r1]            ; m2 = pRec[x] (still a full 16-byte load, even though fewer bytes are valid)
+    movu        m3,     [r0 + r2]            ; m3 = pTmpU[x] (full 16-byte load as well)
+    pxor        m4,     m2,     m0           ; same sign computation as in .loop
+    pxor        m3,     m0
+    pcmpgtb     m5,     m4,     m3
+    pcmpgtb     m3,     m4
+    pand        m5,     m1
+    por         m5,     m3                   ; m5 = 16 candidate sign bytes; only the first r4 of them are wanted
+
+    lea         r3,     [pb_movemask + 16]   ; pb_movemask is 16 bytes of 0x00 followed by 16 bytes of 0xFF
+    sub         r3,     r4                   ; start r4 bytes before the 0xFF run: r4 leading 0x00 bytes, then 0xFF
+    movu        xmm0,   [r3]                 ; xmm0 = blend mask; under INIT_XMM this should be the same register as m0, whose constant is not needed again
+    movu        m3,     [r0]                 ; m3 = the 16 bytes currently stored at dst
+    pblendvb    m5,     m5,     m3,     xmm0 ; keep computed bytes where the mask is 0x00, existing dst bytes where it is 0xFF
+    movu        [r0],   m5                   ; 16-byte store; bytes past the leftover count are written back unchanged
+
+.end:
     RET
+
diff -r 9a5fa67583fe -r abf20efa2234 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Thu Apr 02 13:21:32 2015 -0500
+++ b/source/encoder/sao.cpp	Fri Apr 03 19:10:07 2015 +0800
@@ -783,13 +783,7 @@
                 rec += stride;
             }
 
-            if (!(ctuWidth & 15))
-                primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
-            else
-            {
-                for (x = 0; x < ctuWidth; x++)
-                    upBuff1[x] = signOf(rec[x] - rec[x - stride]);
-            }
+            primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
 
             for (y = startY; y < endY; y++)
             {
@@ -832,8 +826,7 @@
                 rec += stride;
             }
 
-            for (x = startX; x < endX; x++)
-                upBuff1[x] = signOf(rec[x] - rec[x - stride - 1]);
+            primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
 
             for (y = startY; y < endY; y++)
             {
@@ -879,8 +872,7 @@
                 rec += stride;
             }
 
-            for (x = startX - 1; x < endX; x++)
-                upBuff1[x] = signOf(rec[x] - rec[x - stride + 1]);
+            primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
 
             for (y = startY; y < endY; y++)
             {
diff -r 9a5fa67583fe -r abf20efa2234 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Apr 02 13:21:32 2015 -0500
+++ b/source/test/pixelharness.cpp	Fri Apr 03 19:10:07 2015 +0800
@@ -870,8 +870,8 @@
 
 bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
 {
-    ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
-    ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
+    ALIGN_VAR_16(int8_t, ref_dest[64 * 2]);
+    ALIGN_VAR_16(int8_t, opt_dest[64 * 2]);
 
     memset(ref_dest, 0xCD, sizeof(ref_dest));
     memset(opt_dest, 0xCD, sizeof(opt_dest));
@@ -880,12 +880,12 @@
 
     for (int i = 0; i < ITERS; i++)
     {
-        int width = 16 * (rand() % 4 + 1);
+        int width = (rand() % 64) + 1;
 
         ref(ref_dest, pbuf2 + j, pbuf3 + j, width);
         checked(opt, opt_dest, pbuf2 + j, pbuf3 + j, width);
 
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(int8_t)))
+        if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
             return false;
 
         reportfail();



More information about the x265-devel mailing list