[x265] [PATCH 1 of 2] asm: saoCuOrgE3 asm code

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Wed Jan 7 10:00:16 CET 2015


# HG changeset patch
# User Nabajit Deka
# Date 1420620491 -19800
#      Wed Jan 07 14:18:11 2015 +0530
# Node ID 9ec89f245be8ca4468362cb095172dbc92bd5140
# Parent  6cc757f662ed982a2f64122eba8e557d8ef0abba
asm: saoCuOrgE3 asm code

diff -r 6cc757f662ed -r 9ec89f245be8 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/common/loopfilter.cpp	Wed Jan 07 14:18:11 2015 +0530
@@ -87,6 +87,22 @@
     }
 }
 
+void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
+{
+    // For every column strictly between startX and endX: compare the sample with the one
+    // `stride` elements further on, add the offset for that edge class and clamp the result.
+    for (int col = startX + 1; col < endX; col++)
+    {
+        const int8_t below = signOf(rec[col] - rec[col + stride]);
+        const int8_t category = below + upBuff1[col] + 2;
+        upBuff1[col - 1] = -below; // negated sign is stored one slot to the left
+        short sum = rec[col] + m_offsetEo[category];
+        if (sum < 0)
+            sum = 0;
+        rec[col] = (pixel)(sum > (PIXEL_MAX) ? (PIXEL_MAX) : sum);
+    }
+}
+
 void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
 {
     #define SAO_BO_BITS 5
@@ -113,6 +129,7 @@
     p.saoCuOrgE0 = processSaoCUE0;
     p.saoCuOrgE1 = processSaoCUE1;
     p.saoCuOrgE2 = processSaoCUE2;
+    p.saoCuOrgE3 = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
 }
diff -r 6cc757f662ed -r 9ec89f245be8 source/common/primitives.h
--- a/source/common/primitives.h	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/common/primitives.h	Wed Jan 07 14:18:11 2015 +0530
@@ -193,6 +193,7 @@
 typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
 typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
@@ -248,6 +249,7 @@
     saoCuOrgE0_t          saoCuOrgE0;
     saoCuOrgE1_t          saoCuOrgE1;
     saoCuOrgE2_t          saoCuOrgE2;
+    saoCuOrgE3_t          saoCuOrgE3;
     saoCuOrgB0_t          saoCuOrgB0;
 
     downscale_t           frameInitLowres;
diff -r 6cc757f662ed -r 9ec89f245be8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jan 07 14:18:11 2015 +0530
@@ -1652,6 +1652,7 @@
         p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
         p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
         p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
+        p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
         p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
 
         LUMA_ADDAVG(_sse4);
diff -r 6cc757f662ed -r 9ec89f245be8 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Wed Jan 07 14:18:11 2015 +0530
@@ -188,6 +188,88 @@
          jnz         .loop
     RET
 
+;=======================================================================================================
+;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
+;
+; For each x in [startX + 1, endX), treating pixels as unsigned bytes:
+;     signDown       = sign(rec[x] - rec[x + stride])
+;     rec[x]         = saturate_u8(rec[x] + m_offsetEo[signDown + upBuff1[x] + 2])
+;     upBuff1[x - 1] = -signDown
+;
+; Sixteen samples are handled per iteration, so the last stores can run past endX. The 16 bytes
+; at &rec[endX] and at &upBuff1[endX - 1] are saved first and written back at the end, which
+; requires both ranges to be addressable. An empty range (startX + 1 >= endX) is a no-op.
+;=======================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE3, 4, 7, 8
+    ; cglobal loads 4 args: r0 = rec, r1 = upBuff1, r2 = m_offsetEo, r3 = stride (not truncated)
+    mov             r4d, r4m                    ; r4 = startX
+    mov             r5d, r5m                    ; r5 = endX
+
+    inc             r4d                         ; r4 = startX + 1, the first x
+    cmp             r4d, r5d
+    jge             .quit                       ; nothing to filter
+
+    lea             r6, [r0 + r5]               ; r6 = &rec[endX]
+    add             r0, r4                      ; r0 = &rec[startX + 1]
+    add             r1, r4                      ; r1 = &upBuff1[startX + 1]
+    sub             r5d, r4d                    ; r5 = samples left to filter (> 0)
+    lea             r4, [r1 + r5 - 1]           ; r4 = &upBuff1[endX - 1]
+
+    movu            m7, [r6]                    ; m7 = original rec[endX .. endX + 15]
+    movu            m6, [r4]                    ; m6 = original upBuff1[endX - 1 .. endX + 14]
+    pxor            m0, m0                      ; m0 = 0
+    movu            m5, [pb_2]                  ; m5 = [2, 2, ..., 2]
+
+.loop:
+    movu            m1, [r0]                    ; m1 = pRec[x]
+    movu            m2, [r0 + r3]               ; m2 = pRec[x + iStride]
+
+    psubusb         m3, m2, m1
+    psubusb         m4, m1, m2
+    pcmpeqb         m3, m0
+    pcmpeqb         m4, m0
+    pcmpeqb         m2, m1
+
+    pabsb           m3, m3
+    por             m4, m3
+    pandn           m2, m4                      ; m2 = iSignDown
+
+    movu            m3, [r1]                    ; m3 = m_iUpBuff1
+
+    paddb           m3, m2
+    paddb           m3, m5                      ; m3 = uiEdgeType
+
+    movu            m4, [r2]                    ; m4 = m_iOffsetEo
+    pshufb          m4, m3                      ; m4 = m_iOffsetEo[uiEdgeType]
+
+    psubb           m3, m0, m2
+    movu            [r1 - 1], m3                ; m_iUpBuff1[x - 1] = -iSignDown
+
+    ; widen pixels (unsigned) and offsets (signed) to words, add, then saturate back to bytes
+    pmovzxbw        m2, m1
+    punpckhbw       m1, m0
+    pmovsxbw        m3, m4
+    punpckhbw       m4, m4
+    psraw           m4, 8
+
+    paddw           m2, m3
+    paddw           m1, m4
+    packuswb        m2, m1
+    movu            [r0], m2
+
+    add             r0, 16
+    add             r1, 16
+    sub             r5d, 16
+    jg              .loop
+
+    ; put back whatever the final iteration wrote beyond the requested range
+    movu            [r6], m7
+    movu            [r4], m6
+
+.quit:
+    RET
+
 ;=====================================================================================
 ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
 ;=====================================================================================
diff -r 6cc757f662ed -r 9ec89f245be8 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/common/x86/loopfilter.h	Wed Jan 07 14:18:11 2015 +0530
@@ -28,6 +28,7 @@
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
 void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
+void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
diff -r 6cc757f662ed -r 9ec89f245be8 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/test/pixelharness.cpp	Wed Jan 07 14:18:11 2015 +0530
@@ -978,6 +978,35 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt)
+{
+    // Feed the reference and the optimized primitive identical inputs and require the
+    // pixel plane and the sign buffer they update to stay byte-identical.
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        // stride: 16, 32, 48 or 64; startX: 0 or 1; endX: a multiple of 16, or one less
+        int rowPitch = 16 * (rand() % 4 + 1);
+        int firstX = rand() % 2;
+        int lastX = (16 * (rand() % 4 + 1)) - rand() % 2;
+
+        ref(ref_dest, psbuf2 + offset, psbuf1 + offset, rowPitch, firstX, lastX);
+        checked(opt, opt_dest, psbuf5 + offset, psbuf1 + offset, rowPitch, firstX, lastX);
+
+        if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)))
+            return false;
+        if (memcmp(psbuf2, psbuf5, BUFFSIZE))
+            return false;
+        reportfail();
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
 {
     ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1542,6 +1571,15 @@
         }
     }
 
+    if (opt.saoCuOrgE3)
+    {
+        if (!check_saoCuOrgE3_t(ref.saoCuOrgE3, opt.saoCuOrgE3))
+        {
+            printf("SAO_EO_3 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuOrgB0)
     {
         if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
@@ -1892,6 +1930,12 @@
         REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
     }
 
+    if (opt.saoCuOrgE3)
+    {
+        HEADER0("SAO_EO_3");
+        REPORT_SPEEDUP(opt.saoCuOrgE3, ref.saoCuOrgE3, pbuf1, psbuf2, psbuf1, 64, 0, 64);
+    }
+
     if (opt.saoCuOrgB0)
     {
         HEADER0("SAO_BO_0");
diff -r 6cc757f662ed -r 9ec89f245be8 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Jan 07 13:44:23 2015 +0530
+++ b/source/test/pixelharness.h	Wed Jan 07 14:18:11 2015 +0530
@@ -96,6 +96,7 @@
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
     bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
     bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
+    bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);


More information about the x265-devel mailing list