[x265] [PATCH] sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately

Divya Manivannan divya at multicorewareinc.com
Wed Apr 22 08:44:37 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429684176 -19800
#      Wed Apr 22 11:59:36 2015 +0530
# Node ID 584211b333ac9640d81423b3f60a18956425e27c
# Parent  86268e498680951069c48b681eef830b0aa37873
sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately

diff -r 86268e498680 -r 584211b333ac source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/loopfilter.cpp	Wed Apr 22 11:59:36 2015 +0530
@@ -122,25 +122,6 @@
     }
 }
 
-void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
-{
-    int8_t signDown;
-    int8_t edgeType;
-
-    for (int y = 0; y < 2; y++)
-    {
-        for (int x = startX + 1; x < endX; x++)
-        {
-            signDown = signOf(rec[x] - rec[x + stride]);
-            edgeType = signDown + upBuff1[x] + 2;
-            upBuff1[x - 1] = -signDown;
-            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
-        }
-        upBuff1[endX - 1] = upBuff[y];
-        rec += stride + 1;
-    }
-}
-
 void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
 {
     #define SAO_BO_BITS 5
@@ -164,8 +145,8 @@
     p.saoCuOrgE1 = processSaoCUE1;
     p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
     p.saoCuOrgE2 = processSaoCUE2;
-    p.saoCuOrgE3 = processSaoCUE3;
-    p.saoCuOrgE3_2Rows = processSaoCUE3_2Rows;
+    p.saoCuOrgE3[0] = processSaoCUE3;
+    p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
 }
diff -r 86268e498680 -r 584211b333ac source/common/primitives.h
--- a/source/common/primitives.h	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/primitives.h	Wed Apr 22 11:59:36 2015 +0530
@@ -172,7 +172,6 @@
 typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
-typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
@@ -278,8 +277,7 @@
     saoCuOrgE0_t          saoCuOrgE0;
     saoCuOrgE1_t          saoCuOrgE1, saoCuOrgE1_2Rows;
     saoCuOrgE2_t          saoCuOrgE2;
-    saoCuOrgE3_t          saoCuOrgE3;
-    saoCuOrgE3_2Rows_t    saoCuOrgE3_2Rows;
+    saoCuOrgE3_t          saoCuOrgE3[2];
     saoCuOrgB0_t          saoCuOrgB0;
 
     downscale_t           frameInitLowres;
diff -r 86268e498680 -r 584211b333ac source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 22 11:59:36 2015 +0530
@@ -1519,8 +1519,8 @@
         p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
         p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
-        p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
-        p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
+        p.saoCuOrgE3[0] = x265_saoCuOrgE3_sse4;
+        p.saoCuOrgE3[1] = x265_saoCuOrgE3_sse4;
         p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
 
         LUMA_ADDAVG(sse4);
@@ -1728,7 +1728,7 @@
         p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
         p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
-        p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
+        p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
         p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
 
         p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r 86268e498680 -r 584211b333ac source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Wed Apr 22 11:59:36 2015 +0530
@@ -582,135 +582,6 @@
     movhps          [r1 + r5 - 1], xm7
     RET
 
-;=============================================================================================================================
-;void saoCuOrgE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
-;=============================================================================================================================
-INIT_XMM sse4
-cglobal saoCuOrgE3_2Rows, 3, 7, 8
-    mov             r3d, r3m
-    mov             r4d, r4m
-    movu            m5, [r2]
-    mov             r2d, r5m
-    mov             r6,  r6m
-
-    movh            m7, [r0 + r2]
-    movhps          m7, [r1 + r2 - 1]
-
-    inc             r4d
-    add             r0, r4
-    add             r1, r4
-
-    sub             r2d, r4d
-    pxor            m0, m0                      ; m0 = 0
-    mova            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
-
-.loop:
-    movu            m1, [r0]                    ; m1 = pRec[x]
-    movu            m2, [r0 + r3]               ; m2 = pRec[x + iStride]
-
-    psubusb         m3, m2, m1
-    psubusb         m4, m1, m2
-    pcmpeqb         m3, m0
-    pcmpeqb         m4, m0
-    pcmpeqb         m2, m1
-
-    pabsb           m3, m3
-    por             m4, m3
-    pandn           m2, m4                      ; m2 = iSignDown
-
-    movu            m3, [r1]                    ; m3 = m_iUpBuff1
-
-    paddb           m3, m2
-    paddb           m3, m6                      ; m3 = uiEdgeType
-
-    pshufb          m4, m5, m3
-
-    psubb           m3, m0, m2
-    movu            [r1 - 1], m3
-
-    pmovzxbw        m2, m1
-    punpckhbw       m1, m0
-    pmovsxbw        m3, m4
-    punpckhbw       m4, m4
-    psraw           m4, 8
-
-    paddw           m2, m3
-    paddw           m1, m4
-    packuswb        m2, m1
-    movu            [r0], m2
-
-    add             r0, 16
-    add             r1, 16
-    sub             r2, 16
-    jg              .loop
-
-    add             r0, r2
-    add             r1, r2
-    movh            [r0], m7
-    movhps          [r1 - 1], m7
-
-    mov             r5d, r5m
-    mov             r2b, byte[r6]
-    mov             byte[r1 - 1], r2b
-
-    sub             r0, r5
-    lea             r0, [r0 + r3 + 1]
-
-    movh            m7, [r0 + r5]
-    movhps          m7, [r1 - 1]
-
-    sub             r1, r5
-    add             r0, r4
-    add             r1, r4
-    sub             r5d, r4d
-
-.loop1:
-    movu            m1, [r0]                    ; m1 = pRec[x]
-    movu            m2, [r0 + r3]               ; m2 = pRec[x + iStride]
-
-    psubusb         m3, m2, m1
-    psubusb         m4, m1, m2
-    pcmpeqb         m3, m0
-    pcmpeqb         m4, m0
-    pcmpeqb         m2, m1
-
-    pabsb           m3, m3
-    por             m4, m3
-    pandn           m2, m4                      ; m2 = iSignDown
-
-    movu            m3, [r1]                    ; m3 = m_iUpBuff1
-
-    paddb           m3, m2
-    paddb           m3, m6                      ; m3 = uiEdgeType
-
-    pshufb          m4, m5, m3
-
-    psubb           m3, m0, m2
-    movu            [r1 - 1], m3
-
-    pmovzxbw        m2, m1
-    punpckhbw       m1, m0
-    pmovsxbw        m3, m4
-    punpckhbw       m4, m4
-    psraw           m4, 8
-
-    paddw           m2, m3
-    paddw           m1, m4
-    packuswb        m2, m1
-    movu            [r0], m2
-
-    add             r0, 16
-    add             r1, 16
-    sub             r5, 16
-    jg              .loop1
-
-    movh            [r0 + r5], m7
-    movhps          [r1 + r5 - 1], m7
-
-    mov             r2b, byte[r6 + 1]
-    mov             byte[r1 + r5 - 1], r2b
-    RET
-
 ;=====================================================================================
 ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
 ;=====================================================================================
diff -r 86268e498680 -r 584211b333ac source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.h	Wed Apr 22 11:59:36 2015 +0530
@@ -34,7 +34,6 @@
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
-void x265_saoCuOrgE3_2Rows_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 86268e498680 -r 584211b333ac source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/encoder/sao.cpp	Wed Apr 22 11:59:36 2015 +0530
@@ -516,41 +516,20 @@
             if (rpelx == picWidth)
                 upBuff1[ctuWidth - 1] = lastSign;
 
-            int diff = endY - startY;
-            for (y = 0; y < (diff >> 1); y++)
-            {
-                int8_t signDown, signDown0, upBuff[2];
-                int edgeType1;
-
-                signDown = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
-                edgeType1 = signDown + upBuff1[startX] + 2;
-                rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType1]];
-
-                signDown = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
-                signDown0 = signOf(rec[startX + 1] - rec[startX + stride]);
-                edgeType1 = signDown - signDown0 + 2;
-                upBuff1[startX - 1] = -signDown;
-
-                upBuff[0] = signOf(rec[endX - 1 + stride] - rec[endX]);
-                upBuff[1] = signOf(rec[endX - 1 + 2 * stride] - rec[endX + stride]);
-
-                primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, upBuff);
-
-                rec[startX + stride] = m_clipTable[rec[startX + stride] + m_offsetEo[edgeType1]];
-
-                rec += 2 * stride;
-            }
-            if (diff & 1)
-            {
-                int8_t signDown1 = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
-                int edgeType = signDown1 + upBuff1[startX] + 2;
-                upBuff1[startX - 1] = -signDown1;
-                rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType]];
-
-                primitives.saoCuOrgE3(rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
-
-                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
-            }
+            for (y = startY; y < endY; y++)
+            {
+                x = startX;
+                int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
+                int edgeType = signDown + upBuff1[x] + 2;
+                upBuff1[x - 1] = -signDown;
+                rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+
+                primitives.saoCuOrgE3[endX > 16](rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
+
+                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+
+                rec += stride;
+            }
         }
 
         break;
diff -r 86268e498680 -r 584211b333ac source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/test/pixelharness.cpp	Wed Apr 22 11:59:36 2015 +0530
@@ -66,7 +66,7 @@
         sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
         ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
         psbuf1[i] = psbuf4[i] = (rand() % 65) - 32;                   // range is between -32 to 32
-        psbuf2[i] = psbuf5[i] = psbuf6[i] = psbuf7[i] = (rand() % 3) - 1; // possible values {-1,0,1}
+        psbuf2[i] = psbuf5[i] = (rand() % 3) - 1;                     // possible values {-1,0,1}
         psbuf3[i] = (rand() % 129) - 128;
         sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
     }
@@ -1011,34 +1011,34 @@
     return true;
 }
 
-bool PixelHarness::check_saoCuOrgE3_2Rows_t(saoCuOrgE3_2Rows_t ref, saoCuOrgE3_2Rows_t opt)
-{
-    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
-    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
-
-    memset(ref_dest, 0xCD, sizeof(ref_dest));
-    memset(opt_dest, 0xCD, sizeof(opt_dest));
-
-    int j = 0;
-
-    for (int i = 0; i < ITERS; i++)
-    {
-        int stride = 16 * (rand() % 4 + 1);
-        int start = rand() % 2;
-        int end = (16 * (rand() % 4 + 1)) - rand() % 2;
-
-        ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end, psbuf6 + j);
-        checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end, psbuf7 + j);
-
-        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
-            return false;
-
-        reportfail();
-        j += INCR;
-    }
-
-    return true;
-}
+bool PixelHarness::check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int stride = 32 * (rand() % 2 + 1);
+        int start = rand() % 2;
+        int end = (32 * (rand() % 2 + 1)) - rand() % 2;
+
+        ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end);
+        checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end);
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
 
 bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
 {
@@ -1788,20 +1788,20 @@
         }
     }
 
-    if (opt.saoCuOrgE3)
+    if (opt.saoCuOrgE3[0])
     {
-        if (!check_saoCuOrgE3_t(ref.saoCuOrgE3, opt.saoCuOrgE3))
+        if (!check_saoCuOrgE3_t(ref.saoCuOrgE3[0], opt.saoCuOrgE3[0]))
         {
-            printf("SAO_EO_3 failed\n");
+            printf("SAO_EO_3[0] failed\n");
             return false;
         }
     }
 
-    if (opt.saoCuOrgE3_2Rows)
+    if (opt.saoCuOrgE3[1])
     {
-        if (!check_saoCuOrgE3_2Rows_t(ref.saoCuOrgE3_2Rows, opt.saoCuOrgE3_2Rows))
+        if (!check_saoCuOrgE3_32_t(ref.saoCuOrgE3[1], opt.saoCuOrgE3[1]))
         {
-            printf("SAO_EO_3_2Rows failed\n");
+            printf("SAO_EO_3[1] failed\n");
             return false;
         }
     }
@@ -2192,16 +2192,16 @@
         REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
     }
 
-    if (opt.saoCuOrgE3)
+    if (opt.saoCuOrgE3[0])
     {
-        HEADER0("SAO_EO_3");
-        REPORT_SPEEDUP(opt.saoCuOrgE3, ref.saoCuOrgE3, pbuf1, psbuf2, psbuf1, 64, 0, 64);
+        HEADER0("SAO_EO_3[0]");
+        REPORT_SPEEDUP(opt.saoCuOrgE3[0], ref.saoCuOrgE3[0], pbuf1, psbuf2, psbuf1, 64, 0, 64);
     }
 
-    if (opt.saoCuOrgE3_2Rows)
+    if (opt.saoCuOrgE3[1])
     {
-        HEADER0("SAO_EO_3_2Rows");
-        REPORT_SPEEDUP(opt.saoCuOrgE3_2Rows, ref.saoCuOrgE3_2Rows, pbuf1, psbuf2, psbuf1, 64, 0, 64, psbuf6);
+        HEADER0("SAO_EO_3[1]");
+        REPORT_SPEEDUP(opt.saoCuOrgE3[1], ref.saoCuOrgE3[1], pbuf1, psbuf2, psbuf1, 64, 0, 64);
     }
 
     if (opt.saoCuOrgB0)
diff -r 86268e498680 -r 584211b333ac source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Apr 22 00:00:39 2015 -0500
+++ b/source/test/pixelharness.h	Wed Apr 22 11:59:36 2015 +0530
@@ -51,8 +51,6 @@
     int8_t   psbuf3[BUFFSIZE];
     int8_t   psbuf4[BUFFSIZE];
     int8_t   psbuf5[BUFFSIZE];
-    int8_t   psbuf6[BUFFSIZE];
-    int8_t   psbuf7[BUFFSIZE];
 
     int16_t  sbuf1[BUFFSIZE];
     int16_t  sbuf2[BUFFSIZE];
@@ -100,7 +98,7 @@
     bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
     bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
     bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
-    bool check_saoCuOrgE3_2Rows_t(saoCuOrgE3_2Rows_t ref, saoCuOrgE3_2Rows_t opt);
+    bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);


More information about the x265-devel mailing list