[x265] [PATCH] sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows

Divya Manivannan divya at multicorewareinc.com
Wed Apr 1 15:40:46 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1427895336 -19800
#      Wed Apr 01 19:05:36 2015 +0530
# Node ID f718abdc8004d0c859266b292730b7b5b3d0d4df
# Parent  ac85c775620f1dcb0df056874633cbf916098bd2
sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows

diff -r ac85c775620f -r f718abdc8004 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/loopfilter.cpp	Wed Apr 01 19:05:36 2015 +0530
@@ -42,18 +42,23 @@
         dst[x] = signOf(src1[x] - src2[x]);
 }
 
-void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
+void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride) // adds edge offsets in place to two rows (rec and rec + stride), comparing each pixel with its left and right neighbours
 {
-    int x;
-    int8_t signRight;
+    int x, y;
+    int8_t signRight, signLeft0; // signLeft0: sign against the left neighbour, carried along the current row
     int8_t edgeType;
 
-    for (x = 0; x < width; x++)
+    for (y = 0; y < 2; y++) // always exactly two rows per call
     {
-        signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
-        edgeType = signRight + signLeft + 2;
-        signLeft  = -signRight;
-        rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        signLeft0 = signLeft[y]; // caller-supplied seed for the first pixel of row y
+        for (x = 0; x < width; x++)
+        {
+            signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0; // reads rec[x + 1], i.e. one pixel past the row when x == width - 1
+            edgeType = signRight + signLeft0 + 2; // index into offsetEo; stays in 0..4 provided both signs are in {-1, 0, 1}
+            signLeft0 = -signRight; // the next pixel sees this same comparison from the other side
+            rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+        }
+        rec += stride; // step to the second row
     }
 }
 
diff -r ac85c775620f -r f718abdc8004 source/common/primitives.h
--- a/source/common/primitives.h	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/primitives.h	Wed Apr 01 19:05:36 2015 +0530
@@ -169,7 +169,7 @@
 typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
 typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
-typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t* signLeft, intptr_t stride);
 typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
diff -r ac85c775620f -r f718abdc8004 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Wed Apr 01 19:05:36 2015 +0530
@@ -39,20 +39,25 @@
 
 
 ;============================================================================================================
-; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
+; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
 ;============================================================================================================
 INIT_XMM sse4
-cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
+cglobal saoCuOrgE0, 5, 6, 8, rec, offsetEo, lcuWidth, signLeft, stride
 
-    neg         r3                          ; r3 = -signLeft
-    movzx       r3d, r3b
-    movd        m0, r3d
-    mova        m4, [pb_128]                ; m4 = [80]
-    pxor        m5, m5                      ; m5 = 0
-    movu        m6, [r1]                    ; m6 = offsetEo
+    mov         r4d, r4m                    ; r4d = stride; NOTE(review): only the low 32 bits of the intptr_t argument are taken here - confirm strides are always non-negative and fit in 32 bits
+    mova        m4,  [pb_128]                ; m4 = [80]
+    pxor        m5,  m5                      ; m5 = 0
+    movu        m6,  [r1]                    ; m6 = offsetEo
+
+    movzx       r5d, byte [r3]              ; r5d = signLeft[0]
+    inc         r3                          ; r3 -> signLeft[1], loaded once .loop has finished
+    neg         r5b                         ; r5b = -signLeft[0] (the psignb in the loop negates it back)
+    movd        m0, r5d                     ; m0 byte 0 = -signLeft[0], remaining bytes zero
+    lea         r5, [r0 + r4]               ; r5 = rec + stride (second row)
+    mov         r4d, r2d                    ; r4d = lcuWidth, the counter for .loopH
 
 .loop:
-    movu        m7, [r0]                    ; m1 = rec[x]
+    movu        m7, [r0]                    ; m7 = rec[x]
     movu        m2, [r0 + 1]                ; m2 = rec[x+1]
 
     pxor        m1, m7, m4
@@ -69,7 +74,7 @@
     pxor        m0, m0
     palignr     m0, m2, 15
     paddb       m2, m3
-    paddb       m2, [pb_2]                  ; m1 = uiEdgeType
+    paddb       m2, [pb_2]                  ; m2 = uiEdgeType
     pshufb      m3, m6, m2
     pmovzxbw    m2, m7                      ; rec
     punpckhbw   m7, m5
@@ -84,6 +89,43 @@
     add         r0q, 16
     sub         r2d, 16
     jnz        .loop
+
+    movzx       r3d, byte [r3]              ; r3d = signLeft[1] (the pointer in r3 is not needed afterwards)
+    neg         r3b                         ; r3b = -signLeft[1]
+    movd        m0, r3d                     ; re-seed m0 for the second row
+.loopH:                                     ; second row: r5 = current pixel pointer, r4d = pixels remaining
+    movu        m7, [r5]                    ; m7 = rec[x]
+    movu        m2, [r5 + 1]                ; m2 = rec[x+1]
+
+    pxor        m1, m7, m4                  ; flip bit 7 of both operands so the signed pcmpgtb orders unsigned pixels
+    pxor        m3, m2, m4
+    pcmpgtb     m2, m1, m3                  ; m2 = FF where rec[x] > rec[x+1]
+    pcmpgtb     m3, m1                      ; m3 = FF where rec[x+1] > rec[x]
+    pand        m2, [pb_1]                  ; FF -> 01 (assuming pb_1 is sixteen 0x01 bytes)
+    por         m2, m3                      ; m2 = signRight per pixel: 01, FF (-1) or 00
+
+    pslldq      m3, m2, 1                   ; byte i now holds signRight of pixel i-1, byte 0 is empty
+    por         m3, m0                      ; byte 0 = seed carried in m0
+
+    psignb      m3, m4                      ; m3 = signLeft
+    pxor        m0, m0
+    palignr     m0, m2, 15                  ; m0 byte 0 = signRight of pixel 15, the seed for the next 16 pixels
+    paddb       m2, m3
+    paddb       m2, [pb_2]                  ; m2 = uiEdgeType
+    pshufb      m3, m6, m2                  ; m3 = offsetEo[edgeType] for each of the 16 pixels
+    pmovzxbw    m2, m7                      ; rec
+    punpckhbw   m7, m5                      ; upper 8 pixels zero-extended to words
+    pmovsxbw    m1, m3                      ; offsetEo
+    punpckhbw   m3, m3
+    psraw       m3, 8                       ; upper 8 offsets sign-extended to words
+    paddw       m2, m1
+    paddw       m7, m3
+    packuswb    m2, m7                      ; saturate back to unsigned bytes
+    movu        [r5], m2
+
+    add         r5q, 16
+    sub         r4d, 16                     ; 16 pixels per iteration
+    jnz        .loopH                       ; exits only when the counter is exactly zero, so lcuWidth must be a positive multiple of 16
     RET
 
 ;==================================================================================================
diff -r ac85c775620f -r f718abdc8004 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/loopfilter.h	Wed Apr 01 19:05:36 2015 +0530
@@ -25,7 +25,7 @@
 #ifndef X265_LOOPFILTER_H
 #define X265_LOOPFILTER_H
 
-void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
 void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
diff -r ac85c775620f -r f718abdc8004 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/encoder/sao.cpp	Wed Apr 01 19:05:36 2015 +0530
@@ -258,7 +258,7 @@
     pixel* tmpL;
     pixel* tmpU;
 
-    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
+    int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft[2];
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
     memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
@@ -279,7 +279,7 @@
     {
     case SAO_EO_0: // dir: -
     {
-        pixel firstPxl = 0, lastPxl = 0;
+        pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
         startX = !lpelx;
         endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
         if (ctuWidth & 15)
@@ -301,25 +301,38 @@
         }
         else
         {
-            for (y = 0; y < ctuHeight; y++)
+            for (y = 0; y < ctuHeight; y += 2)
             {
-                int signLeft = signOf(rec[startX] - tmpL[y]);
-
-                if (!lpelx)
-                    firstPxl = rec[0];
-
-                if (rpelx == picWidth)
-                    lastPxl = rec[ctuWidth - 1];
-
-                primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, (int8_t)signLeft);
-
-                if (!lpelx)
-                    rec[0] = firstPxl;
-
-                if (rpelx == picWidth)
-                    rec[ctuWidth - 1] = lastPxl;
-
-                rec += stride;
+                signLeft[0] = signOf(rec[startX] - tmpL[y]);
+                signLeft[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
+
+                if (!lpelx)
+                {
+                    firstPxl = rec[0];
+                    row1FirstPxl = rec[stride];
+                }
+
+                if (rpelx == picWidth)
+                {
+                    lastPxl = rec[ctuWidth - 1];
+                    row1LastPxl = rec[stride + ctuWidth - 1];
+                }
+
+                primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft, stride);
+
+                if (!lpelx)
+                {
+                    rec[0] = firstPxl;
+                    rec[stride] = row1FirstPxl;
+                }
+
+                if (rpelx == picWidth)
+                {
+                    rec[ctuWidth - 1] = lastPxl;
+                    rec[stride + ctuWidth - 1] = row1LastPxl;
+                }
+
+                rec += 2 * stride;
             }
         }
         break;
diff -r ac85c775620f -r f718abdc8004 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Mar 31 20:04:28 2015 -0500
+++ b/source/test/pixelharness.cpp	Wed Apr 01 19:05:36 2015 +0530
@@ -908,12 +908,10 @@
     for (int i = 0; i < ITERS; i++)
     {
         int width = 16 * (rand() % 4 + 1);
-        int8_t sign = rand() % 3;
-        if (sign == 2)
-            sign = -1;
-
-        ref(ref_dest, psbuf1 + j, width, sign);
-        checked(opt, opt_dest, psbuf1 + j, width, sign);
+        int stride = width + 1;
+
+        ref(ref_dest, psbuf1 + j, width, psbuf2 + j, stride);
+        checked(opt, opt_dest, psbuf1 + j, width, psbuf5 + j, stride);
 
         if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
             return false;
@@ -2058,7 +2056,7 @@
     if (opt.saoCuOrgE0)
     {
         HEADER0("SAO_EO_0");
-        REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
+        REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, psbuf2, 64);
     }
 
     if (opt.saoCuOrgE1)


More information about the x265-devel mailing list