[x265] [PATCH] sao: modify saoCuOrgE3_2Rows C code and add sse4 code

Divya Manivannan divya at multicorewareinc.com
Mon Apr 20 15:32:01 CEST 2015


# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429536293 -19800
#      Mon Apr 20 18:54:53 2015 +0530
# Node ID b0aff8e0b995bd0e507825be0796e18694b60f1f
# Parent  5c3443546cccea47316d59dbc4f892e1b6f8b1b5
sao: modify saoCuOrgE3_2Rows C code and add sse4 code

Testbench result (speedup over C, optimized time, C reference time):
SAO_EO_3_2Rows  9.52x    1042.79         9930.47

diff -r 5c3443546ccc -r b0aff8e0b995 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/loopfilter.cpp	Mon Apr 20 18:54:53 2015 +0530
@@ -122,25 +122,21 @@
     }
 }
 
-void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* signDown)
+void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff) // applies offsetEo to columns startX+1 .. endX-1 of two rows; upBuff[y] supplies upBuff1[endX - 1] for row y
 {
-    int8_t signDown1;
+    int8_t signDown; // sign of rec[x] - rec[x + stride] for the pixel being processed
     int8_t edgeType;
 
     for (int y = 0; y < 2; y++)
     {
-        edgeType = signDown[y] + upBuff1[startX] + 2;
-        upBuff1[startX - 1] = -signDown[y];
-        rec[startX] = x265_clip(rec[startX] + offsetEo[edgeType]);
-
         for (int x = startX + 1; x < endX; x++)
         {
-            signDown1 = signOf(rec[x] - rec[x + stride]);
-            edgeType = signDown1 + upBuff1[x] + 2;
-            upBuff1[x - 1] = -signDown1;
+            signDown = signOf(rec[x] - rec[x + stride]); // compare with the pixel at offset `stride` (read before rec[x] is modified)
+            edgeType = signDown + upBuff1[x] + 2; // index into offsetEo
+            upBuff1[x - 1] = -signDown; // stored negated, one column to the left, for the next row
             rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
         }
-        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride + 1] - rec[endX]);
+        upBuff1[endX - 1] = upBuff[y]; // last entry is no longer computed here; the caller passes it in
         rec += stride + 1;
     }
 }
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/primitives.h
--- a/source/common/primitives.h	Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/primitives.h	Mon Apr 20 18:54:53 2015 +0530
@@ -172,7 +172,7 @@
 typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
-typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* signDown);
+typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Mon Apr 20 18:54:53 2015 +0530
@@ -1507,6 +1507,7 @@
         p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
         p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
         p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
+        p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
         p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
 
         LUMA_ADDAVG(sse4);
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/x86/loopfilter.asm	Mon Apr 20 18:54:53 2015 +0530
@@ -582,6 +582,135 @@
     movhps          [r1 + r5 - 1], xm7
     RET
 
+;=============================================================================================================================
+;void saoCuOrgE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
+;=============================================================================================================================
+INIT_XMM sse4                                   ; SSE4 version: edge-offset update of two consecutive rows, 16 pixels per loop iteration
+cglobal saoCuOrgE3_2Rows, 3, 7, 8               ; x86inc: 3 args preloaded (r0 = rec, r1 = upBuff1, r2 = m_offsetEo), 7 GPRs, 8 XMM regs
+    mov             r3d, r3m                    ; r3d = stride (only the low 32 bits of the argument are read)
+    mov             r4d, r4m                    ; r4d = startX
+    movu            m5, [r2]                    ; m5 = 16 bytes loaded from m_offsetEo, used as the pshufb lookup table
+    mov             r2d, r5m                    ; r2d = endX (r2 is reused now that the table lives in m5)
+    mov             r6,  r6m                    ; r6 = upBuff
+
+    movh            m7, [r0 + r2]               ; m7[low]  = rec[endX .. endX + 7], saved so it can be restored after the loop
+    movhps          m7, [r1 + r2 - 1]           ; m7[high] = upBuff1[endX - 1 .. endX + 6], saved for the same reason
+
+    inc             r4d                         ; r4d = startX + 1, the first column handled here
+    add             r0, r4                      ; r0 = &rec[startX + 1]
+    add             r1, r4                      ; r1 = &upBuff1[startX + 1]
+
+    sub             r2d, r4d                    ; r2d = endX - (startX + 1) = number of columns to process
+    pxor            m0, m0                      ; m0 = 0
+    mova            m6, [pb_2]                  ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+
+.loop:                                          ; first row; NOTE(review): body always runs at least once, even if r2d <= 0 - confirm callers
+    movu            m1, [r0]                    ; m1 = rec[x]
+    movu            m2, [r0 + r3]               ; m2 = rec[x + stride]
+
+    psubusb         m3, m2, m1                  ; m3 = saturating (m2 - m1): 0 where rec[x] >= rec[x + stride]
+    psubusb         m4, m1, m2                  ; m4 = saturating (m1 - m2): 0 where rec[x] <= rec[x + stride]
+    pcmpeqb         m3, m0                      ; m3 = 0xFF where rec[x] >= rec[x + stride]
+    pcmpeqb         m4, m0                      ; m4 = 0xFF where rec[x] <= rec[x + stride]
+    pcmpeqb         m2, m1                      ; m2 = 0xFF where rec[x] == rec[x + stride]
+
+    pabsb           m3, m3                      ; m3 = 1 where rec[x] >= rec[x + stride] (0xFF -> 1)
+    por             m4, m3                      ; m4 = -1 where rec[x] <= rec[x + stride], +1 where greater
+    pandn           m2, m4                      ; m2 = signDown: equal lanes cleared to 0, otherwise -1 / +1
+
+    movu            m3, [r1]                    ; m3 = upBuff1[x]
+
+    paddb           m3, m2                      ; m3 = signDown + upBuff1[x]
+    paddb           m3, m6                      ; m3 = edgeType = signDown + upBuff1[x] + 2
+
+    pshufb          m4, m5, m3                  ; m4 = m_offsetEo[edgeType], byte-wise table lookup
+
+    psubb           m3, m0, m2                  ; m3 = -signDown
+    movu            [r1 - 1], m3                ; upBuff1[x - 1] = -signDown
+
+    pmovzxbw        m2, m1                      ; m2 = low 8 pixels zero-extended to words
+    punpckhbw       m1, m0                      ; m1 = high 8 pixels zero-extended to words
+    pmovsxbw        m3, m4                      ; m3 = low 8 offsets sign-extended to words
+    punpckhbw       m4, m4                      ; high 8 offsets duplicated into both bytes of each word
+    psraw           m4, 8                       ; arithmetic shift -> high 8 offsets sign-extended to words
+
+    paddw           m2, m3                      ; low half:  pixel + offset in 16 bits
+    paddw           m1, m4                      ; high half: pixel + offset in 16 bits
+    packuswb        m2, m1                      ; pack back to bytes with unsigned saturation (clip to 0..255)
+    movu            [r0], m2                    ; rec[x .. x + 15] = result
+
+    add             r0, 16                      ; advance rec pointer by 16 pixels
+    add             r1, 16                      ; advance upBuff1 pointer by 16 entries
+    sub             r2, 16                      ; 16 fewer columns remaining
+    jg              .loop                       ; continue while columns remain
+
+    add             r0, r2                      ; r2 <= 0 here: step back over the overshoot, r0 = &rec[endX]
+    add             r1, r2                      ; r1 = &upBuff1[endX]
+    movh            [r0], m7                    ; restore the 8 saved bytes rec[endX .. endX + 7]; NOTE(review): only 8 bytes are restored - confirm overshoot bound
+    movhps          [r1 - 1], m7                ; restore the 8 saved bytes upBuff1[endX - 1 .. endX + 6]
+
+    mov             r5d, r5m                    ; r5d = endX (reloaded from the argument)
+    mov             r2b, byte[r6]               ; r2b = upBuff[0]
+    mov             byte[r1 - 1], r2b           ; upBuff1[endX - 1] = upBuff[0]
+
+    sub             r0, r5                      ; r0 = rec (start of the first row)
+    lea             r0, [r0 + r3 + 1]           ; r0 = rec + stride + 1 (second row, matches "rec += stride + 1" in the C code)
+
+    movh            m7, [r0 + r5]               ; m7[low]  = 8 bytes at column endX of the second row, saved for restore
+    movhps          m7, [r1 - 1]                ; m7[high] = upBuff1[endX - 1 .. endX + 6], saved for restore
+
+    sub             r1, r5                      ; r1 = upBuff1
+    add             r0, r4                      ; r0 = second-row pointer at column startX + 1
+    add             r1, r4                      ; r1 = &upBuff1[startX + 1]
+    sub             r5d, r4d                    ; r5d = endX - (startX + 1) = number of columns to process
+
+.loop1:                                         ; second row; same body as .loop with r5 as the column counter
+    movu            m1, [r0]                    ; m1 = rec[x]
+    movu            m2, [r0 + r3]               ; m2 = rec[x + stride]
+
+    psubusb         m3, m2, m1                  ; m3 = saturating (m2 - m1): 0 where rec[x] >= rec[x + stride]
+    psubusb         m4, m1, m2                  ; m4 = saturating (m1 - m2): 0 where rec[x] <= rec[x + stride]
+    pcmpeqb         m3, m0                      ; m3 = 0xFF where rec[x] >= rec[x + stride]
+    pcmpeqb         m4, m0                      ; m4 = 0xFF where rec[x] <= rec[x + stride]
+    pcmpeqb         m2, m1                      ; m2 = 0xFF where rec[x] == rec[x + stride]
+
+    pabsb           m3, m3                      ; m3 = 1 where rec[x] >= rec[x + stride] (0xFF -> 1)
+    por             m4, m3                      ; m4 = -1 where rec[x] <= rec[x + stride], +1 where greater
+    pandn           m2, m4                      ; m2 = signDown: equal lanes cleared to 0, otherwise -1 / +1
+
+    movu            m3, [r1]                    ; m3 = upBuff1[x]
+
+    paddb           m3, m2                      ; m3 = signDown + upBuff1[x]
+    paddb           m3, m6                      ; m3 = edgeType = signDown + upBuff1[x] + 2
+
+    pshufb          m4, m5, m3                  ; m4 = m_offsetEo[edgeType], byte-wise table lookup
+
+    psubb           m3, m0, m2                  ; m3 = -signDown
+    movu            [r1 - 1], m3                ; upBuff1[x - 1] = -signDown
+
+    pmovzxbw        m2, m1                      ; m2 = low 8 pixels zero-extended to words
+    punpckhbw       m1, m0                      ; m1 = high 8 pixels zero-extended to words
+    pmovsxbw        m3, m4                      ; m3 = low 8 offsets sign-extended to words
+    punpckhbw       m4, m4                      ; high 8 offsets duplicated into both bytes of each word
+    psraw           m4, 8                       ; arithmetic shift -> high 8 offsets sign-extended to words
+
+    paddw           m2, m3                      ; low half:  pixel + offset in 16 bits
+    paddw           m1, m4                      ; high half: pixel + offset in 16 bits
+    packuswb        m2, m1                      ; pack back to bytes with unsigned saturation (clip to 0..255)
+    movu            [r0], m2                    ; rec[x .. x + 15] = result
+
+    add             r0, 16                      ; advance rec pointer by 16 pixels
+    add             r1, 16                      ; advance upBuff1 pointer by 16 entries
+    sub             r5, 16                      ; 16 fewer columns remaining
+    jg              .loop1                      ; continue while columns remain
+
+    movh            [r0 + r5], m7               ; r5 <= 0 here: restore the 8 saved second-row bytes starting at column endX
+    movhps          [r1 + r5 - 1], m7           ; restore the 8 saved bytes upBuff1[endX - 1 .. endX + 6]
+
+    mov             r2b, byte[r6 + 1]           ; r2b = upBuff[1]
+    mov             byte[r1 + r5 - 1], r2b      ; upBuff1[endX - 1] = upBuff[1]
+    RET
+
 ;=====================================================================================
 ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
 ;=====================================================================================
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/x86/loopfilter.h	Mon Apr 20 18:54:53 2015 +0530
@@ -34,6 +34,7 @@
 void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgE3_2Rows_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 5c3443546ccc -r b0aff8e0b995 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Sat Apr 18 10:02:19 2015 -0700
+++ b/source/encoder/sao.cpp	Mon Apr 20 18:54:53 2015 +0530
@@ -517,13 +517,26 @@
                 upBuff1[ctuWidth - 1] = lastSign;
 
             int diff = endY - startY;
-            for (y = 0; y < diff / 2; y++)
+            for (y = 0; y < (diff >> 1); y++)
             {
-                int8_t signDown[2];
-                signDown[0] = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
-                signDown[1] = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
+                int8_t signDown, signDown0, upBuff[2];
+                int edgeType1;
 
-                primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, signDown);
+                signDown = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
+                edgeType1 = signDown + upBuff1[startX] + 2;
+                rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType1]];
+
+                signDown = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
+                signDown0 = signOf(rec[startX + 1] - rec[startX + stride]);
+                edgeType1 = signDown - signDown0 + 2;
+                upBuff1[startX - 1] = -signDown;
+
+                upBuff[0] = signOf(rec[endX - 1 + stride] - rec[endX]);
+                upBuff[1] = signOf(rec[endX - 1 + 2 * stride] - rec[endX + stride]);
+
+                primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, upBuff);
+
+                rec[startX + stride] = m_clipTable[rec[startX + stride] + m_offsetEo[edgeType1]];
 
                 rec += 2 * stride;
             }


More information about the x265-devel mailing list