[x265] [PATCH 1 of 2] asm: new SSE4 primitive for saoStatsE3

Min Chen chenm003 at 163.com
Sat May 23 00:52:07 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1432332676 25200
# Node ID 3ce8e58d2d1cc58527be38bfe69ee42d82c3ccee
# Parent  234bc93bd51698801fad77cc861177ed019f5113
asm: new SSE4 primitive for saoStatsE3
---
 source/common/primitives.cpp         |    2 +
 source/common/primitives.h           |    5 +
 source/common/x86/asm-primitives.cpp |    2 +
 source/common/x86/loopfilter.h       |    1 +
 source/common/x86/pixel-util8.asm    |  146 ++++++++++++++++++++++++++++++++++
 source/encoder/sao.cpp               |   74 ++++++++++-------
 source/encoder/sao.h                 |    4 +-
 7 files changed, 203 insertions(+), 31 deletions(-)

diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/primitives.cpp
--- a/source/common/primitives.cpp	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/primitives.cpp	Fri May 22 15:11:16 2015 -0700
@@ -56,6 +56,7 @@
 void setupFilterPrimitives_c(EncoderPrimitives &p);
 void setupIntraPrimitives_c(EncoderPrimitives &p);
 void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
+void setupSaoPrimitives_c(EncoderPrimitives &p);
 
 void setupCPrimitives(EncoderPrimitives &p)
 {
@@ -64,6 +65,7 @@
     setupFilterPrimitives_c(p);     // ipfilter.cpp
     setupIntraPrimitives_c(p);      // intrapred.cpp
     setupLoopFilterPrimitives_c(p); // loopfilter.cpp
+    setupSaoPrimitives_c(p);        // sao.cpp
 }
 
 void setupAliasPrimitives(EncoderPrimitives &p)
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/primitives.h
--- a/source/common/primitives.h	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/primitives.h	Fri May 22 15:11:16 2015 -0700
@@ -173,6 +173,9 @@
 typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+
+typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -289,6 +292,8 @@
     saoCuOrgE3_t          saoCuOrgE3[2];
     saoCuOrgB0_t          saoCuOrgB0;
 
+    saoCuStatsE3_t        saoCuStatsE3;
+
     downscale_t           frameInitLowres;
     cutree_propagate_cost propagateCost;
 
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Fri May 22 15:11:16 2015 -0700
@@ -1797,6 +1797,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = x265_filterPixelToShort_6x16_sse4;
 
 #if X86_64
+        p.saoCuStatsE3 = x265_saoCuStatsE3_sse4;
+
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
         ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
 #endif
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/loopfilter.h	Fri May 22 15:11:16 2015 -0700
@@ -39,6 +39,7 @@
 void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+void x265_saoCuStatsE3_sse4(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Fri May 22 15:11:16 2015 -0700
@@ -53,6 +53,7 @@
 cextern pw_1
 cextern pw_0_15
 cextern pb_1
+cextern pb_128
 cextern pw_00ff
 cextern pw_1023
 cextern pw_3fff
@@ -6051,3 +6052,148 @@
     shl         r1d, 16
     or          eax, r1d
     RET
+
+
+;void saoCuStatsE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+;{
+;    memset(tmp_stats, 0, sizeof(tmp_stats));
+;    memset(tmp_count, 0, sizeof(tmp_count));
+;    for (y = 0; y < endY; y++)
+;    {
+;        for (x = 0; x < endX; x++)
+;        {
+;            int signDown = signOf2(rec[x], rec[x + stride - 1]);
+;            uint32_t edgeType = signDown + upBuff1[x] + 2;
+;            upBuff1[x - 1] = (int8_t)(-signDown);
+;            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+;            tmp_count[edgeType]++;
+;        }
+;        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+;        rec += stride;
+;        fenc += stride;
+;    }
+;    for (x = 0; x < NUM_EDGETYPE; x++)
+;    {
+;        stats[s_eoTable[x]] += tmp_stats[x];
+;        count[s_eoTable[x]] += tmp_count[x];
+;    }
+;}
+
+%if ARCH_X86_64
+; TODO: x64 only because this needs temporary registers r7 and r8; it should be easy to port to x86
+INIT_XMM sse4
+cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 word counters at [rsp], then 5 dword stats at [rsp + 5 * 2]
+    mov         r4d, r4m                ; r4d = endX
+    mov         r5d, r5m                ; r5d = endY
+
+    ; clear the 32-byte temporary count/stats buffer on the stack
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m0, [pb_128]            ; XORed into the pixels before pcmpgtb (presumably 0x80 per byte so the signed compare orders unsigned pixels -- confirm)
+    mova        m5, [pb_1]
+    mova        m6, [pb_2]
+    movh        m7, [r3 + r4]           ; save the 8 bytes at upBuff1[endX]; they are written back after the last row
+
+.loopH:
+    mov         r6d, r4d                ; r6d = pixels still to process in this row
+
+.loopW:
+    movu        m1, [r1]                ; 16 bytes starting at rec[x]
+    movu        m2, [r1 + r2 - 1]       ; 16 bytes starting at rec[x + stride - 1]
+
+    ; signDown per byte ends up in m2, its negation in m3
+    pxor        m1, m0
+    pxor        m2, m0
+    pcmpgtb     m3, m1, m2
+    pand        m3, m5
+    pcmpgtb     m2, m1
+    por         m2, m3
+    pxor        m3, m3
+    psubb       m3, m2
+
+    ; edgeType = signDown + upBuff1[x] + 2 (result left in m2)
+    movu        m4, [r3]
+    paddb       m4, m6
+    paddb       m2, m4
+
+    ; update upBuff1[x - 1] with -signDown (always a full 16-byte store)
+    movu        [r3 - 1], m3
+
+    ; per-pixel value added to stats[edgeType]: low 8 pixels in m3, high 8 pixels in m4
+    pxor        m1, m0                  ; undo the pb_128 XOR to get the original rec bytes back
+    movu        m3, [r0]
+    punpckhbw   m4, m3, m1
+    punpcklbw   m3, m1
+    pmaddubsw   m3, [hmul_16p + 16]     ; presumably +1/-1 byte pairs, giving fenc - rec as words -- confirm against the table
+    pmaddubsw   m4, [hmul_16p + 16]
+
+    ; up to 16 pixels, unrolled; jumps to .next as soon as the row's pixel counter reaches zero
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+    inc    word [rsp + r7 * 2]
+
+  %if (x < 8)
+    pextrw      r8d, m3, (x % 8)
+  %else
+    pextrw      r8d, m4, (x % 8)
+  %endif
+    movsx       r8d, r8w
+    add         [rsp + 5 * 2 + r7 * 4], r8d
+
+    dec         r6d
+    jz         .next
+%assign x x+1
+%endrep
+
+    add         r0, 16
+    add         r1, 16
+    add         r3, 16
+    jmp         .loopW
+
+.next:
+    ; restore pointer upBuff1: r6 = (endX & 15) - endX. NOTE(review): when endX is a multiple of 16 the jz above is taken before the last "add ..., 16", so the pointers only advanced by endX - 16 -- confirm callers never pass such an endX
+    mov         r6d, r4d
+    and         r6d, 15
+
+    ; move to next row (fenc and rec additionally advance by stride)
+    sub         r6, r4
+    add         r3, r6
+    add         r6, r2
+    add         r0, r6
+    add         r1, r6
+    dec         r5d
+    jg         .loopH
+
+    ; restore the 8 bytes at upBuff1[endX] saved on entry (the 16-byte stores above can write past endX)
+    movh        [r3 + r4], m7
+
+    ; add the temporary totals into the caller's buffers
+    mov         r1, r6m                 ; r1 = stats
+    mov         r0, r7m                 ; r0 = count
+
+    ; s_eoTable = {1,2,0,3,4}
+    movzx       r6d, word [rsp + 0 * 2] ; count[s_eoTable[x]] += tmp_count[x]
+    add         [r0 + 1 * 4], r6d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r6d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r6d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r6d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r6d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4] ; stats[s_eoTable[x]] += tmp_stats[x]
+    add         [r1 + 1 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif ; ARCH_X86_64
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Thu May 21 16:34:48 2015 +0530
+++ b/source/encoder/sao.cpp	Fri May 22 15:11:16 2015 -0700
@@ -57,7 +57,6 @@
 {
     return (count * offset - offsetOrg * 2) * offset;
 }
-
 } // end anonymous namespace
 
 
@@ -925,35 +924,7 @@
 
             primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
 
-            memset(tmp_stats, 0, sizeof(tmp_stats));
-            memset(tmp_count, 0, sizeof(tmp_count));
-
-            for (y = startY; y < endY; y++)
-            {
-                for (x = startX; x < endX; x++)
-                {
-                    int signDown = signOf2(rec[x], rec[x + stride - 1]);
-                    X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
-
-                    uint32_t edgeType = signDown + upBuff1[x] + 2;
-                    upBuff1[x - 1] = (int8_t)(-signDown);
-                    tmp_stats[edgeType] += (fenc[x] - rec[x]);
-                    tmp_count[edgeType]++;
-                }
-
-                upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
-
-                rec += stride;
-                fenc += stride;
-            }
-
-            stats = m_offsetOrg[plane][SAO_EO_3];
-            count = m_count[plane][SAO_EO_3];
-            for (x = 0; x < NUM_EDGETYPE; x++)
-            {
-                stats[s_eoTable[x]] += tmp_stats[x];
-                count[s_eoTable[x]] += tmp_count[x];
-            }
+            primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
         }
     }
 }
@@ -1669,4 +1640,47 @@
         }
     }
 }
+
+// C implementation of the saoCuStatsE3 primitive: over an endX x endY block it sums (fenc - rec) and counts pixels per edge type, then adds the totals into stats[]/count[]. NOTE: must stay inside namespace x265 since it needs class SAO
+void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+{
+    int x, y;
+    int32_t tmp_stats[SAO::NUM_EDGETYPE]; // per-edge-type sum of (fenc - rec) for this call only
+    int32_t tmp_count[SAO::NUM_EDGETYPE]; // per-edge-type pixel count for this call only
+
+    memset(tmp_stats, 0, sizeof(tmp_stats));
+    memset(tmp_count, 0, sizeof(tmp_count));
+
+    for (y = 0; y < endY; y++)
+    {
+        for (x = 0; x < endX; x++)
+        {
+            int signDown = signOf2(rec[x], rec[x + stride - 1]); // compares rec[x] with the pixel one row down and one column to the left
+            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
+
+            uint32_t edgeType = signDown + upBuff1[x] + 2; // assumes upBuff1[x] is in [-1, 1] so the index stays within the tmp arrays -- TODO confirm against caller
+            upBuff1[x - 1] = (int8_t)(-signDown); // read back as upBuff1[x] by the next row; writes upBuff1[-1] when x == 0, so that byte must be valid
+            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+            tmp_count[edgeType]++;
+        }
+
+        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]); // the inner loop never writes index endX - 1, so fill it here (reads rec[endX])
+
+        rec += stride;
+        fenc += stride;
+    }
+
+    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+    {
+        stats[SAO::s_eoTable[x]] += tmp_stats[x]; // accumulated into (not stored over) the caller's totals, at the index given by s_eoTable
+        count[SAO::s_eoTable[x]] += tmp_count[x];
+    }
 }
+
+void setupSaoPrimitives_c(EncoderPrimitives &p)
+{
+    // Installs the C SAO primitives into p. TODO: move the other SAO functions here as well
+    p.saoCuStatsE3 = saoCuStatsE3_c;
+}
+}
+
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/encoder/sao.h
--- a/source/encoder/sao.h	Thu May 21 16:34:48 2015 +0530
+++ b/source/encoder/sao.h	Fri May 22 15:11:16 2015 -0700
@@ -52,7 +52,7 @@
 
 class SAO
 {
-protected:
+public:
 
     enum { SAO_MAX_DEPTH = 4 };
     enum { SAO_BO_BITS  = 5 };
@@ -68,6 +68,8 @@
     typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
     typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
 
+protected:
+
     /* allocated per part */
     PerClass*   m_count;
     PerClass*   m_offset;



More information about the x265-devel mailing list