[x265] [PATCH] asm: new SSE4 primitive on saoStatsE2

Min Chen chenm003 at 163.com
Wed May 27 01:34:28 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1432682219 25200
# Node ID c75ee6c1f0834505a6fd07a1ac9d487f798a44a0
# Parent  8ddc790790a46de9ceadea388f6271acdb3012ed
asm: new SSE4 primitive on saoStatsE2
---
 source/common/primitives.h           |    2 +
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/loopfilter.h       |    1 +
 source/common/x86/pixel-util8.asm    |  167 +++++++++++++++++++++++++++++++++-
 source/encoder/sao.cpp               |   69 ++++++++------
 source/test/pixelharness.cpp         |   74 +++++++++++++++
 source/test/pixelharness.h           |    1 +
 7 files changed, 285 insertions(+), 30 deletions(-)

diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/primitives.h
--- a/source/common/primitives.h	Tue May 26 10:33:56 2015 +0530
+++ b/source/common/primitives.h	Tue May 26 16:16:59 2015 -0700
@@ -174,6 +174,7 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
+typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); // upBufft: second sign row, swapped with upBuff1 after each row
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
@@ -292,6 +293,7 @@
     saoCuOrgE3_t          saoCuOrgE3[2];
     saoCuOrgB0_t          saoCuOrgB0;
 
+    saoCuStatsE2_t        saoCuStatsE2;
     saoCuStatsE3_t        saoCuStatsE3;
 
     downscale_t           frameInitLowres;
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue May 26 16:16:59 2015 -0700
@@ -1871,6 +1871,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = x265_filterPixelToShort_6x16_sse4;
 
 #if X86_64
+        p.saoCuStatsE2 = x265_saoCuStatsE2_sse4;
         p.saoCuStatsE3 = x265_saoCuStatsE3_sse4;
 
         ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/loopfilter.h	Tue May 26 16:16:59 2015 -0700
@@ -39,6 +39,7 @@
 void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
 void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+void x265_saoCuStatsE2_sse4(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count);
 void x265_saoCuStatsE3_sse4(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue May 26 16:16:59 2015 -0700
@@ -6054,6 +6054,172 @@
     RET
 
 
+;void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
+;{
+;    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
+;    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
+;    int x, y;
+;    int32_t tmp_stats[SAO::NUM_EDGETYPE];
+;    int32_t tmp_count[SAO::NUM_EDGETYPE];
+;    memset(tmp_stats, 0, sizeof(tmp_stats));
+;    memset(tmp_count, 0, sizeof(tmp_count));
+;    for (y = 0; y < endY; y++)
+;    {
+;        upBufft[0] = signOf(rec[stride] - rec[-1]);
+;        for (x = 0; x < endX; x++)
+;        {
+;            int signDown = signOf2(rec[x], rec[x + stride + 1]);
+;            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
+;            uint32_t edgeType = signDown + upBuff1[x] + 2;
+;            upBufft[x + 1] = (int8_t)(-signDown);
+;            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+;            tmp_count[edgeType]++;
+;        }
+;        std::swap(upBuff1, upBufft);
+;        rec += stride;
+;        fenc += stride;
+;    }
+;    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+;    {
+;        stats[SAO::s_eoTable[x]] += tmp_stats[x];
+;        count[SAO::s_eoTable[x]] += tmp_count[x];
+;    }
+;}
+
+%if ARCH_X86_64
+; TODO: x64 only because temporary registers r7 and r8 are needed; should be easy to port to x86
+INIT_XMM sse4
+cglobal saoCuStatsE2, 5,9,8,0-32    ; Stack: 5 x 16-bit count at [rsp], then 5 x 32-bit stats at [rsp + 5 * 2]
+    mov         r5d, r5m                ; r5 = endX (r0 = fenc, r1 = rec, r2 = stride, r3 = upBuff1, r4 = upBufft)
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m0, [pb_128]            ; XOR bias so the signed pcmpgtb orders pixels as unsigned (pb_128 assumed 16 x 0x80)
+    mova        m5, [pb_1]
+    mova        m6, [pb_2]
+
+.loopH:
+    ; TODO: merge into below SIMD
+    ; upBufft[0] = signOf(rec[stride] - rec[-1])
+    mov         r6b, [r1 + r2]
+    sub         r6b, [r1 -  1]          ; only the flags of this unsigned subtract are used
+    seta        r6b                     ; 1 when rec[stride] > rec[-1]
+    setb        r7b                     ; 1 when rec[stride] < rec[-1]
+    sub         r6b, r7b                ; r6b = +1 / 0 / -1
+    mov         [r4], r6b
+
+    ; back up 8 bytes starting at upBufft[endX + 1]; the 16-byte stores below may overwrite them
+    movh        m7, [r4 + r5 + 1]
+
+    mov         r6d, r5d                ; r6d = pixels left in this row
+.loopW:
+    movu        m1, [r1]                ; rec[x .. x + 15]
+    movu        m2, [r1 + r2 + 1]       ; rec[x + stride + 1 ..], the down-right neighbours
+
+    ; signDown
+    pxor        m1, m0
+    pxor        m2, m0
+    pcmpgtb     m3, m1, m2
+    pand        m3, m5                  ; +1 where rec[x] > down-right
+    pcmpgtb     m2, m1                  ; -1 where rec[x] < down-right
+    por         m2, m3                  ; m2 = signDown
+    pxor        m3, m3
+    psubb       m3, m2                  ; m3 = -signDown
+
+    ; edgeType
+    movu        m4, [r3]
+    paddb       m4, m6
+    paddb       m2, m4                  ; m2 = signDown + upBuff1[x] + 2
+
+    ; upBufft[x + 1] = -signDown (always a full 16-byte store, see the backup/restore of m7)
+    movu        [r4 + 1], m3
+
+    ; stats[edgeType]
+    pxor        m1, m0                  ; remove the bias again: m1 = rec
+    movu        m3, [r0]
+    punpckhbw   m4, m3, m1              ; interleave fenc/rec bytes
+    punpcklbw   m3, m1
+    pmaddubsw   m3, [hmul_16p + 16]     ; presumably +1/-1 weights, giving fenc - rec as words - TODO confirm constant
+    pmaddubsw   m4, [hmul_16p + 16]
+
+    ; 16 pixels
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+    inc    word [rsp + r7 * 2]
+
+  %if (x < 8)
+    pextrw      r8d, m3, (x % 8)
+  %else
+    pextrw      r8d, m4, (x % 8)
+  %endif
+    movsx       r8d, r8w
+    add         [rsp + 5 * 2 + r7 * 4], r8d
+
+    dec         r6d
+    jz         .next
+%assign x x+1
+%endrep
+
+    add         r0, 16
+    add         r1, 16
+    add         r3, 16
+    add         r4, 16
+    jmp         .loopW
+
+.next:
+    xchg        r3, r4                  ; the row just written becomes upBuff1 for the next row
+
+    ; undo the .loopW pointer advance, which is (endX - 1) & ~15 bytes (also correct when endX % 16 == 0)
+    lea         r6d, [r5 - 1]
+    and         r6d, ~15
+
+    ; move to next row
+    neg         r6                      ; must be a 64-bit negate: r6 = -advance
+    add         r3, r6
+    add         r4, r6
+    add         r6, r2                  ; r6 = stride - advance
+    add         r0, r6
+    add         r1, r6
+
+    ; restore the 8 backed-up bytes (same address as the backup, now reached through r3)
+    movh        [r3 + r5 + 1], m7
+
+    dec    byte r6m                     ; endY is counted down in place in its argument slot
+    jg         .loopH
+
+    ; sum to global buffer
+    mov         r1, r7m                 ; r1 = stats
+    mov         r0, r8m                 ; r0 = count
+
+    ; s_eoTable = {1,2,0,3,4}
+    movzx       r6d, word [rsp + 0 * 2]
+    add         [r0 + 1 * 4], r6d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r6d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r6d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r6d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r6d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]
+    add         [r1 + 1 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif ; ARCH_X86_64
+
+
 ;void saoStatE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 ;{
 ;    memset(tmp_stats, 0, sizeof(tmp_stats));
@@ -6080,7 +6246,6 @@
 ;}
 
 %if ARCH_X86_64
-; TODO: x64 only because I need temporary register r7,r8, easy portab to x86
 INIT_XMM sse4
 cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 of stats and 5 of count
     mov         r4d, r4m
diff -r 8ddc790790a4 -r c75ee6c1f083 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue May 26 10:33:56 2015 +0530
+++ b/source/encoder/sao.cpp	Tue May 26 16:16:59 2015 -0700
@@ -866,35 +866,7 @@
 
             primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
 
-            memset(tmp_stats, 0, sizeof(tmp_stats));
-            memset(tmp_count, 0, sizeof(tmp_count));
-
-            for (y = startY; y < endY; y++)
-            {
-                upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
-                for (x = startX; x < endX; x++)
-                {
-                    int signDown = signOf2(rec[x], rec[x + stride + 1]);
-                    X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
-                    uint32_t edgeType = signDown + upBuff1[x] + 2;
-                    upBufft[x + 1] = (int8_t)(-signDown);
-                    tmp_stats[edgeType] += (fenc[x] - rec[x]);
-                    tmp_count[edgeType]++;
-                }
-
-                std::swap(upBuff1, upBufft);
-
-                rec += stride;
-                fenc += stride;
-            }
-
-            stats = m_offsetOrg[plane][SAO_EO_2];
-            count = m_count[plane][SAO_EO_2];
-            for (x = 0; x < NUM_EDGETYPE; x++)
-            {
-                stats[s_eoTable[x]] += tmp_stats[x];
-                count[s_eoTable[x]] += tmp_count[x];
-            }
+            primitives.saoCuStatsE2(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
         }
 
         // SAO_EO_3: // dir: 45
@@ -1642,6 +1614,44 @@
 }
 
 // NOTE: must put in namespace x265 since we need class SAO
+void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
+{
+    // Accumulates, per edge type, sum(fenc - rec) and the pixel count over an endX x endY region (up-left/down-right diagonal).
+    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
+    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
+
+    // local accumulators indexed by raw edge type; remapped through s_eoTable on exit
+    int32_t diffSum[SAO::NUM_EDGETYPE] = { 0 };
+    int32_t hitCount[SAO::NUM_EDGETYPE] = { 0 };
+
+    for (int row = 0; row < endY; row++, rec += stride, fenc += stride)
+    {
+        // entry 0 of the next row's sign buffer: next-row pixel 0 against the pixel left of the current row
+        upBufft[0] = signOf(rec[stride] - rec[-1]);
+
+        for (int x = 0; x < endX; x++)
+        {
+            const int signDown = signOf2(rec[x], rec[x + stride + 1]);
+            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
+            // upBuff1[x] is the sign stored for the up-left neighbour; + 2 makes the index non-negative for signs in {-1, 0, 1}
+            const uint32_t edgeType = signDown + upBuff1[x] + 2;
+            upBufft[x + 1] = (int8_t)(-signDown); // next-row pixel x + 1 sees the opposite sign towards its up-left neighbour
+            diffSum[edgeType] += (fenc[x] - rec[x]);
+            hitCount[edgeType]++;
+        }
+
+        // the signs written for this row are the "up" signs of the next row
+        std::swap(upBuff1, upBufft);
+    }
+
+    // add the local totals to the caller's tables through the edge-type remapping
+    for (int e = 0; e < SAO::NUM_EDGETYPE; e++)
+    {
+        stats[SAO::s_eoTable[e]] += diffSum[e];
+        count[SAO::s_eoTable[e]] += hitCount[e];
+    }
+}
+
 void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
 {
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
@@ -1684,6 +1694,7 @@
 void setupSaoPrimitives_c(EncoderPrimitives &p)
 {
     // TODO: move other sao functions to here
+    p.saoCuStatsE2 = saoCuStatsE2_c;
     p.saoCuStatsE3 = saoCuStatsE3_c;
 }
 }
diff -r 8ddc790790a4 -r c75ee6c1f083 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue May 26 10:33:56 2015 +0530
+++ b/source/test/pixelharness.cpp	Tue May 26 16:16:59 2015 -0700
@@ -1016,6 +1016,60 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
+{   // Runs ref and opt on identical randomized inputs; returns false on the first mismatch, true otherwise.
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1; // one spare byte on each side; the primitive gets a pointer to element 1
+    int8_t _upBufft_ref[MAX_CU_SIZE + 2], *upBufft_ref = _upBufft_ref + 1;
+    int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1;
+    int8_t _upBufft_vec[MAX_CU_SIZE + 2], *upBufft_vec = _upBufft_vec + 1; // NOTE(review): the SSE4 code saves/restores 8 bytes at upBufft[endX + 1], which can pass the end of these arrays - confirm padding
+
+    int j = 0; // NOTE(review): advanced by INCR below but never used to offset pbuf2/pbuf3, so every iteration reads the same pixels - confirm intent
+
+    // NOTE: run 10x the usual iterations; the asm is not an exact match to C in how it writes upBuff* (16-byte stores plus a restore)
+    for (int i = 0; i < ITERS * 10; i++)
+    {
+        // initialize the accumulators to random values (the dynamic range is unrealistic but good enough to verify the asm adds into them)
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        // initial signs: random values in {-1, 0, 1}, identical for the ref and vec copies
+        for (int x = 0; x < MAX_CU_SIZE + 2; x++)
+        {
+            _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1;
+            _upBufft_ref[x] = _upBufft_vec[x] = (rand() % 3) - 1;
+        }
+
+        intptr_t stride = 16 * (rand() % 4 + 1);      // 16, 32, 48 or 64
+        int endX = MAX_CU_SIZE - (rand() % 5) - 1;   // MAX_CU_SIZE - 5 .. MAX_CU_SIZE - 1; widths that are a multiple of 16 are never exercised
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;   // MAX_CU_SIZE - 4 .. MAX_CU_SIZE - 1
+
+        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref); // + 1 so that rec[-1] stays inside pbuf3
+        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
+
+        // TODO: stop comparing upBuff* (their trailing bytes may differ); they could move into a stack temporary in future. NOTE(review): the memcmp below still compares them.
+        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
+            || memcmp(_upBufft_ref, _upBufft_vec, sizeof(_upBufft_ref))
+            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
+            || memcmp(count_ref, count_vec, sizeof(count_ref)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt)
 {
     enum { NUM_EDGETYPE = 5 };
@@ -1894,6 +1948,15 @@
         }
     }
 
+    if (opt.saoCuStatsE2)
+    {
+        if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
+        {
+            printf("saoCuStatsE2 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuStatsE3)
     {
         if (!check_saoCuStatsE3_t(ref.saoCuStatsE3, opt.saoCuStatsE3))
@@ -2304,6 +2367,17 @@
         REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
     }
 
+    if (opt.saoCuStatsE2)
+    {
+        int32_t stats[5], count[5];
+        int8_t upBuff1[MAX_CU_SIZE + 2];
+        int8_t upBufft[MAX_CU_SIZE + 2];
+        memset(upBuff1, 1, sizeof(upBuff1));
+        memset(upBufft, -1, sizeof(upBufft));
+        HEADER0("saoCuStatsE2");
+        REPORT_SPEEDUP(opt.saoCuStatsE2, ref.saoCuStatsE2, pbuf2, pbuf3, 64, upBuff1 + 1, upBufft + 1, 60, 61, stats, count);
+    }
+
     if (opt.saoCuStatsE3)
     {
         int8_t upBuff1[MAX_CU_SIZE + 2];
diff -r 8ddc790790a4 -r c75ee6c1f083 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue May 26 10:33:56 2015 +0530
+++ b/source/test/pixelharness.h	Tue May 26 16:16:59 2015 -0700
@@ -100,6 +100,7 @@
     bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
+    bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
     bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);



More information about the x265-devel mailing list