[x265] [PATCH 13 of 16] improve saoCuStatsE2 by use prepare (fenc - frec)

Min Chen chenm003 at 163.com
Wed Oct 7 00:55:24 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167719 18000
# Node ID 1b7be2d88bce205ad41a04f8b5874f22cd8763e7
# Parent  becaacf5b72bfc13860ce183e1ff411b5820163d
improve saoCuStatsE2 by use prepare (fenc - frec)
---
 source/common/primitives.h        |    2 +-
 source/common/x86/loopfilter.asm  |  162 ++++++++++++++++++++++++++++++++++++
 source/common/x86/loopfilter.h    |    2 +-
 source/common/x86/pixel-util8.asm |  166 -------------------------------------
 source/encoder/sao.cpp            |    8 +-
 source/test/pixelharness.cpp      |    6 +-
 6 files changed, 171 insertions(+), 175 deletions(-)

diff -r becaacf5b72b -r 1b7be2d88bce source/common/primitives.h
--- a/source/common/primitives.h	Tue Oct 06 16:41:57 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 16:41:59 2015 -0500
@@ -179,7 +179,7 @@
 typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r becaacf5b72b -r 1b7be2d88bce source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:57 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:59 2015 -0500
@@ -2242,3 +2242,165 @@
     add         [r1 + 4 * 4], r6d
     RET
 %endif ; ARCH_X86_64
+
+
+;void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
+;{
+;    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
+;    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
+;    int x, y;
+;    int32_t tmp_stats[SAO::NUM_EDGETYPE];
+;    int32_t tmp_count[SAO::NUM_EDGETYPE];
+;    memset(tmp_stats, 0, sizeof(tmp_stats));
+;    memset(tmp_count, 0, sizeof(tmp_count));
+;    for (y = 0; y < endY; y++)
+;    {
+;        upBufft[0] = signOf(rec[stride] - rec[-1]);
+;        for (x = 0; x < endX; x++)
+;        {
+;            int signDown = signOf2(rec[x], rec[x + stride + 1]);
+;            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
+;            uint32_t edgeType = signDown + upBuff1[x] + 2;
+;            upBufft[x + 1] = (int8_t)(-signDown);
+;            tmp_stats[edgeType] += diff[x];
+;            tmp_count[edgeType]++;
+;        }
+;        std::swap(upBuff1, upBufft);
+;        rec += stride;
+;        diff += MAX_CU_SIZE;
+;    }
+;    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+;    {
+;        stats[SAO::s_eoTable[x]] += tmp_stats[x];
+;        count[SAO::s_eoTable[x]] += tmp_count[x];
+;    }
+;}
+
+%if ARCH_X86_64
+; TODO: x64 only because I need temporary registers r7, r8; easy to port to x86
+INIT_XMM sse4
+cglobal saoCuStatsE2, 5,9,8,0-32    ; Stack: 5 x int16 count at [rsp], then 5 x int32 stats at [rsp + 5 * 2]
+    mov         r5d, r5m                        ; r5d = endX (r0..r4 = diff, rec, stride, upBuff1, upBufft)
+
+    ; clear internal temporary buffer (the 32 bytes holding the count/stats accumulators)
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m0, [pb_128]                    ; presumably 0x80 in every byte (defined elsewhere); used to flip sign bits
+    mova        m5, [pb_1]
+    mova        m6, [pb_2]
+
+.loopH:
+    ; TODO: merge into the SIMD code below
+    ; get upBuffX[0]: upBufft[0] = signOf(rec[stride] - rec[-1])
+    mov         r6b, [r1 + r2]
+    sub         r6b, [r1 -  1]
+    seta        r6b                             ; 1 if rec[stride] > rec[-1] (unsigned byte compare)
+    setb        r7b                             ; 1 if rec[stride] < rec[-1]
+    sub         r6b, r7b                        ; r6b = +1 / 0 / -1
+    mov         [r4], r6b
+
+    ; backup unavailable pixels: save the 8 bytes at upBufft[endX + 1], restored after the row
+    movh        m7, [r4 + r5 + 1]
+
+    mov         r6d, r5d                        ; r6d = pixels left in this row
+.loopW:
+    movu        m1, [r1]                        ; rec[x .. x + 15]
+    movu        m2, [r1 + r2 + 1]               ; rec[x + stride + 1 ..], the down-right neighbours
+
+    ; signDown = signOf(rec[x] - rec[x + stride + 1]) for 16 pixels at once
+    pxor        m1, m0                          ; flip sign bits so the signed pcmpgtb orders bytes as unsigned
+    pxor        m2, m0
+    pcmpgtb     m3, m1, m2                      ; -1 where rec > down
+    pand        m3, m5                          ; -> +1 where rec > down
+    pcmpgtb     m2, m1                          ; -1 where rec < down
+    por         m2, m3                          ; m2 = signDown
+    pxor        m3, m3
+    psubb       m3, m2                          ; m3 = -signDown
+
+    ; edgeType = signDown + upBuff1[x] + 2
+    movu        m4, [r3]
+    paddb       m4, m6
+    paddb       m2, m4
+
+    ; update upBuff1: upBufft[x + 1] = -signDown (always stores a full 16 bytes)
+    movu        [r4 + 1], m3
+
+    ; stats[edgeType]
+    pxor        m1, m0                          ; NOTE(review): m1 is not read again before it is reloaded - looks dead
+
+    ; 16 pixels
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x                      ; r7 = edgeType of pixel x
+    inc    word [rsp + r7 * 2]                  ; tmp_count[edgeType]++
+
+    movsx       r8d, word [r0 + x * 2]          ; sign-extended diff[x]
+    add         [rsp + 5 * 2 + r7 * 4], r8d     ; tmp_stats[edgeType] += diff[x]
+
+    dec         r6d
+    jz         .next                            ; row finished
+%assign x x+1
+%endrep
+
+    add         r0, 16*2                        ; 16 int16_t diff values
+    add         r1, 16
+    add         r3, 16
+    add         r4, 16
+    jmp        .loopW
+
+.next:
+    xchg        r3, r4                          ; std::swap(upBuff1, upBufft)
+
+    ; restore pointer upBuff1: r6 = -(endX & ~15), the distance .loopW is assumed to have advanced
+    mov         r6d, r5d
+    and         r6d, ~15
+    neg         r6                              ; MUST BE 64-bit, the value is negative
+
+    ; move to next row. NOTE(review): if endX is a multiple of 16, .loopW leaves via jz before its
+    ; last pointer advance, so the rewind below looks 16 elements too large - confirm endX is never such
+    ; move back to start point
+    add         r3, r6
+    add         r4, r6
+
+    ; adjust with stride
+    lea         r0, [r0 + (r6 + 64) * 2]        ; rewind, then step one diff row of 64 (= MAX_CU_SIZE) int16_t
+    add         r1, r2
+    add         r1, r6
+
+    ; restore unavailable pixels
+    movh        [r3 + r5 + 1], m7
+
+    dec    byte r6m                             ; count down endY in place (low byte of its argument slot)
+    jg         .loopH
+
+    ; sum to global buffer
+    mov         r1, r7m                         ; r1 = stats
+    mov         r0, r8m                         ; r0 = count
+
+    ; s_eoTable = {1,2,0,3,4}
+    movzx       r6d, word [rsp + 0 * 2]         ; count[s_eoTable[i]] += tmp_count[i]
+    add         [r0 + 1 * 4], r6d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r6d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r6d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r6d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r6d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]      ; stats[s_eoTable[i]] += tmp_stats[i]
+    add         [r1 + 1 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif ; ARCH_X86_64
+
+
diff -r becaacf5b72b -r 1b7be2d88bce source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Oct 06 16:41:57 2015 -0500
+++ b/source/common/x86/loopfilter.h	Tue Oct 06 16:41:59 2015 -0500
@@ -39,7 +39,7 @@
     void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
diff -r becaacf5b72b -r 1b7be2d88bce source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Oct 06 16:41:57 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Tue Oct 06 16:41:59 2015 -0500
@@ -6702,172 +6702,6 @@
     RET
 
 
-;void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
-;{
-;    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
-;    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
-;    int x, y;
-;    int32_t tmp_stats[SAO::NUM_EDGETYPE];
-;    int32_t tmp_count[SAO::NUM_EDGETYPE];
-;    memset(tmp_stats, 0, sizeof(tmp_stats));
-;    memset(tmp_count, 0, sizeof(tmp_count));
-;    for (y = 0; y < endY; y++)
-;    {
-;        upBufft[0] = signOf(rec[stride] - rec[-1]);
-;        for (x = 0; x < endX; x++)
-;        {
-;            int signDown = signOf2(rec[x], rec[x + stride + 1]);
-;            X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
-;            uint32_t edgeType = signDown + upBuff1[x] + 2;
-;            upBufft[x + 1] = (int8_t)(-signDown);
-;            tmp_stats[edgeType] += (fenc[x] - rec[x]);
-;            tmp_count[edgeType]++;
-;        }
-;        std::swap(upBuff1, upBufft);
-;        rec += stride;
-;        fenc += stride;
-;    }
-;    for (x = 0; x < SAO::NUM_EDGETYPE; x++)
-;    {
-;        stats[SAO::s_eoTable[x]] += tmp_stats[x];
-;        count[SAO::s_eoTable[x]] += tmp_count[x];
-;    }
-;}
-
-%if ARCH_X86_64
-; TODO: x64 only because I need temporary register r7,r8, easy portab to x86
-INIT_XMM sse4
-cglobal saoCuStatsE2, 5,9,8,0-32    ; Stack: 5 of stats and 5 of count
-    mov         r5d, r5m
-
-    ; clear internal temporary buffer
-    pxor        m0, m0
-    mova        [rsp], m0
-    mova        [rsp + mmsize], m0
-    mova        m0, [pb_128]
-    mova        m5, [pb_1]
-    mova        m6, [pb_2]
-
-.loopH:
-    ; TODO: merge into below SIMD
-    ; get upBuffX[0]
-    mov         r6b, [r1 + r2]
-    sub         r6b, [r1 -  1]
-    seta        r6b
-    setb        r7b
-    sub         r6b, r7b
-    mov         [r4], r6b
-
-    ; backup unavailable pixels
-    movh        m7, [r4 + r5 + 1]
-
-    mov         r6d, r5d
-.loopW:
-    movu        m1, [r1]
-    movu        m2, [r1 + r2 + 1]
-
-    ; signDown
-    pxor        m1, m0
-    pxor        m2, m0
-    pcmpgtb     m3, m1, m2
-    pand        m3, m5
-    pcmpgtb     m2, m1
-    por         m2, m3
-    pxor        m3, m3
-    psubb       m3, m2
-
-    ; edgeType
-    movu        m4, [r3]
-    paddb       m4, m6
-    paddb       m2, m4
-
-    ; update upBuff1
-    movu        [r4 + 1], m3
-
-    ; stats[edgeType]
-    pxor        m1, m0
-    movu        m3, [r0]
-    punpckhbw   m4, m3, m1
-    punpcklbw   m3, m1
-    pmaddubsw   m3, [hmul_16p + 16]
-    pmaddubsw   m4, [hmul_16p + 16]
-
-    ; 16 pixels
-%assign x 0
-%rep 16
-    pextrb      r7d, m2, x
-    inc    word [rsp + r7 * 2]
-
-  %if (x < 8)
-    pextrw      r8d, m3, (x % 8)
-  %else
-    pextrw      r8d, m4, (x % 8)
-  %endif
-    movsx       r8d, r8w
-    add         [rsp + 5 * 2 + r7 * 4], r8d
-
-    dec         r6d
-    jz         .next
-%assign x x+1
-%endrep
-
-    add         r0, 16
-    add         r1, 16
-    add         r3, 16
-    add         r4, 16
-    jmp         .loopW
-
-.next:
-    xchg        r3, r4
-
-    ; restore pointer upBuff1
-    mov         r6d, r5d
-    and         r6d, 15
-
-    ; move to next row
-    sub         r6, r5
-    add         r3, r6
-    add         r4, r6
-    add         r6, r2
-    add         r0, r6
-    add         r1, r6
-
-    ; restore unavailable pixels
-    movh        [r3 + r5 + 1], m7
-
-    dec    byte r6m
-    jg         .loopH
-
-    ; sum to global buffer
-    mov         r1, r7m
-    mov         r0, r8m
-
-    ; s_eoTable = {1,2,0,3,4}
-    movzx       r6d, word [rsp + 0 * 2]
-    add         [r0 + 1 * 4], r6d
-    movzx       r6d, word [rsp + 1 * 2]
-    add         [r0 + 2 * 4], r6d
-    movzx       r6d, word [rsp + 2 * 2]
-    add         [r0 + 0 * 4], r6d
-    movzx       r6d, word [rsp + 3 * 2]
-    add         [r0 + 3 * 4], r6d
-    movzx       r6d, word [rsp + 4 * 2]
-    add         [r0 + 4 * 4], r6d
-
-    mov         r6d, [rsp + 5 * 2 + 0 * 4]
-    add         [r1 + 1 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 1 * 4]
-    add         [r1 + 2 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 2 * 4]
-    add         [r1 + 0 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 3 * 4]
-    add         [r1 + 3 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 4 * 4]
-    add         [r1 + 4 * 4], r6d
-    RET
-%endif ; ARCH_X86_64
-
-
 ;void saoStatE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 ;{
 ;    memset(tmp_stats, 0, sizeof(tmp_stats));
diff -r becaacf5b72b -r 1b7be2d88bce source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Oct 06 16:41:57 2015 -0500
+++ b/source/encoder/sao.cpp	Tue Oct 06 16:41:59 2015 -0500
@@ -802,7 +802,7 @@
 
             primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
 
-            primitives.saoCuStatsE2(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
+            primitives.saoCuStatsE2(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
         }
 
         // SAO_EO_3: // dir: 45
@@ -1629,7 +1629,7 @@
     }
 }
 
-void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE2_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
 {
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
     X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
@@ -1650,14 +1650,14 @@
             X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
             uint32_t edgeType = signDown + upBuff1[x] + 2;
             upBufft[x + 1] = (int8_t)(-signDown);
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+            tmp_stats[edgeType] += diff[x];
             tmp_count[edgeType]++;
         }
 
         std::swap(upBuff1, upBufft);
 
         rec += stride;
-        fenc += stride;
+        diff += MAX_CU_SIZE;
     }
 
     for (x = 0; x < SAO::NUM_EDGETYPE; x++)
diff -r becaacf5b72b -r 1b7be2d88bce source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 16:41:57 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Oct 06 16:41:59 2015 -0500
@@ -1194,8 +1194,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
 
         // TODO: don't check upBuff*, the latest output pixels different, and can move into stack temporary buffer in future
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
@@ -2877,7 +2877,7 @@
         memset(upBuff1, 1, sizeof(upBuff1));
         memset(upBufft, -1, sizeof(upBufft));
         HEADER0("saoCuStatsE2");
-        REPORT_SPEEDUP(opt.saoCuStatsE2, ref.saoCuStatsE2, pbuf2, pbuf3, 64, upBuff1 + 1, upBufft + 1, 60, 61, stats, count);
+        REPORT_SPEEDUP(opt.saoCuStatsE2, ref.saoCuStatsE2, sbuf2, pbuf3, 64, upBuff1 + 1, upBufft + 1, 60, 61, stats, count);
     }
 
     if (opt.saoCuStatsE3)



More information about the x265-devel mailing list