[x265] [PATCH 14 of 16] improve saoCuStatsE3 by using the precomputed (fenc - rec) difference

Min Chen chenm003 at 163.com
Wed Oct 7 00:55:25 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167722 18000
# Node ID 8fdd1b8fd4529b6966ab787f6c624f6056f77593
# Parent  1b7be2d88bce205ad41a04f8b5874f22cd8763e7
improve saoCuStatsE3 by use prepare (fenc - frec)
---
 source/common/primitives.h        |    2 +-
 source/common/x86/loopfilter.asm  |  139 +++++++++++++++++++++++++++++++++++
 source/common/x86/loopfilter.h    |    2 +-
 source/common/x86/pixel-util8.asm |  144 -------------------------------------
 source/encoder/sao.cpp            |    8 +-
 source/test/pixelharness.cpp      |    6 +-
 6 files changed, 148 insertions(+), 153 deletions(-)

diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/primitives.h
--- a/source/common/primitives.h	Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 16:42:02 2015 -0500
@@ -180,7 +180,7 @@
 typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Tue Oct 06 16:42:02 2015 -0500
@@ -2404,3 +2404,142 @@
 %endif ; ARCH_X86_64
 
 
+
+
+;void saoStatE3(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+;{
+;    memset(tmp_stats, 0, sizeof(tmp_stats));
+;    memset(tmp_count, 0, sizeof(tmp_count));
+;    for (y = startY; y < endY; y++)
+;    {
+;        for (x = startX; x < endX; x++)
+;        {
+;            int signDown = signOf2(rec[x], rec[x + stride - 1]);
+;            uint32_t edgeType = signDown + upBuff1[x] + 2;
+;            upBuff1[x - 1] = (int8_t)(-signDown);
+;            tmp_stats[edgeType] += diff[x];
+;            tmp_count[edgeType]++;
+;        }
+;        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+;        rec += stride;
+;        diff += MAX_CU_SIZE;
+;    }
+;    for (x = 0; x < NUM_EDGETYPE; x++)
+;    {
+;        stats[s_eoTable[x]] += tmp_stats[x];
+;        count[s_eoTable[x]] += tmp_count[x];
+;    }
+;}
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 words of count at [rsp], then 5 dwords of stats at [rsp + 5 * 2]
+    mov         r4d, r4m                        ; r4d = endX
+    mov         r5d, r5m                        ; r5d = endY
+    ; r0 = diff, r1 = rec, r2 = stride, r3 = upBuff1 (argument order of the C prototype)
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m0, [pb_128]                    ; bias so the signed pcmpgtb below orders unsigned pixels
+    mova        m5, [pb_1]
+    mova        m6, [pb_2]
+    movh        m7, [r3 + r4]                   ; save 8 bytes at upBuff1[endX]; the 16-byte stores below can overrun the row
+
+.loopH:
+    mov         r6d, r4d                        ; r6d = pixels left in this row
+
+.loopW:
+    movu        m1, [r1]                        ; rec[x ...]
+    movu        m2, [r1 + r2 - 1]               ; rec[x + stride - 1 ...]
+
+    ; signDown
+    pxor        m1, m0
+    pxor        m2, m0
+    pcmpgtb     m3, m1, m2
+    pand        m3, m5                          ; +1 where rec[x] > rec[x + stride - 1]
+    pcmpgtb     m2, m1                          ; -1 where rec[x] < rec[x + stride - 1]
+    por         m2, m3                          ; m2 = signDown
+    pxor        m3, m3
+    psubb       m3, m2                          ; m3 = -signDown
+
+    ; edgeType
+    movu        m4, [r3]
+    paddb       m4, m6
+    paddb       m2, m4                          ; m2 = signDown + upBuff1[x] + 2
+
+    ; update upBuff1
+    movu        [r3 - 1], m3                    ; upBuff1[x - 1] = -signDown
+
+    ; stats[edgeType]
+    ; diff[x] is read directly from memory below, so rec (m1) is not needed any more
+
+    ; 16 pixels
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x                      ; r7 = edgeType of pixel x
+    inc    word [rsp + r7 * 2]                  ; tmp_count[edgeType]++
+
+    movsx       r8d, word [r0 + x * 2]          ; diff[x], sign-extended to 32 bits
+    add         [rsp + 5 * 2 + r7 * 4], r8d     ; tmp_stats[edgeType] += diff[x]
+
+    dec         r6d
+    jz         .next                            ; row done; leaves BEFORE the pointer advance below
+%assign x x+1
+%endrep
+
+    add         r0, 16*2
+    add         r1, 16
+    add         r3, 16
+    jmp         .loopW
+
+.next:
+    ; rewind by what .loopW really advanced: ((endX - 1) / 16) * 16 elements
+    lea         r6d, [r4 - 1]                   ; endX - 1, not endX: no advance happens after the last pixel when endX % 16 == 0
+    and         r6d, ~15
+    neg         r6                              ; MUST BE 64-bit, the value is negative
+
+    ; move to next row
+
+    ; move back to start point
+    add         r3, r6
+
+    ; adjust with stride
+    lea         r0, [r0 + (r6 + 64) * 2]        ; 64 = MAX_CU_SIZE
+    add         r1, r2
+    add         r1, r6
+
+    dec         r5d
+    jg         .loopH
+
+    ; restore unavailable pixels
+    movh        [r3 + r4], m7
+
+    ; sum to global buffer
+    mov         r1, r6m                         ; r1 = stats
+    mov         r0, r7m                         ; r0 = count
+
+    ; s_eoTable = {1,2,0,3,4}
+    movzx       r6d, word [rsp + 0 * 2]
+    add         [r0 + 1 * 4], r6d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r6d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r6d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r6d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r6d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]
+    add         [r1 + 1 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif ; ARCH_X86_64
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/x86/loopfilter.h	Tue Oct 06 16:42:02 2015 -0500
@@ -40,7 +40,7 @@
     void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
 DECL_SAO(sse4);
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Tue Oct 06 16:42:02 2015 -0500
@@ -6702,150 +6702,6 @@
     RET
 
 
-;void saoStatE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-;{
-;    memset(tmp_stats, 0, sizeof(tmp_stats));
-;    memset(tmp_count, 0, sizeof(tmp_count));
-;    for (y = startY; y < endY; y++)
-;    {
-;        for (x = startX; x < endX; x++)
-;        {
-;            int signDown = signOf2(rec[x], rec[x + stride - 1]);
-;            uint32_t edgeType = signDown + upBuff1[x] + 2;
-;            upBuff1[x - 1] = (int8_t)(-signDown);
-;            tmp_stats[edgeType] += (fenc[x] - rec[x]);
-;            tmp_count[edgeType]++;
-;        }
-;        upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
-;        rec += stride;
-;        fenc += stride;
-;    }
-;    for (x = 0; x < NUM_EDGETYPE; x++)
-;    {
-;        stats[s_eoTable[x]] += tmp_stats[x];
-;        count[s_eoTable[x]] += tmp_count[x];
-;    }
-;}
-
-%if ARCH_X86_64
-INIT_XMM sse4
-cglobal saoCuStatsE3, 4,9,8,0-32    ; Stack: 5 of stats and 5 of count
-    mov         r4d, r4m
-    mov         r5d, r5m
-
-    ; clear internal temporary buffer
-    pxor        m0, m0
-    mova        [rsp], m0
-    mova        [rsp + mmsize], m0
-    mova        m0, [pb_128]
-    mova        m5, [pb_1]
-    mova        m6, [pb_2]
-    movh        m7, [r3 + r4]
-
-.loopH:
-    mov         r6d, r4d
-
-.loopW:
-    movu        m1, [r1]
-    movu        m2, [r1 + r2 - 1]
-
-    ; signDown
-    pxor        m1, m0
-    pxor        m2, m0
-    pcmpgtb     m3, m1, m2
-    pand        m3, m5
-    pcmpgtb     m2, m1
-    por         m2, m3
-    pxor        m3, m3
-    psubb       m3, m2
-
-    ; edgeType
-    movu        m4, [r3]
-    paddb       m4, m6
-    paddb       m2, m4
-
-    ; update upBuff1
-    movu        [r3 - 1], m3
-
-    ; stats[edgeType]
-    pxor        m1, m0
-    movu        m3, [r0]
-    punpckhbw   m4, m3, m1
-    punpcklbw   m3, m1
-    pmaddubsw   m3, [hmul_16p + 16]
-    pmaddubsw   m4, [hmul_16p + 16]
-
-    ; 16 pixels
-%assign x 0
-%rep 16
-    pextrb      r7d, m2, x
-    inc    word [rsp + r7 * 2]
-
-  %if (x < 8)
-    pextrw      r8d, m3, (x % 8)
-  %else
-    pextrw      r8d, m4, (x % 8)
-  %endif
-    movsx       r8d, r8w
-    add         [rsp + 5 * 2 + r7 * 4], r8d
-
-    dec         r6d
-    jz         .next
-%assign x x+1
-%endrep
-
-    add         r0, 16
-    add         r1, 16
-    add         r3, 16
-    jmp         .loopW
-
-.next:
-    ; restore pointer upBuff1
-    mov         r6d, r4d
-    and         r6d, 15
-
-    ; move to next row
-    sub         r6, r4
-    add         r3, r6
-    add         r6, r2
-    add         r0, r6
-    add         r1, r6
-    dec         r5d
-    jg         .loopH
-
-    ; restore unavailable pixels
-    movh        [r3 + r4], m7
-
-    ; sum to global buffer
-    mov         r1, r6m
-    mov         r0, r7m
-
-    ; s_eoTable = {1,2,0,3,4}
-    movzx       r6d, word [rsp + 0 * 2]
-    add         [r0 + 1 * 4], r6d
-    movzx       r6d, word [rsp + 1 * 2]
-    add         [r0 + 2 * 4], r6d
-    movzx       r6d, word [rsp + 2 * 2]
-    add         [r0 + 0 * 4], r6d
-    movzx       r6d, word [rsp + 3 * 2]
-    add         [r0 + 3 * 4], r6d
-    movzx       r6d, word [rsp + 4 * 2]
-    add         [r0 + 4 * 4], r6d
-
-    mov         r6d, [rsp + 5 * 2 + 0 * 4]
-    add         [r1 + 1 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 1 * 4]
-    add         [r1 + 2 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 2 * 4]
-    add         [r1 + 0 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 3 * 4]
-    add         [r1 + 3 * 4], r6d
-    mov         r6d, [rsp + 5 * 2 + 4 * 4]
-    add         [r1 + 4 * 4], r6d
-    RET
-%endif ; ARCH_X86_64
-
-
 ; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int subPosBase)
 ;for (int i = 0; i < MLS_CG_SIZE; i++)
 ;{
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Oct 06 16:41:59 2015 -0500
+++ b/source/encoder/sao.cpp	Tue Oct 06 16:42:02 2015 -0500
@@ -830,7 +830,7 @@
 
             primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
 
-            primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0  + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
+            primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0  + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
         }
     }
 }
@@ -1667,7 +1667,7 @@
     }
 }
 
-void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
 {
     X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
     X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
@@ -1689,14 +1689,14 @@
 
             uint32_t edgeType = signDown + upBuff1[x] + 2;
             upBuff1[x - 1] = (int8_t)(-signDown);
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+            tmp_stats[edgeType] += diff[x];
             tmp_count[edgeType]++;
         }
 
         upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
 
         rec += stride;
-        fenc += stride;
+        diff += MAX_CU_SIZE;
     }
 
     for (x = 0; x < SAO::NUM_EDGETYPE; x++)
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 16:41:59 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Oct 06 16:42:02 2015 -0500
@@ -1245,8 +1245,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -2886,7 +2886,7 @@
         int32_t stats[5], count[5];
         memset(upBuff1, 1, sizeof(upBuff1));
         HEADER0("saoCuStatsE3");
-        REPORT_SPEEDUP(opt.saoCuStatsE3, ref.saoCuStatsE3, pbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
+        REPORT_SPEEDUP(opt.saoCuStatsE3, ref.saoCuStatsE3, sbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
     }
 
     if (opt.planecopy_sp)



More information about the x265-devel mailing list