[x265] [PATCH 11 of 16] improve saoCuStatsE0 by using precomputed (fenc - rec)
    Min Chen <chenm003 at 163.com>
    Wed Oct  7 00:55:22 CEST 2015
    
    
  
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167714 18000
# Node ID 72d345dcf13e3c715f767ff624f5ff47043e01a6
# Parent  5f448e155870fdbb34242fa51c5a4eeebd71ebc0
improve saoCuStatsE0 by using precomputed (fenc - rec)
---
 source/common/primitives.h       |    2 +-
 source/common/x86/loopfilter.asm |   77 +++++++++++++++++---------------------
 source/common/x86/loopfilter.h   |    2 +-
 source/encoder/sao.cpp           |   10 +++--
 source/test/pixelharness.cpp     |    6 +-
 5 files changed, 45 insertions(+), 52 deletions(-)
diff -r 5f448e155870 -r 72d345dcf13e source/common/primitives.h
--- a/source/common/primitives.h	Tue Oct 06 16:41:52 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 16:41:54 2015 -0500
@@ -177,7 +177,7 @@
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
 typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
diff -r 5f448e155870 -r 72d345dcf13e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:52 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:54 2015 -0500
@@ -26,6 +26,7 @@
 ;*****************************************************************************/
 
 %include "x86inc.asm"
+%include "x86util.asm"
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
@@ -2030,23 +2031,29 @@
 %endif
 
 ;-----------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;-----------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsE0, 5,9,8, 0-32
+cglobal saoCuStatsE0, 3,10,6, 0-32
     mov         r3d, r3m
-    mov         r8, r5mp
+    mov         r4d, r4m
+    mov         r9, r5mp
 
     ; clear internal temporary buffer
     pxor        m0, m0
     mova        [rsp], m0
     mova        [rsp + mmsize], m0
     mova        m4, [pb_128]
-    mova        m5, [hmul_16p + 16]
-    mova        m6, [pb_2]
+    mova        m5, [pb_2]
     xor         r7d, r7d
 
+    ; correct stride for diff[] and rec
+    mov         r6d, r3d
+    and         r6d, ~15
+    sub         r2, r6
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
+
 .loopH:
     mov         r5d, r3d
 
@@ -2060,62 +2067,46 @@
     pinsrb      m0, r7d, 15
 
 .loopL:
-    movu        m7, [r1]
+    movu        m3, [r1]
     movu        m2, [r1 + 1]
 
-    pxor        m1, m7, m4
-    pxor        m3, m2, m4
-    pcmpgtb     m2, m1, m3
-    pcmpgtb     m3, m1
-    pand        m2, [pb_1]
-    por         m2, m3              ; signRight
+    pxor        m1, m3, m4
+    pxor        m2, m4
+    pcmpgtb     m3, m1, m2
+    pcmpgtb     m2, m1
+    pand        m3, [pb_1]
+    por         m2, m3                          ; signRight
 
     palignr     m3, m2, m0, 15
-    psignb      m3, m4              ; signLeft
+    psignb      m3, m4                          ; signLeft
 
     mova        m0, m2
     paddb       m2, m3
-    paddb       m2, m6              ; edgeType
+    paddb       m2, m5                          ; edgeType
 
     ; stats[edgeType]
-    movu        m3, [r0]            ; fenc[0-15]
-    punpckhbw   m1, m3, m7
-    punpcklbw   m3, m7
-    pmaddubsw   m1, m5
-    pmaddubsw   m3, m5
-
 %assign x 0
 %rep 16
     pextrb      r7d, m2, x
 
-%if (x < 8)
-    pextrw      r6d, m3, (x % 8)
-%else
-    pextrw      r6d, m1, (x % 8)
-%endif
-    movsx       r6d, r6w
+    movsx       r6d, word [r0 + x * 2]
     inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
     add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
     dec         r5d
-    jz          .next
+    jz         .next
 %assign x x+1
 %endrep
 
-    add         r0q, 16
-    add         r1q, 16
-    jmp         .loopL
+    add         r0, 16*2
+    add         r1, 16
+    jmp        .loopL
 
 .next:
-    mov         r6d, r3d
-    and         r6d, 15
-
-    sub         r6, r3
-    add         r6, r2
-    add         r0, r6
-    add         r1, r6
+    sub         r0, r8
+    add         r1, r2
 
     dec         r4d
-    jnz         .loopH
+    jnz        .loopH
 
     ; sum to global buffer
     mov         r0, r6mp
@@ -2133,15 +2124,15 @@
     add         [r0 + 4 * 4], r5d
 
     mov         r6d, [rsp + 5 * 2 + 0 * 4]
-    add         [r8 + 1 * 4], r6d
+    add         [r9 + 1 * 4], r6d
     mov         r5d, [rsp + 5 * 2 + 1 * 4]
-    add         [r8 + 2 * 4], r5d
+    add         [r9 + 2 * 4], r5d
     mov         r6d, [rsp + 5 * 2 + 2 * 4]
-    add         [r8 + 0 * 4], r6d
+    add         [r9 + 0 * 4], r6d
     mov         r5d, [rsp + 5 * 2 + 3 * 4]
-    add         [r8 + 3 * 4], r5d
+    add         [r9 + 3 * 4], r5d
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
-    add         [r8 + 4 * 4], r6d
+    add         [r9 + 4 * 4], r6d
     RET
 %endif
 
diff -r 5f448e155870 -r 72d345dcf13e source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Oct 06 16:41:52 2015 -0500
+++ b/source/common/x86/loopfilter.h	Tue Oct 06 16:41:54 2015 -0500
@@ -37,7 +37,7 @@
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
     void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
diff -r 5f448e155870 -r 72d345dcf13e source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Oct 06 16:41:52 2015 -0500
+++ b/source/encoder/sao.cpp	Tue Oct 06 16:41:54 2015 -0500
@@ -752,7 +752,7 @@
             startX = !lpelx;
             endX   = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
 
-            primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
+            primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
         }
 
         // SAO_EO_1: // dir: |
@@ -1559,12 +1559,14 @@
     }
 }
 
-void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 {
     int x, y;
     int32_t tmp_stats[SAO::NUM_EDGETYPE];
     int32_t tmp_count[SAO::NUM_EDGETYPE];
 
+    X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n");
+
     memset(tmp_stats, 0, sizeof(tmp_stats));
     memset(tmp_count, 0, sizeof(tmp_count));
 
@@ -1579,11 +1581,11 @@
             signLeft = -signRight;
 
             X265_CHECK(edgeType <= 4, "edgeType check failure\n");
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+            tmp_stats[edgeType] += diff[x];
             tmp_count[edgeType]++;
         }
 
-        fenc += stride;
+        diff += MAX_CU_SIZE;
         rec += stride;
     }
 
diff -r 5f448e155870 -r 72d345dcf13e source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 16:41:52 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Oct 06 16:41:54 2015 -0500
@@ -1098,8 +1098,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5) - 1;
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -2857,7 +2857,7 @@
     {
         int32_t stats[33], count[33];
         HEADER0("saoCuStatsE0");
-        REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
+        REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, sbuf2, pbuf3, 64, 60, 61, stats, count);
     }
 
     if (opt.saoCuStatsE1)
    
    
More information about the x265-devel mailing list