[x265] [PATCH 10 of 16] prepare on (fenc - frec) and improve saoCuStatsBO

Min Chen chenm003 at 163.com
Wed Oct 7 00:55:21 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167712 18000
# Node ID 5f448e155870fdbb34242fa51c5a4eeebd71ebc0
# Parent  5429d2f26ebfa2245e1a754a4355caf5c7f13c27
prepare on (fenc - frec) and improve saoCuStatsBO
---
 source/common/primitives.h       |    2 +-
 source/common/x86/loopfilter.asm |   37 +++++++++++--------------------------
 source/common/x86/loopfilter.h   |    2 +-
 source/encoder/sao.cpp           |   22 ++++++++++++++++++----
 source/test/pixelharness.cpp     |    6 +++---
 5 files changed, 34 insertions(+), 35 deletions(-)

diff -r 5429d2f26ebf -r 5f448e155870 source/common/primitives.h
--- a/source/common/primitives.h	Tue Oct 06 16:41:50 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 16:41:52 2015 -0500
@@ -176,7 +176,7 @@
 typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
 typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 
-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
diff -r 5429d2f26ebf -r 5f448e155870 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:50 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:52 2015 -0500
@@ -1989,14 +1989,12 @@
 %endif
 
 ;--------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;--------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsBO, 7,12,6
-    mova        m3, [hmul_16p + 16]
-    mova        m4, [pb_124]
-    mova        m5, [pb_4]
+cglobal saoCuStatsBO, 7,12,2
+    mova        m0, [pb_124]
     xor         r7d, r7d
 
 .loopH:
@@ -2005,42 +2003,29 @@
     mov         r9d, r3d
 .loopL:
     movu        m1, [r11]
-    movu        m0, [r10]
-
-    punpckhbw   m2, m0, m1
-    punpcklbw   m0, m1
-    psrlw       m1, 1               ; rec[x] >> boShift
-    pmaddubsw   m2, m3
-    pmaddubsw   m0, m3
-    pand        m1, m4
-    paddb       m1, m5
+    psrlw       m1, 1                   ; rec[x] >> boShift
+    pand        m1, m0
 
 %assign x 0
 %rep 16
     pextrb      r7d, m1, x
-
-%if (x < 8)
-    pextrw      r8d, m0, (x % 8)
-%else
-    pextrw      r8d, m2, (x % 8)
-%endif
-    movsx       r8d, r8w
-    inc         dword  [r6 + r7]    ; count[classIdx]++
-    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
+    movsx       r8d, word [r10 + x*2]   ; diff[x]
+    inc         dword  [r6 + r7 + 4]    ; count[classIdx]++
+    add         [r5 + r7 + 4], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
     dec         r9d
     jz          .next
 %assign x x+1
 %endrep
 
-    add         r10, 16
+    add         r10, 16*2
     add         r11, 16
     jmp         .loopL
 
 .next:
-    add         r0, r2
+    add         r0, 64*2                ; MAX_CU_SIZE
     add         r1, r2
     dec         r4d
-    jnz         .loopH
+    jnz        .loopH
     RET
 %endif
 
diff -r 5429d2f26ebf -r 5f448e155870 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Oct 06 16:41:50 2015 -0500
+++ b/source/common/x86/loopfilter.h	Tue Oct 06 16:41:52 2015 -0500
@@ -36,7 +36,7 @@
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
-    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
diff -r 5429d2f26ebf -r 5f448e155870 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Oct 06 16:41:50 2015 -0500
+++ b/source/encoder/sao.cpp	Tue Oct 06 16:41:52 2015 -0500
@@ -712,6 +712,20 @@
     int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
     int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
 
+    ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]);
+
+    // Calculate (fenc - frec) and put into diff[]
+    // WARNING: *) May read out of bounds on video whose width or height is NOT a multiple of cuSize
+    //          *) You MUST handle color spaces other than 4:2:0 yourself!
+    //primitives.cu[g_maxLog2CUSize - 2 - (plane != 0)].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
+    for(int y = 0; y < ctuHeight; y++)
+    {
+        for(int x = 0; x < ctuWidth; x++)
+        {
+            diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]);
+        }
+    }
+
     // SAO_BO:
     {
         if (m_param->bSaoNonDeblocked)
@@ -723,7 +737,7 @@
         endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
         endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset;
 
-        primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
+        primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
     }
 
     {
@@ -1526,7 +1540,7 @@
 }
 
 // NOTE: must put in namespace X265_NS since we need class SAO
-void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 {
     int x, y;
     const int boShift = X265_DEPTH - SAO_BO_BITS;
@@ -1536,11 +1550,11 @@
         for (x = 0; x < endX; x++)
         {
             int classIdx = 1 + (rec[x] >> boShift);
-            stats[classIdx] += (fenc[x] - rec[x]);
+            stats[classIdx] += diff[x];
             count[classIdx]++;
         }
 
-        fenc += stride;
+        diff += MAX_CU_SIZE;
         rec += stride;
     }
 }
diff -r 5429d2f26ebf -r 5f448e155870 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 16:41:50 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Oct 06 16:41:52 2015 -0500
@@ -1062,8 +1062,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
 
         if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
             return false;
@@ -2850,7 +2850,7 @@
     {
         int32_t stats[33], count[33];
         HEADER0("saoCuStatsBO");
-        REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
+        REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, sbuf2, pbuf3, 64, 60, 61, stats, count);
     }
 
     if (opt.saoCuStatsE0)



More information about the x265-devel mailing list