[x265] [PATCH 12 of 16] improve saoCuStatsE1 by using the precomputed (fenc - frec) difference

Min Chen chenm003 at 163.com
Wed Oct 7 00:55:23 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167717 18000
# Node ID becaacf5b72bfc13860ce183e1ff411b5820163d
# Parent  72d345dcf13e3c715f767ff624f5ff47043e01a6
improve saoCuStatsE1 by using the precomputed (fenc - frec) difference
---
 source/common/primitives.h       |    2 +-
 source/common/x86/loopfilter.asm |   31 +++++++++----------------------
 source/common/x86/loopfilter.h   |    2 +-
 source/encoder/sao.cpp           |   10 ++++------
 source/test/pixelharness.cpp     |    6 +++---
 5 files changed, 18 insertions(+), 33 deletions(-)

diff -r 72d345dcf13e -r becaacf5b72b source/common/primitives.h
--- a/source/common/primitives.h	Tue Oct 06 16:41:54 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 16:41:57 2015 -0500
@@ -178,7 +178,7 @@
 
 typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
 typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
 
diff -r 72d345dcf13e -r becaacf5b72b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:54 2015 -0500
+++ b/source/common/x86/loopfilter.asm	Tue Oct 06 16:41:57 2015 -0500
@@ -2137,11 +2137,11 @@
 %endif
 
 ;-------------------------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
 ;-------------------------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
 INIT_XMM sse4
-cglobal saoCuStatsE1, 4,12,9,0-32    ; Stack: 5 of stats and 5 of count
+cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
     mov         r5d, r5m
     mov         r4d, r4m
     mov         r11d, r5d
@@ -2153,7 +2153,6 @@
     mova        m0, [pb_128]
     mova        m5, [pb_1]
     mova        m6, [pb_2]
-    mova        m8, [hmul_16p + 16]
     movh        m7, [r3 + r4]
 
 .loopH:
@@ -2170,11 +2169,11 @@
     pxor        m1, m0
     pxor        m2, m0
     pcmpgtb     m3, m1, m2
+    pcmpgtb     m2, m1
     pand        m3, m5
-    pcmpgtb     m2, m1
     por         m2, m3
     pxor        m3, m3
-    psubb       m3, m2      ; -signDown
+    psubb       m3, m2                          ; -signDown
 
     ; edgeType
     movu        m4, [r11]
@@ -2184,26 +2183,14 @@
     ; update upBuff1
     movu        [r11], m3
 
-    ; stats[edgeType]
-    pxor        m1, m0
-    movu        m3, [r9]
-    punpckhbw   m4, m3, m1
-    punpcklbw   m3, m1
-    pmaddubsw   m3, m8
-    pmaddubsw   m4, m8
-
     ; 16 pixels
 %assign x 0
 %rep 16
     pextrb      r7d, m2, x
     inc         word [rsp + r7 * 2]
 
-  %if (x < 8)
-    pextrw      r8d, m3, (x % 8)
-  %else
-    pextrw      r8d, m4, (x % 8)
-  %endif
-    movsx       r8d, r8w
+    ; stats[edgeType]
+    movsx       r8d, word [r9 + x * 2]
     add         [rsp + 5 * 2 + r7 * 4], r8d
 
     dec         r6d
@@ -2211,14 +2198,14 @@
 %assign x x+1
 %endrep
 
-    add         r9, 16
+    add         r9, 16*2
     add         r10, 16
     add         r11, 16
-    jmp         .loopW
+    jmp        .loopW
 
 .next:
     ; restore pointer upBuff1
-    add         r0, r2
+    add         r0, 64*2                        ; MAX_CU_SIZE
     add         r1, r2
 
     dec         r5d
diff -r 72d345dcf13e -r becaacf5b72b source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Oct 06 16:41:54 2015 -0500
+++ b/source/common/x86/loopfilter.h	Tue Oct 06 16:41:57 2015 -0500
@@ -38,7 +38,7 @@
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
     void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
-    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 72d345dcf13e -r becaacf5b72b source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Tue Oct 06 16:41:54 2015 -0500
+++ b/source/encoder/sao.cpp	Tue Oct 06 16:41:57 2015 -0500
@@ -763,7 +763,6 @@
                 skipR = 4;
             }
 
-            fenc = fenc0;
             rec  = rec0;
 
             startY = !tpely;
@@ -771,13 +770,12 @@
             endY   = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
             if (!tpely)
             {
-                fenc += stride;
                 rec += stride;
             }
 
             primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
 
-            primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
+            primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
         }
 
         // SAO_EO_2: // dir: 135
@@ -1596,7 +1594,7 @@
     }
 }
 
-void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
 {
     X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
     X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
@@ -1617,10 +1615,10 @@
             uint32_t edgeType = signDown + upBuff1[x] + 2;
             upBuff1[x] = (int8_t)(-signDown);
 
-            tmp_stats[edgeType] += (fenc[x] - rec[x]);
+            tmp_stats[edgeType] += diff[x];
             tmp_count[edgeType]++;
         }
-        fenc += stride;
+        diff += MAX_CU_SIZE;
         rec += stride;
     }
 
diff -r 72d345dcf13e -r becaacf5b72b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Oct 06 16:41:54 2015 -0500
+++ b/source/test/pixelharness.cpp	Tue Oct 06 16:41:57 2015 -0500
@@ -1142,8 +1142,8 @@
         int endX = MAX_CU_SIZE - (rand() % 5);
         int endY = MAX_CU_SIZE - (rand() % 4) - 1;
 
-        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
-        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+        ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
 
         if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
             || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -2866,7 +2866,7 @@
         int8_t upBuff1[MAX_CU_SIZE + 2];
         memset(upBuff1, 1, sizeof(upBuff1));
         HEADER0("saoCuStatsE1");
-        REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, pbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
+        REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, sbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
     }
 
     if (opt.saoCuStatsE2)



More information about the x265-devel mailing list