[x265] [PATCH 1 of 3] asm: sse4 code for saoCuStatsBO, improved 185378c->131279c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jul 7 11:35:35 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1436247875 -19800
#      Tue Jul 07 11:14:35 2015 +0530
# Node ID e0166f09f332af72a83eb059d878044db15f59bd
# Parent  523540864864752baea88ba0ac78cf292364bf7e
asm: sse4 code for saoCuStatsBO, improved 185378c->131279c

diff -r 523540864864 -r e0166f09f332 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jul 06 14:12:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 07 11:14:35 2015 +0530
@@ -2497,6 +2497,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
 
 #if X86_64
+        p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r 523540864864 -r e0166f09f332 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Mon Jul 06 14:12:55 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Tue Jul 07 11:14:35 2015 +0530
@@ -29,6 +29,7 @@
 
 SECTION_RODATA 32
 pb_31:      times 32 db 31
+pb_124:     times 32 db 124
 pb_15:      times 32 db 15
 pb_movemask_32:  times 32 db 0x00
                  times 32 db 0xFF
@@ -41,6 +42,8 @@
 cextern pw_1023
 cextern pb_movemask
 cextern pw_1
+cextern hmul_16p
+cextern pb_4
 
 
 ;============================================================================================================
@@ -1984,3 +1987,59 @@
 .end:
     RET
 %endif
+
+;--------------------------------------------------------------------------------------------------------------------------
+; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;--------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsBO, 7,12,6        ; args, assuming x86inc maps them in prototype order: r0=fenc r1=rec r2=stride r3=endX r4=endY r5=stats r6=count
+    mova        m3, [hmul_16p + 16] ; pmaddubsw multiplier; presumably alternating +1/-1 bytes (table is defined elsewhere - confirm)
+    mova        m4, [pb_124]        ; 124 = 0x7C in every byte: keeps bits 2..6 of the shifted rec value
+    mova        m5, [pb_4]          ; presumably 4 in every byte (extern constant - confirm): bias of one int32_t slot
+    xor         r7d, r7d            ; r7 is reused below as the byte offset into stats/count
+
+.loopH:                             ; row loop, counted down in r4d
+    mov         r10, r0             ; r10 = fenc read pointer for this row
+    mov         r11, r1             ; r11 = rec read pointer for this row
+    mov         r9d, r3d            ; r9d = pixels still to process in this row
+.loopL:                             ; 16-pixel chunk loop
+    movu        m1, [r11]           ; 16 rec bytes; NOTE: always a full 16-byte load, even if fewer pixels remain
+    movu        m0, [r10]           ; 16 fenc bytes
+
+    punpckhbw   m2, m0, m1          ; m2 = interleaved (fenc, rec) byte pairs of pixels 8..15
+    punpcklbw   m0, m1              ; m0 = interleaved (fenc, rec) byte pairs of pixels 0..7
+    psrlw       m1, 1               ; 16-bit shift right by 1; after the 0x7C mask below each byte is (rec[x] >> 3) * 4
+    pmaddubsw   m2, m3              ; per-pair multiply-add -> 8 signed words (fenc - rec if m3 is +1/-1), pixels 8..15
+    pmaddubsw   m0, m3              ; same for pixels 0..7
+    pand        m1, m4              ; drop bits 0..1 and the bit shifted in from the neighbouring byte
+    paddb       m1, m5              ; add the per-byte bias from pb_4
+
+%assign x 0
+%rep 16
+    pextrb      r7d, m1, x          ; r7 = zero-extended byte offset into the tables for pixel x
+
+%if (x < 8)
+    pextrw      r8d, m0, (x % 8)    ; difference word of pixel x (low half)
+%else
+    pextrw      r8d, m2, (x % 8)    ; difference word of pixel x (high half)
+%endif
+    movsx       r8d, r8w            ; sign-extend the 16-bit difference to 32 bits
+    inc         dword  [r6 + r7]    ; count[classIdx]++
+    add         [r5 + r7], r8d      ; stats[classIdx] += (fenc[x] - rec[x]);
+    dec         r9d                 ; one pixel consumed; NOTE(review): endX == 0 would wrap the counter - confirm callers never pass 0
+    jz          .next               ; row finished, possibly in the middle of a 16-pixel chunk
+%assign x x+1
+%endrep
+
+    add         r10, 16             ; advance fenc to the next 16 pixels
+    add         r11, 16             ; advance rec to the next 16 pixels
+    jmp         .loopL
+
+.next:
+    add         r0, r2              ; fenc += stride
+    add         r1, r2              ; rec += stride
+    dec         r4d                 ; one row done; NOTE(review): endY == 0 would wrap as well - confirm with callers
+    jnz         .loopH
+    RET
+%endif
diff -r 523540864864 -r e0166f09f332 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Mon Jul 06 14:12:55 2015 +0530
+++ b/source/common/x86/loopfilter.h	Tue Jul 07 11:14:35 2015 +0530
@@ -35,6 +35,7 @@
     void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
+    void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 523540864864 -r e0166f09f332 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Jul 06 14:12:55 2015 +0530
+++ b/source/test/pixelharness.cpp	Tue Jul 07 11:14:35 2015 +0530
@@ -1017,6 +1017,42 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt) // true when opt matches ref on every iteration
+{
+    enum { NUM_EDGETYPE = 33 }; // classIdx = 1 + (rec[x] >> 3);
+    int32_t stats_ref[NUM_EDGETYPE]; // stats table updated by the reference primitive
+    int32_t stats_vec[NUM_EDGETYPE]; // stats table updated by the optimized primitive
+
+    int32_t count_ref[NUM_EDGETYPE]; // count table updated by the reference primitive
+    int32_t count_vec[NUM_EDGETYPE]; // count table updated by the optimized primitive
+
+    int j = 0; // offset into pbuf2, advanced by INCR after every iteration
+    for (int i = 0; i < ITERS; i++)
+    {
+        // seed both table pairs with identical random values; the dynamic range is unrealistic but good enough to verify the asm code
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        intptr_t stride = 16 * (rand() % 4 + 1); // 16, 32, 48 or 64
+        int endX = MAX_CU_SIZE - (rand() % 5); // MAX_CU_SIZE - 4 .. MAX_CU_SIZE
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1; // MAX_CU_SIZE - 4 .. MAX_CU_SIZE - 1
+
+        ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref); // reference run
+        checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec); // optimized run on the same inputs, via checked()
+
+        if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref))) // both tables must match byte for byte
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
 {
     enum { NUM_EDGETYPE = 5 };
@@ -2094,6 +2130,15 @@
         }
     }
 
+    if (opt.saoCuStatsBO)
+    {
+        if (!check_saoCuStatsBO_t(ref.saoCuStatsBO, opt.saoCuStatsBO))
+        {
+            printf("saoCuStatsBO failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuStatsE2)
     {
         if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2526,6 +2571,13 @@
         REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
     }
 
+    if (opt.saoCuStatsBO)
+    {
+        int32_t stats[33], count[33];
+        HEADER0("saoCuStatsBO");
+        REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
+    }
+
     if (opt.saoCuStatsE2)
     {
         int32_t stats[5], count[5];
diff -r 523540864864 -r e0166f09f332 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Mon Jul 06 14:12:55 2015 +0530
+++ b/source/test/pixelharness.h	Tue Jul 07 11:14:35 2015 +0530
@@ -100,6 +100,7 @@
     bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
+    bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
     bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
     bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);


More information about the x265-devel mailing list