[x265] [PATCH 2 of 3] asm: sse4 code for saoCuStatsE0, improved 250341c->147284c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jul 7 11:35:36 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1436251628 -19800
#      Tue Jul 07 12:17:08 2015 +0530
# Node ID 235930aae11da04863e3fb13905e2d1d95e3dc0a
# Parent  e0166f09f332af72a83eb059d878044db15f59bd
asm: sse4 code for saoCuStatsE0, improved 250341c->147284c

diff -r e0166f09f332 -r 235930aae11d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 07 12:17:08 2015 +0530
@@ -2498,6 +2498,7 @@
 
 #if X86_64
         p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Tue Jul 07 12:17:08 2015 +0530
@@ -2043,3 +2043,119 @@
     jnz         .loopH
     RET
 %endif
+
+;-----------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;-----------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE0, 6,9,8, 0-32       ; r0=fenc r1=rec r2=stride r3=endX r4=endY r5=stats, count is read from r6m
+    mov         r3d, r3m                ; r3d = endX
+    ; r5 (stats) must stay intact until the final summation, so the column counter lives in r8d
+    ; clear the stack scratch: tmp_count = 5 words at [rsp], tmp_stats = 5 dwords at [rsp + 5 * 2]
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m4, [pb_128]            ; bias for the byte compares, also the psignb mask
+    mova        m5, [hmul_16p + 16]     ; pmaddubsw weights for fenc - rec
+    mova        m6, [pb_2]              ; the "+ 2" of edgeType
+    xor         r7d, r7d
+    ; NOTE(review): constants live outside this hunk; comments assume pb_N = 16 bytes of N, hmul_16p + 16 = +1/-1 pairs
+.loopH:
+    mov         r8d, r3d                ; r8d = columns left in this row
+
+    ; calculate signLeft for column 0 from rec[0] and rec[-1]
+    mov         r7b, [r1]
+    sub         r7b, [r1 - 1]           ; only the flags of rec[0] - rec[-1] are used
+    seta        r7b                     ; 1 if rec[0] > rec[-1] (unsigned)
+    setb        r6b                     ; 1 if rec[0] < rec[-1] (unsigned)
+    sub         r7b, r6b                ; sign(rec[0] - rec[-1])
+    neg         r7b                     ; stored negated, .loopL negates byte 15 of m0 again
+    pinsrb      m0, r7d, 15
+
+.loopL:
+    movu        m7, [r1]                ; rec[x .. x+15]
+    movu        m2, [r1 + 1]            ; rec[x+1 .. x+16]
+
+    pxor        m1, m7, m4              ; bias both inputs so the signed compare orders unsigned pixels
+    pxor        m3, m2, m4
+    pcmpgtb     m2, m1, m3              ; 0xFF where rec[x] > rec[x+1]
+    pcmpgtb     m3, m1                  ; 0xFF (-1) where rec[x] < rec[x+1]
+    pand        m2, [pb_1]              ; +1 where rec[x] > rec[x+1]
+    por         m2, m3              ; signRight
+
+    palignr     m3, m2, m0, 15          ; previous signRight: byte 15 of m0, then bytes 0..14 of m2
+    psignb      m3, m4              ; signLeft
+
+    mova        m0, m2                  ; keep signRight for the next 16 columns
+    paddb       m2, m3
+    paddb       m2, m6              ; edgeType
+
+    ; stats[edgeType]
+    movu        m3, [r0]            ; fenc[0-15]
+    punpckhbw   m1, m3, m7              ; interleave fenc/rec, columns 8..15
+    punpcklbw   m3, m7                  ; interleave fenc/rec, columns 0..7
+    pmaddubsw   m1, m5                  ; one 16-bit result per column
+    pmaddubsw   m3, m5
+
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x              ; r7 = edgeType of column x (zero-extended)
+
+%if (x < 8)
+    pextrw      r6d, m3, (x % 8)
+%else
+    pextrw      r6d, m1, (x % 8)
+%endif
+    movsx       r6d, r6w                ; sign-extend the 16-bit difference
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
+    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+    dec         r8d
+    jz          .next                   ; row finished, r0/r1 are NOT advanced for this block
+%assign x x+1
+%endrep
+
+    add         r0q, 16
+    add         r1q, 16
+    jmp         .loopL
+
+.next:
+    lea         r6d, [r3 - 1]           ; index of the last column in the row
+    and         r6d, -16                ; bytes .loopL stepped r0/r1 by: 16 * ((endX - 1) / 16)
+    ; the last block always exits through jz before its add, also when endX is a multiple of 16
+    neg         r6
+    add         r6, r2                  ; r6 = stride - bytes already stepped
+    add         r0, r6
+    add         r1, r6
+
+    dec         r4d
+    jnz         .loopH
+
+    ; sum to global buffer
+    mov         r1, r5                  ; r1 = stats, r5 is free as a temporary from here on
+    mov         r0, r6m                 ; r0 = count
+
+    ; s_eoTable = {1, 2, 0, 3, 4}
+    movzx       r5d, word [rsp + 0 * 2]
+    add         [r0 + 1 * 4], r5d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r5d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r5d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r5d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r5d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]
+    add         [r1 + 1 * 4], r6d
+    mov         r5d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r5d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r5d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r5d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.h	Tue Jul 07 12:17:08 2015 +0530
@@ -36,6 +36,7 @@
     void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
     void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r e0166f09f332 -r 235930aae11d source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Jul 07 11:14:35 2015 +0530
+++ b/source/test/pixelharness.cpp	Tue Jul 07 12:17:08 2015 +0530
@@ -1053,6 +1053,42 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt)
+{
+    // Feed ref and opt identical random inputs ITERS times; any stats/count mismatch fails the check.
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE], stats_vec[NUM_EDGETYPE];
+    int32_t count_ref[NUM_EDGETYPE], count_vec[NUM_EDGETYPE];
+
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        // Seed both accumulator sets with the same random values. They are outside the real
+        // dynamic range, but that is enough to compare the asm against the reference.
+        for (int e = 0; e < NUM_EDGETYPE; e++)
+        {
+            stats_ref[e] = stats_vec[e] = rand();
+            count_ref[e] = count_vec[e] = rand();
+        }
+
+        intptr_t stride = 16 * (rand() % 4 + 1);
+        int endX = MAX_CU_SIZE - (rand() % 5) - 1;
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        // both pointers start one element past pbuf + offset, so index -1 is still pbuf + offset
+        ref(pbuf2 + offset + 1, pbuf3 + offset + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, pbuf2 + offset + 1, pbuf3 + offset + 1, stride, endX, endY, stats_vec, count_vec);
+
+        bool statsMatch = memcmp(stats_ref, stats_vec, sizeof(stats_ref)) == 0;
+        bool countMatch = memcmp(count_ref, count_vec, sizeof(count_ref)) == 0;
+        if (!(statsMatch && countMatch))
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
 {
     enum { NUM_EDGETYPE = 5 };
@@ -2139,6 +2175,15 @@
         }
     }
 
+    if (opt.saoCuStatsE0)
+    {
+        if (!check_saoCuStatsE0_t(ref.saoCuStatsE0, opt.saoCuStatsE0))
+        {
+            printf("saoCuStatsE0 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuStatsE2)
     {
         if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2578,6 +2623,13 @@
         REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
     }
 
+    if (opt.saoCuStatsE0)
+    {
+        int32_t stats[5], count[5];     // one slot per edge class, the kernels only touch indices 0..4
+        HEADER0("saoCuStatsE0");
+        REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2 + 1, pbuf3 + 1, 64, 60, 61, stats, count);   // + 1: rec[-1] is read
+    }
+
     if (opt.saoCuStatsE2)
     {
         int32_t stats[5], count[5];
diff -r e0166f09f332 -r 235930aae11d source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Jul 07 11:14:35 2015 +0530
+++ b/source/test/pixelharness.h	Tue Jul 07 12:17:08 2015 +0530
@@ -101,6 +101,7 @@
     bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
+    bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
     bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
     bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);


More information about the x265-devel mailing list