[x265] [PATCH 3 of 3] asm: sse4 code for saoCuStatsE1, improved 320369c->151086c

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Tue Jul 7 11:35:37 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1436252372 -19800
#      Tue Jul 07 12:29:32 2015 +0530
# Node ID 25a8323b886f480347f4b0813f7ded18e579704a
# Parent  235930aae11da04863e3fb13905e2d1d95e3dc0a
asm: sse4 code for saoCuStatsE1, improved 320369c->151086c

diff -r 235930aae11d -r 25a8323b886f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jul 07 12:29:32 2015 +0530
@@ -2499,6 +2499,7 @@
 #if X86_64
         p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
         p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
         p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
 
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Tue Jul 07 12:29:32 2015 +0530
@@ -2159,3 +2159,122 @@
     add         [r1 + 4 * 4], r6d
     RET
 %endif
+
+;-------------------------------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+;-------------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE1, 4,11,9,0-32    ; Stack: 5 of stats and 5 of count
+    mov         r4d, r4m
+    mov         r5d, r5m
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0
+    mova        [rsp + mmsize], m0
+    mova        m0, [pb_128]
+    mova        m5, [pb_1]
+    mova        m6, [pb_2]
+    mova        m8, [hmul_16p + 16]
+    movh        m7, [r3 + r4]
+
+.loopH:
+    mov         r6d, r4d
+    mov         r9, r0
+    mov         r10, r1
+    mov         r5, r3
+
+.loopW:
+    movu        m1, [r10]
+    movu        m2, [r10 + r2]
+
+    ; signDown
+    pxor        m1, m0
+    pxor        m2, m0
+    pcmpgtb     m3, m1, m2
+    pand        m3, m5
+    pcmpgtb     m2, m1
+    por         m2, m3
+    pxor        m3, m3
+    psubb       m3, m2      ; -signDown
+
+    ; edgeType
+    movu        m4, [r5]
+    paddb       m4, m6
+    paddb       m2, m4
+
+    ; update upBuff1
+    movu        [r5], m3
+
+    ; stats[edgeType]
+    pxor        m1, m0
+    movu        m3, [r9]
+    punpckhbw   m4, m3, m1
+    punpcklbw   m3, m1
+    pmaddubsw   m3, m8
+    pmaddubsw   m4, m8
+
+    ; 16 pixels
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+    inc         word [rsp + r7 * 2]
+
+  %if (x < 8)
+    pextrw      r8d, m3, (x % 8)
+  %else
+    pextrw      r8d, m4, (x % 8)
+  %endif
+    movsx       r8d, r8w
+    add         [rsp + 5 * 2 + r7 * 4], r8d
+
+    dec         r6d
+    jz         .next
+%assign x x+1
+%endrep
+
+    add         r9, 16
+    add         r10, 16
+    add         r5, 16
+    jmp         .loopW
+
+.next:
+    ; restore pointer upBuff1
+    add         r0, r2
+    add         r1, r2
+
+    dec         byte r5m
+    jg         .loopH
+
+    ; restore unavailable pixels
+    movh        [r3 + r4], m7
+
+    ; sum to global buffer
+    mov         r1, r6m
+    mov         r0, r7m
+
+    ; s_eoTable = {1,2,0,3,4}
+    movzx       r6d, word [rsp + 0 * 2]
+    add         [r0 + 1 * 4], r6d
+    movzx       r6d, word [rsp + 1 * 2]
+    add         [r0 + 2 * 4], r6d
+    movzx       r6d, word [rsp + 2 * 2]
+    add         [r0 + 0 * 4], r6d
+    movzx       r6d, word [rsp + 3 * 2]
+    add         [r0 + 3 * 4], r6d
+    movzx       r6d, word [rsp + 4 * 2]
+    add         [r0 + 4 * 4], r6d
+
+    mov         r6d, [rsp + 5 * 2 + 0 * 4]
+    add         [r1 + 1 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 1 * 4]
+    add         [r1 + 2 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 2 * 4]
+    add         [r1 + 0 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 3 * 4]
+    add         [r1 + 3 * 4], r6d
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+%endif ; ARCH_X86_64
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.h	Tue Jul 07 12:29:32 2015 +0530
@@ -37,6 +37,7 @@
     void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
     void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+    void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
     void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 235930aae11d -r 25a8323b886f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Tue Jul 07 12:17:08 2015 +0530
+++ b/source/test/pixelharness.cpp	Tue Jul 07 12:29:32 2015 +0530
@@ -1089,6 +1089,52 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt)
+{
+    enum { NUM_EDGETYPE = 5 };
+    int32_t stats_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE];
+
+    int32_t count_ref[NUM_EDGETYPE];
+    int32_t count_vec[NUM_EDGETYPE];
+
+    int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1;
+    int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1;
+
+    int j = 0;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // initialize input data to random, the dynamic range wrong but good to verify our asm code
+        for (int x = 0; x < NUM_EDGETYPE; x++)
+        {
+            stats_ref[x] = stats_vec[x] = rand();
+            count_ref[x] = count_vec[x] = rand();
+        }
+
+        // initial sign
+        for (int x = 0; x < MAX_CU_SIZE + 2; x++)
+            _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1;
+
+        intptr_t stride = 16 * (rand() % 4 + 1);
+        int endX = MAX_CU_SIZE - (rand() % 5);
+        int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+        checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+
+        if (   memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
+            || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
+            || memcmp(count_ref, count_vec, sizeof(count_ref)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
 bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
 {
     enum { NUM_EDGETYPE = 5 };
@@ -2184,6 +2230,15 @@
         }
     }
 
+    if (opt.saoCuStatsE1)
+    {
+        if (!check_saoCuStatsE1_t(ref.saoCuStatsE1, opt.saoCuStatsE1))
+        {
+            printf("saoCuStatsE1 failed\n");
+            return false;
+        }
+    }
+
     if (opt.saoCuStatsE2)
     {
         if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2630,6 +2685,15 @@
         REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
     }
 
+    if (opt.saoCuStatsE1)
+    {
+        int32_t stats[5], count[5];
+        int8_t upBuff1[MAX_CU_SIZE + 2];
+        memset(upBuff1, 1, sizeof(upBuff1));
+        HEADER0("saoCuStatsE1");
+        REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, pbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
+    }
+
     if (opt.saoCuStatsE2)
     {
         int32_t stats[5], count[5];
diff -r 235930aae11d -r 25a8323b886f source/test/pixelharness.h
--- a/source/test/pixelharness.h	Tue Jul 07 12:17:08 2015 +0530
+++ b/source/test/pixelharness.h	Tue Jul 07 12:29:32 2015 +0530
@@ -102,6 +102,7 @@
     bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
     bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
+    bool check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt);
     bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
     bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);


More information about the x265-devel mailing list