[x265] [PATCH] asm: high_bit_depth sse4 version of saoCuStatsE0 & saoCuStatsE1

Min Chen chenm003 at 163.com
Wed Feb 17 09:22:44 CET 2016


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1455685530 -28800
# Node ID 5f1cce3482b5cd0ab0367b8dcab6198c0e362f37
# Parent  07986e2a495a915d3ffe86fae29298b46724b5fa
asm: high_bit_depth sse4 version of saoCuStatsE0 & saoCuStatsE1

diff -r 07986e2a495a -r 5f1cce3482b5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 17 12:39:42 2016 +0800
+++ b/source/common/x86/asm-primitives.cpp	Wed Feb 17 13:05:30 2016 +0800
@@ -1166,6 +1166,10 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].p2s = PFX(filterPixelToShort_2x16_sse4);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
         p.costCoeffRemain = PFX(costCoeffRemain_sse4);
+#if X86_64
+        p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
+#endif
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 07986e2a495a -r 5f1cce3482b5 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Feb 17 12:39:42 2016 +0800
+++ b/source/common/x86/loopfilter.asm	Wed Feb 17 13:05:30 2016 +0800
@@ -154,7 +154,9 @@
     sub         r4d, 16
     jnz        .loopH
     RET
-%else ; HIGH_BIT_DEPTH
+
+%else ; HIGH_BIT_DEPTH == 1
+
 cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
 
     mov         r4d, r4m
@@ -240,7 +242,7 @@
     sub         r4d, 16
     jnz        .loopH
     RET
-%endif
+%endif ; HIGH_BIT_DEPTH == 0
 
 INIT_YMM avx2
 %if HIGH_BIT_DEPTH
@@ -2061,8 +2063,10 @@
 ; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
 ;-----------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
+
+%if HIGH_BIT_DEPTH == 1
 INIT_XMM sse4
-cglobal saoCuStatsE0, 3,10,6, 0-32
+cglobal saoCuStatsE0, 3,10,8, 0-32
     mov         r3d, r3m
     mov         r4d, r4m
     mov         r9, r5mp
@@ -2071,7 +2075,7 @@
     pxor        m0, m0
     mova        [rsp], m0
     mova        [rsp + mmsize], m0
-    mova        m4, [pb_128]
+    mova        m4, [pw_1]
     mova        m5, [pb_2]
     xor         r7d, r7d
 
@@ -2081,12 +2085,14 @@
     sub         r2, r6
     lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
 
+    FIX_STRIDES r2
+
 .loopH:
     mov         r5d, r3d
 
     ; calculate signLeft
-    mov         r7b, [r1]
-    sub         r7b, [r1 - 1]
+    mov         r7w, [r1]
+    sub         r7w, [r1 - SIZEOF_PIXEL]
     seta        r7b
     setb        r6b
     sub         r7b, r6b
@@ -2094,21 +2100,30 @@
     pinsrb      m0, r7d, 15
 
 .loopL:
+
     movu        m3, [r1]
-    movu        m2, [r1 + 1]
-
-    pxor        m1, m3, m4
-    pxor        m2, m4
-    pcmpgtb     m3, m1, m2
-    pcmpgtb     m2, m1
-    pand        m3, [pb_1]
-    por         m2, m3                          ; signRight
+    movu        m2, [r1 + SIZEOF_PIXEL]
+    pcmpgtw     m6, m3, m2
+    pcmpgtw     m2, m3
+    pand        m6, m4
+    por         m2, m6
+
+    movu        m3, [r1 + mmsize]
+    movu        m6, [r1 + mmsize + SIZEOF_PIXEL]
+    pcmpgtw     m7, m3, m6
+    pcmpgtw     m6, m3
+    pand        m7, m4
+    por         m7, m6
+
+    packsswb    m2, m7                          ; signRight
 
     palignr     m3, m2, m0, 15
-    psignb      m3, m4                          ; signLeft
+
+    pxor        m6, m6
+    psubb       m6, m3                          ; signLeft
 
     mova        m0, m2
-    paddb       m2, m3
+    paddb       m2, m6
     paddb       m2, m5                          ; edgeType
 
     ; stats[edgeType]
@@ -2125,7 +2140,7 @@
 %endrep
 
     add         r0, 16*2
-    add         r1, 16
+    add         r1, 16 * SIZEOF_PIXEL
     jmp        .loopL
 
 .next:
@@ -2155,6 +2170,106 @@
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
     add         [r9 + 4 * 4], r6d
     RET
+%endif ; HIGH_BIT_DEPTH=1
+
+
+%if HIGH_BIT_DEPTH == 0
+INIT_XMM sse4                                  ; 8-bit build: gather SAO edge-offset class 0 (horizontal) statistics
+cglobal saoCuStatsE0, 3,10,6, 0-32             ; r0 = diff, r1 = rec, r2 = stride; 32 bytes of stack scratch
+    mov         r3d, r3m                        ; r3d = endX (pixels handled per row)
+    mov         r4d, r4m                        ; r4d = endY (rows handled)
+    mov         r9, r5mp                        ; r9  = stats (6th argument)
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0                       ; word counters live at [rsp + idx * 2]
+    mova        [rsp + mmsize], m0              ; dword sums live at [rsp + 5 * 2 + idx * 4]
+    mova        m4, [pb_128]                    ; presumably 0x80 per byte: bias for signed compare, negative operand for psignb
+    mova        m5, [pb_2]                      ; presumably 2 per byte: bias that makes edgeType a non-negative index
+    xor         r7d, r7d                        ; upper bits must be zero, r7 is used as an index after byte writes
+
+    ; correct stride for diff[] and rec
+    lea         r6d, [r3 - 1]                   ; .loopL leaves before its pointer bump on the last pixel, so only (endX - 1) / 16 groups are stepped over
+    and         r6d, ~15                        ; r6 = pixels skipped by full .loopL passes (exact even when endX is a multiple of 16)
+    sub         r2, r6                          ; r2 = remaining byte step to the next rec row
+    lea         r8, [(r6 - 64) * 2]             ; 64 = MAX_CU_SIZE
+
+.loopH:
+    mov         r5d, r3d                        ; r5d = pixels left in this row
+
+    ; calculate signLeft
+    mov         r7b, [r1]
+    sub         r7b, [r1 - SIZEOF_PIXEL]        ; flags of rec[0] - rec[-1] (unsigned)
+    seta        r7b                             ; 1 when rec[0] > rec[-1]
+    setb        r6b                             ; 1 when rec[0] < rec[-1]
+    sub         r7b, r6b                        ; r7b = sign(rec[0] - rec[-1])
+    neg         r7b                             ; stored negated, psignb below flips it back
+    pinsrb      m0, r7d, 15                     ; seed byte 15 so palignr feeds it to pixel 0
+
+.loopL:
+    movu        m3, [r1]                        ; rec[x .. x+15]
+    movu        m2, [r1 + SIZEOF_PIXEL]         ; rec[x+1 .. x+16]
+
+    pxor        m1, m3, m4                      ; flip the top bit so pcmpgtb orders the bytes as unsigned
+    pxor        m2, m4
+    pcmpgtb     m3, m1, m2                      ; -1 where rec[x] > rec[x+1]
+    pcmpgtb     m2, m1                          ; -1 where rec[x+1] > rec[x]
+    pand        m3, [pb_1]                      ; presumably masks the -1 down to +1
+
+    por         m2, m3                          ; signRight
+
+    palignr     m3, m2, m0, 15                  ; previous signRight shifted in by one pixel
+    psignb      m3, m4                          ; signLeft
+
+    mova        m0, m2                          ; carry signRight into the next 16-pixel group
+    paddb       m2, m3
+    paddb       m2, m5                          ; edgeType
+
+    ; stats[edgeType]
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+
+    movsx       r6d, word [r0 + x * 2]
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
+    add         [rsp + 5 * 2 + r7 * 4], r6d     ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+    dec         r5d
+    jz         .next
+%assign x x+1
+%endrep
+
+    add         r0, 16*2
+    add         r1, 16 * SIZEOF_PIXEL
+    jmp        .loopL
+
+.next:
+    sub         r0, r8                          ; diff += 64 * 2 bytes minus what .loopL already added
+    add         r1, r2                          ; rec  += stride minus what .loopL already added
+
+    dec         r4d
+    jnz        .loopH
+
+    ; sum to global buffer
+    mov         r0, r6mp                        ; r0 = count (7th argument)
+
+    ; s_eoTable = {1, 2, 0, 3, 4}
+    pmovzxwd    m0, [rsp + 0 * 2]               ; widen the first four word counters
+    pshufd      m0, m0, q3102                   ; reorder the four entries before accumulating
+    movu        m1, [r0]
+    paddd       m0, m1
+    movu        [r0], m0
+    movzx       r5d, word [rsp + 4 * 2]         ; fifth counter is added at its own index
+    add         [r0 + 4 * 4], r5d
+
+    movu        m0, [rsp + 5 * 2 + 0 * 4]       ; first four dword sums
+    pshufd      m0, m0, q3102
+    movu        m1, [r9]
+    paddd       m0, m1
+    movu        [r9], m0
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r9 + 4 * 4], r6d
+    RET
+%endif ; HIGH_BIT_DEPTH=0
 
 
 ;-----------------------------------------------------------------------------------------------------------------------
@@ -2341,6 +2456,112 @@
 ; saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
 ;-------------------------------------------------------------------------------------------------------------------------------------------
 %if ARCH_X86_64
+
+%if HIGH_BIT_DEPTH
+INIT_XMM sse4                                  ; HIGH_BIT_DEPTH build: gather SAO edge-offset class 1 (vertical) statistics
+cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
+    mov         r5d, r5m                        ; r5d = endY (rows handled)
+    mov         r4d, r4m                        ; r4d = endX (pixels handled per row)
+
+    ; clear internal temporary buffer
+    pxor        m0, m0
+    mova        [rsp], m0                       ; word counters live at [rsp + idx * 2]
+    mova        [rsp + mmsize], m0              ; dword sums live at [rsp + 5 * 2 + idx * 4]
+    mova        m5, [pw_1]                      ; presumably 1 per word: masks a compare result down to +1
+    mova        m6, [pb_2]                      ; presumably 2 per byte: bias that makes edgeType a non-negative index
+    movh        m7, [r3 + r4]                   ; save 8 bytes of upBuff1 at endX, the 16-byte stores below may overwrite them
+
+    FIX_STRIDES r2                              ; stride is intptr_t and is added to pointers: scale the full 64-bit register
+
+.loopH:
+    mov         r6d, r4d                        ; r6d = pixels left in this row
+    mov         r9, r0                          ; r9  = diff cursor for this row
+    mov         r10, r1                         ; r10 = rec cursor for this row
+    mov         r11, r3                         ; r11 = upBuff1 cursor for this row
+
+.loopW:
+    ; signDown
+    movu        m1, [r10]                       ; first 8 pixels of the current row
+    movu        m2, [r10 + r2]                  ; the 8 pixels one row below
+    pcmpgtw     m3, m1, m2                      ; -1 where rec > below
+    pcmpgtw     m2, m1                          ; -1 where below > rec
+    pand        m3, m5
+    por         m2, m3                          ; words of sign(rec - below)
+
+    movu        m3, [r10 + mmsize]              ; second 8 pixels, same steps
+    movu        m4, [r10 + mmsize + r2]
+    pcmpgtw     m0, m3, m4
+    pcmpgtw     m4, m3
+    pand        m0, m5
+    por         m4, m0
+    packsswb    m2, m4                          ; 16 signDown bytes
+
+    pxor        m3, m3
+    psubb       m3, m2                          ; -signDown
+
+    ; edgeType
+    movu        m4, [r11]                       ; upBuff1 bytes for these 16 pixels
+    paddb       m4, m6
+    paddb       m2, m4                          ; edgeType = signDown + upBuff1 + 2
+
+    ; update upBuff1
+    movu        [r11], m3                       ; the next row reads -signDown from upBuff1
+
+    ; 16 pixels
+%assign x 0
+%rep 16
+    pextrb      r7d, m2, x
+    inc         word [rsp + r7 * 2]             ; tmp_count[edgeType]++
+
+    ; stats[edgeType]
+    movsx       r8d, word [r9 + x * 2]
+    add         [rsp + 5 * 2 + r7 * 4], r8d     ; tmp_stats[edgeType] += diff[x]
+
+    dec         r6d
+    jz         .next
+%assign x x+1
+%endrep
+
+    add         r9, mmsize * 2
+    add         r10, mmsize * SIZEOF_PIXEL
+    add         r11, mmsize
+    jmp        .loopW
+
+.next:
+    ; advance diff and rec to the next row
+    add         r0, 64*2                        ; MAX_CU_SIZE
+    add         r1, r2
+
+    dec         r5d
+    jg         .loopH
+
+    ; restore unavailable pixels
+    movh        [r3 + r4], m7
+
+    ; sum to global buffer
+    mov         r1, r6m                         ; r1 = stats (7th argument)
+    mov         r0, r7m                         ; r0 = count (8th argument)
+
+    ; s_eoTable = {1,2,0,3,4}
+    pmovzxwd    m0, [rsp + 0 * 2]               ; widen the first four word counters
+    pshufd      m0, m0, q3102                   ; reorder the four entries before accumulating
+    movu        m1, [r0]
+    paddd       m0, m1
+    movu        [r0], m0
+    movzx       r5d, word [rsp + 4 * 2]         ; fifth counter is added at its own index
+    add         [r0 + 4 * 4], r5d
+
+    movu        m0, [rsp + 5 * 2 + 0 * 4]       ; first four dword sums
+    pshufd      m0, m0, q3102
+    movu        m1, [r1]
+    paddd       m0, m1
+    movu        [r1], m0
+    mov         r6d, [rsp + 5 * 2 + 4 * 4]
+    add         [r1 + 4 * 4], r6d
+    RET
+
+%else ; HIGH_BIT_DEPTH == 1
+
 INIT_XMM sse4
 cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
     mov         r5d, r5m
@@ -2435,6 +2656,7 @@
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
     add         [r1 + 4 * 4], r6d
     RET
+%endif ; HIGH_BIT_DEPTH == 0
 
 
 INIT_YMM avx2



More information about the x265-devel mailing list