[x265] [PATCH 3 of 5] asm: AVX2 version of saoCuStatsE1, (131370c -> 41189c)

Min Chen chenm003 at 163.com
Wed Dec 9 23:27:11 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1449698983 21600
# Node ID 7ad2050bc2aaa8083b4e2de14d5846e5074b7b73
# Parent  2073ed3429fe81af14b46aca6a14e0b34405f615
asm: AVX2 version of saoCuStatsE1, (131370c -> 41189c)
---
 source/common/x86/asm-primitives.cpp |    1 +
 source/common/x86/loopfilter.asm     |  184 +++++++++++++++++++++++++++++++++-
 source/encoder/sao.cpp               |    2 +
 3 files changed, 184 insertions(+), 3 deletions(-)

diff -r 2073ed3429fe -r 7ad2050bc2aa source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Dec 09 16:09:40 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp	Wed Dec 09 16:09:43 2015 -0600
@@ -3637,6 +3637,7 @@
         p.frameInitLowres = PFX(frame_init_lowres_core_avx2);
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
         p.saoCuStatsE0 = PFX(saoCuStatsE0_avx2);
+        p.saoCuStatsE1 = PFX(saoCuStatsE1_avx2);
 
         if (cpuMask & X265_CPU_BMI2)
         {
diff -r 2073ed3429fe -r 7ad2050bc2aa source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Wed Dec 09 16:09:40 2015 -0600
+++ b/source/common/x86/loopfilter.asm	Wed Dec 09 16:09:43 2015 -0600
@@ -2238,7 +2238,7 @@
     pmaddwd     m4, m5, m2
     paddd       m15, m4
 
-    sub         r5d, 16
+    sub         r5d, r7d
     jle        .next
 
     add         r0, 16*2
@@ -2299,7 +2299,7 @@
     phaddd      xm3, xm1
     phaddd      xm2, xm4
     phaddd      xm3, xm2
-    psubd       xm3, xm0, xm3               ; negtive to compensate PMADDWD sign algorithm problem
+    psubd       xm3, xm0, xm3               ; negtive for compensate PMADDWD sign algorithm problem
 
     ; sum stats[4] only
     HADDD       xm5, xm6
@@ -2321,7 +2321,6 @@
 cglobal saoCuStatsE1, 4,12,8,0-32    ; Stack: 5 of stats and 5 of count
     mov         r5d, r5m
     mov         r4d, r4m
-    mov         r11d, r5d
 
     ; clear internal temporary buffer
     pxor        m0, m0
@@ -2412,6 +2411,185 @@
     mov         r6d, [rsp + 5 * 2 + 4 * 4]
     add         [r1 + 4 * 4], r6d
     RET
+
+
+INIT_YMM avx2
+cglobal saoCuStatsE1, 4,13,16       ; count[0-4] accumulate in xm6-xm10, stats[0-4] in m11-m15 (no stack buffer)
+    mov         r5d, r5m                            ; r5d = rows left (decremented once per .loopH pass)
+    mov         r4d, r4m                            ; r4d = columns per row (reloaded into r6d for every row)
+
+    ; clear the register accumulators (per-lane byte counters and dword partial sums)
+    pxor        xm6, xm6                            ; count[0]
+    pxor        xm7, xm7                            ; count[1]
+    pxor        xm8, xm8                            ; count[2]
+    pxor        xm9, xm9                            ; count[3]
+    pxor        xm10, xm10                          ; count[4]
+    pxor        xm11, xm11                          ; stats[0]
+    pxor        xm12, xm12                          ; stats[1]
+    pxor        xm13, xm13                          ; stats[2]
+    pxor        xm14, xm14                          ; stats[3]
+    pxor        xm15, xm15                          ; stats[4]
+    mova        m0, [pb_128]                        ; XORed into both pixel rows before pcmpgtb; also the sign source for psignb
+    mova        m5, [pb_1]                          ; turns the "greater" mask into +1; also the edgeType 1 compare value
+
+    ; save unavailable bound pixel
+    push  qword [r3 + r4]                           ; 8 bytes at upBuff1 + width; the 16-byte stores in .loopW may overwrite them
+
+    ; unavailable mask
+    lea         r12, [pb_movemask_32 + 32]          ; the lane mask is loaded from r12 - min(columns left, 16)
+
+.loopH:
+    mov         r6d, r4d                            ; r6d = columns left in this row
+    mov         r9, r0                              ; r9  = walking pointer into the word-sized difference row
+    mov         r10, r1                             ; r10 = walking pointer into the current pixel row
+    mov         r11, r3                             ; r11 = walking pointer into upBuff1 (restarted every row)
+
+.loopW:
+    movu        xm1, [r10]                          ; 16 pixels of the current row
+    movu        xm2, [r10 + r2]                     ; 16 pixels one stride (r2) below
+
+    ; signDown = sign(current - below), one signed byte per pixel
+    pxor        xm1, xm0
+    pxor        xm2, xm0
+    pcmpgtb     xm3, xm1, xm2                       ; 0xFF where current > below
+    pcmpgtb     xm2, xm1                            ; 0xFF (-1) where below > current
+    pand        xm3, xm5                            ; presumably +1 where current > below (assumes pb_1 is all 0x01 bytes)
+    por         xm2, xm3                            ; xm2 = signDown
+    psignb      xm3, xm2, xm0                       ; -signDown
+
+    ; edgeType
+    movu        xm4, [r11]                          ; upBuff1[x]
+    paddb       xm4, [pb_2]
+    paddb       xm2, xm4                            ; xm2 = signDown + upBuff1[x] + 2
+
+    ; update upBuff1 (must be delayed: this store overwrites [r11], which the edgeType code above reads)
+    movu        [r11], xm3
+
+    ; m1, m3 and m4 are free here (m2 still holds edgeType)
+
+    ; get current process mask
+    mov         r7d, 16
+    mov         r8d, r6d
+    cmp         r6d, r7d
+    cmovge      r8d, r7d                            ; r8d = min(columns left, 16)
+    neg         r8
+    movu        xm1, [r12 + r8]                     ; lane mask for this iteration (table contents are defined outside this block)
+
+    ; tmp_count[edgeType]++
+    ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+    pxor        xm3, xm3                            ; zero = edgeType 0 compare value
+    por         xm1, xm2                            ; apply unavailable pixel mask
+    movu        m4, [r9]                            ; 16 word-sized differences, up to 14bits
+
+    pcmpeqb     xm3, xm1, xm3                       ; 0xFF (-1) in lanes whose edgeType == 0
+    psubb       xm6, xm3                            ; subtracting -1 increments that lane's byte counter
+    pmovsxbw    m2, xm3                             ; widen the mask to words: 0 or -1
+    pmaddwd     m3, m4, m2                          ; pairwise sums of diff * (0 or -1), i.e. negated differences
+    paddd       m11, m3
+
+    pcmpeqb     xm3, xm1, xm5                       ; edgeType == 1
+    psubb       xm7, xm3
+    pmovsxbw    m2, xm3
+    pmaddwd     m3, m4, m2
+    paddd       m12, m3
+
+    pcmpeqb     xm3, xm1, [pb_2]                    ; edgeType == 2
+    psubb       xm8, xm3
+    pmovsxbw    m2, xm3
+    pmaddwd     m3, m4, m2
+    paddd       m13, m3
+
+    pcmpeqb     xm3, xm1, [pb_3]                    ; edgeType == 3
+    psubb       xm9, xm3
+    pmovsxbw    m2, xm3
+    pmaddwd     m3, m4, m2
+    paddd       m14, m3
+
+    pcmpeqb     xm3, xm1, [pb_4]                    ; edgeType == 4
+    psubb       xm10, xm3
+    pmovsxbw    m2, xm3
+    pmaddwd     m3, m4, m2
+    paddd       m15, m3
+
+    sub         r6d, r7d                            ; 16 columns consumed
+    jle        .next                                ; row finished once nothing is left
+
+    add         r9, 16*2                            ; 16 differences of 2 bytes each
+    add         r10, 16
+    add         r11, 16
+    jmp        .loopW
+
+.next:
+    ; advance to the next row (the upBuff1 pointer r11 is reloaded from r3 at .loopH)
+    add         r0, 64*2                            ; MAX_CU_SIZE
+    add         r1, r2                              ; next pixel row
+
+    dec         r5d
+    jg         .loopH
+
+    ; restore unavailable pixels
+    pop   qword [r3 + r4]                           ; undo any overwrite of the 8 bytes saved before the loops
+
+    ; sum to global buffer
+    mov         r1, r6m                             ; r1 = stats output: 4 dwords updated by vector, 5th at [r1 + 4*4]
+    mov         r0, r7m                             ; r0 = count output: same layout
+
+    ; sum into word
+    ; WARNING: the byte counters overflow for a 64x64 block whose pixels ALL have the SAME type (HM algorithm never passes a 64x64 block in here)
+    pxor        xm0, xm0                            ; xm0 stays zero for the psadbw and psubd uses below
+    psadbw      xm1, xm6, xm0                       ; sum of the 8 byte counters in each 64-bit half
+    psadbw      xm2, xm7, xm0
+    psadbw      xm3, xm8, xm0
+    psadbw      xm4, xm9, xm0
+    psadbw      xm5, xm10, xm0
+    pshufd      xm1, xm1, q3120                     ; move both half-sums into the low qword as two dwords
+    pshufd      xm2, xm2, q3120
+    pshufd      xm3, xm3, q3120
+    pshufd      xm4, xm4, q3120
+
+    ; sum count[4] only
+    movhlps     xm6, xm5                            ; bring the high-half sum down (xm6 is free now)
+    paddd       xm5, xm6
+
+    ; sum count[s_eoTable]
+    ; s_eoTable = {1, 2, 0, 3, 4}
+    punpcklqdq  xm3, xm1                            ; half-sums of type 2 | type 0
+    punpcklqdq  xm2, xm4                            ; half-sums of type 1 | type 3
+    phaddd      xm3, xm2                            ; dwords = totals for types 2, 0, 1, 3
+    movu        xm1, [r0]
+    paddd       xm3, xm1
+    movu        [r0], xm3
+    movd        r5d, xm5
+    add         [r0 + 4 * 4], r5d                   ; type 4 total goes to the 5th dword
+
+    ; sum stats[s_eoTable]
+    vextracti128 xm1, m11, 1                        ; fold each 256-bit accumulator: upper lane + lower lane
+    paddd       xm1, xm11
+    vextracti128 xm2, m12, 1
+    paddd       xm2, xm12
+    vextracti128 xm3, m13, 1
+    paddd       xm3, xm13
+    vextracti128 xm4, m14, 1
+    paddd       xm4, xm14
+    vextracti128 xm5, m15, 1
+    paddd       xm5, xm15
+
+    ; s_eoTable = {1, 2, 0, 3, 4}
+    phaddd      xm3, xm1                            ; pair sums of type 2 | type 0
+    phaddd      xm2, xm4                            ; pair sums of type 1 | type 3
+    phaddd      xm3, xm2                            ; dwords = sums for types 2, 0, 1, 3
+    psubd       xm3, xm0, xm3               ; negate: PMADDWD multiplied by -1 masks, so the sums were accumulated negated
+
+    ; sum stats[4] only
+    HADDD       xm5, xm6
+    psubd       xm5, xm0, xm5                       ; same negation for the type 4 sum
+
+    movu        xm1, [r1]
+    paddd       xm3, xm1
+    movu        [r1], xm3
+    movd        r6d, xm5
+    add         [r1 + 4 * 4], r6d                   ; type 4 sum goes to the 5th dword
+    RET
 %endif ; ARCH_X86_64
 
 %if ARCH_X86_64
diff -r 2073ed3429fe -r 7ad2050bc2aa source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Wed Dec 09 16:09:40 2015 -0600
+++ b/source/encoder/sao.cpp	Wed Dec 09 16:09:43 2015 -0600
@@ -1595,6 +1595,7 @@
     memset(tmp_stats, 0, sizeof(tmp_stats));
     memset(tmp_count, 0, sizeof(tmp_count));
 
+    X265_CHECK(endX * endY <= (4096 - 16), "Assembly of saoE1 may overflow with this block size\n");
     for (y = 0; y < endY; y++)
     {
         for (x = 0; x < endX; x++)
@@ -1604,6 +1605,7 @@
             uint32_t edgeType = signDown + upBuff1[x] + 2;
             upBuff1[x] = (int8_t)(-signDown);
 
+            X265_CHECK(edgeType <= 4, "edgeType check failure\n");
             tmp_stats[edgeType] += diff[x];
             tmp_count[edgeType]++;
         }



More information about the x265-devel mailing list