[x265] [PATCH] asm: AVX2 version costCoeffNxN, 1367c -> 1186c

Fri Nov 20 23:03:27 CET 2015

# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1448057004 21600
# Node ID 178fe732b4b15fb419ce71ad44fdaa97ed5b0d45
# Parent  c303685732c784a11665218ca24643b769638511
asm: AVX2 version costCoeffNxN, 1367c -> 1186c
---
 source/common/x86/asm-primitives.cpp |    6 +
 source/common/x86/const-a.asm        |    1 +
 source/common/x86/pixel-util.h       |    1 +
 source/common/x86/pixel-util8.asm    |  173 ++++++++++++++++++++++++++++++++++
 source/encoder/entropy.cpp           |    2 +-
 source/test/pixelharness.cpp         |   22 ++++-
 6 files changed, 199 insertions(+), 6 deletions(-)

diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/asm-primitives.cpp

--- a/source/common/x86/asm-primitives.cpp	Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/asm-primitives.cpp	Fri Nov 20 16:03:24 2015 -0600
@@ -2156,7 +2156,10 @@
         p.pu[LUMA_4x4].luma_hvpp = interp_8tap_hv_pp_cpu<LUMA_4x4>;             // ALL_LUMA_PU_T has declared all sizes except 4x4, hence calling luma_hvpp[4x4] 
 
         if (cpuMask & X265_CPU_BMI2)
+        {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
+            p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
+        }
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -3653,7 +3656,10 @@
         p.propagateCost = PFX(mbtree_propagate_cost_avx2);
 
         if (cpuMask & X265_CPU_BMI2)
+        {
             p.scanPosLast = PFX(scanPosLast_avx2_bmi2);
+            p.costCoeffNxN = PFX(costCoeffNxN_avx2_bmi2);
+        }
         p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
         p.chroma[X265_CSP_I420].cu[CHROMA_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_avx2);
         p.chroma[X265_CSP_I422].cu[CHROMA_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_avx2);
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/const-a.asm	Fri Nov 20 16:03:24 2015 -0600
@@ -114,6 +114,7 @@
                             times  7 dw 0xff
 const hmul_16p,             times 16 db   1
                             times  8 db   1,  -1
+const pw_exp2_0_15,                  dw 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768
 
 
 ;; 32-bit constants
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/pixel-util.h	Fri Nov 20 16:03:24 2015 -0600
@@ -56,5 +56,6 @@
 int PFX(scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize));
 uint32_t PFX(findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]));
 uint32_t PFX(costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
+uint32_t PFX(costCoeffNxN_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase));
 
 #endif // ifndef X265_PIXEL_UTIL_H
diff -r c303685732c7 -r 178fe732b4b1 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Nov 19 17:29:39 2015 -0600
+++ b/source/common/x86/pixel-util8.asm	Fri Nov 20 16:03:24 2015 -0600
@@ -49,6 +49,7 @@
 mask_ff:                times 16 db 0xff
                         times 16 db 0
 deinterleave_shuf:      times  2 db 0, 2, 4, 6, 8, 10, 12, 14, 1, 3, 5, 7, 9, 11, 13, 15
+interleave_shuf:        times  2 db 0, 8, 1, 9, 2, 10, 3, 11, 4, 12, 5, 13, 6, 14, 7, 15
 deinterleave_word_shuf: times  2 db 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15
 hmulw_16p:              times  8 dw 1
                         times  4 dw 1, -1
@@ -78,6 +79,7 @@
 cextern trans8_shuf
 cextern_naked private_prefix %+ _entropyStateBits
 cextern pb_movemask
+cextern pw_exp2_0_15
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -6874,6 +6876,177 @@
 %endif
     and         eax, 0xFFFFFF
     RET
+
+
+; uint32_t costCoeffNxN(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase)
+INIT_YMM avx2,bmi2
+cglobal costCoeffNxN, 6,10,5                        ; x86inc: 6 args loaded into registers, 10 GPRs, 5 vector registers
+    add             r2d, r2d                        ; r2 = trSize * 2 = byte offset between rows of 16-bit coeffs
+
+    ; abs(coeff): load 4 rows of 4 words (8 bytes each) and take the absolute values
+    movq            xm1, [r1]
+    movhps          xm1, [r1 + r2]
+    movq            xm2, [r1 + r2 * 2]
+    lea             r2, [r2 * 3]                    ; r2 = byte offset of row 3
+    movhps          xm2, [r1 + r2]
+    vinserti128     m1, m1, xm2, 1
+    pabsw           m1, m1
+    ; r[1-2] free here
+
+    ; loading tabSigCtx (16 bytes, aligned load)
+    mova            xm2, [r4]
+    ; r[4] free here
+
+    ; WARNING: out-of-bounds read here! 32 bytes are loaded starting at &scan[15 - scanPosSigOff]
+    ; loading scan table
+    mov             r2d, r8m                        ; r2d = scanPosSigOff
+    bzhi            r4d, r5d, r2d                   ; r4d = scanFlagMask with bits [scanPosSigOff..31] cleared
+    mov             r6d, r2d                        ; save scanPosSigOff
+    xor             r2d, 15                         ; r2d = 15 - scanPosSigOff (for scanPosSigOff in 0..15)
+    movu            m0, [r0 + r2 * 2]
+    packuswb        m0, m0
+    pxor            m0, [pb_15]
+    vpermq          m0, m0, q3120
+    add             r4d, r2d                        ; r4d == 0 only if scanPosSigOff == 15 and no sig bit below it is set (numNonZero == 0)
+    mov             r2d, r6d                        ; restore r2d = scanPosSigOff
+
+    ; reorder tabSigCtx (+offset)
+    pshufb          xm2, xm0
+    vpbroadcastb    xm3, r7m                        ; xm3 = low byte of offset replicated to every byte
+    paddb           xm2, xm3
+    ; r[0-1] free here
+
+    ; reorder coeff
+    pshufb          m1, [deinterleave_shuf]
+    vpermq          m1, m1, q3120
+    pshufb          m1, m0
+    vpermq          m1, m1, q3120
+    pshufb          m1, [interleave_shuf]
+    ; r[0-1], m3 free here (m2 still holds the reordered sigCtx used below)
+
+    ; sig mask: one word per coeff, 0xFFFF where the matching scanFlagMask bit is set
+    pxor            xm3, xm3
+    movd            xm4, r5d
+    vpbroadcastw    m4, xm4                         ; low 16 bits of scanFlagMask in every word
+    pandn           m4, m4, [pw_exp2_0_15]          ; word i = ~scanFlagMask & (1 << i)
+    pcmpeqw         m4, m3                          ; word i = 0xFFFF if bit i of scanFlagMask is set, else 0
+
+    ; absCoeff[numNonZero] = tmpCoeff[blkPos]
+    ; [0-3]
+    movq            r0, xm4
+    movq            r1, xm1
+    pext            r6, r1, r0                      ; pack the coeff words selected by the mask into the low end of r6
+    mov       qword [r3], r6                        ; always stores 8 bytes, even when fewer words were selected
+    popcnt          r0, r0
+    shr             r0, 3                           ; set mask bits / 8 = 2 bytes per selected coeff
+    add             r3, r0                          ; advance the absCoeff write pointer
+
+    ; [4-7]
+    pextrq          r0, xm4, 1
+    pextrq          r1, xm1, 1
+    pext            r6, r1, r0
+    mov       qword [r3], r6
+    popcnt          r0, r0
+    shr             r0, 3
+    add             r3, r0
+
+    ; [8-B]
+    vextracti128    xm4, m4, 1
+    movq            r0, xm4
+    vextracti128    xm1, m1, 1
+    movq            r1, xm1
+    pext            r6, r1, r0
+    mov       qword [r3], r6
+    popcnt          r0, r0
+    shr             r0, 3
+    add             r3, r0
+
+    ; [C-F]
+    pextrq          r0, xm4, 1
+    pextrq          r1, xm1, 1
+    pext            r6, r1, r0
+    mov       qword [r3], r6
+    ; r[0-1,3] free here
+
+    ; register mapping
+    ; m2 - sigCtx (low byte = current position, shifted out by one byte per iteration)
+    ; r0 - x265_entropyStateBits
+    ; r1 - baseCtx
+    ; r2 - scanPosSigOff
+    ; r3 - scratch (mps / nextState)
+    ; r4 - zero only if scanPosSigOff was 15 and no sig bit below it was set
+    ; r5 - scanFlagMask
+    ; r6 - sum
+    ; r7 - ctxSig, r8 - sig, r9 - mstate ^ sig
+    lea             r0, [private_prefix %+ _entropyStateBits]
+    mov             r1, r6mp                        ; r1 = baseCtx
+    xor             r6d, r6d                        ; sum = 0
+    xor             r8d, r8d                        ; clear r8 so that setc r8b below leaves a clean 0/1 in r8d
+
+    test            r2d, r2d
+    jz             .idx_zero                        ; scanPosSigOff == 0: skip the loop
+
+;   {
+;        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
+;        ctxSig = cnt & posZeroMask;
+;        const uint32_t mstate = baseCtx[ctxSig];
+;        const uint32_t mps = mstate & 1;
+;        const uint32_t stateBits = x265_entropyStateBits[mstate ^ sig];
+;        uint32_t nextState = (stateBits >> 24) + mps;
+;        if ((mstate ^ sig) == 1)
+;            nextState = sig;
+;        baseCtx[ctxSig] = (uint8_t)nextState;
+;        sum += stateBits;
+;    }
+;    absCoeff[numNonZero] = tmpCoeff[blkPos];
+;    numNonZero += sig;
+;    scanPosSigOff--;
+.loop:
+    shr             r5d, 1                          ; CF = lowest remaining bit of scanFlagMask
+    setc            r8b                             ; r8 = sig
+    movd            r7d, xm2                        ; r7 = ctxSig
+    movzx           r7d, r7b
+    psrldq          xm2, 1                          ; move the next position's ctxSig into the low byte
+    movzx           r9d, byte [r1 + r7]             ; mstate = baseCtx[ctxSig]
+    mov             r3d, r9d
+    and             r3b, 1                          ; mps = mstate & 1
+    xor             r9d, r8d                        ; r9 = mstate ^ sig
+    add             r6d, [r0 + r9 * 4]              ; sum += entropyStateBits[mstate ^ sig]
+    add             r3b, byte [r0 + r9 * 4 + 3]     ; nextState = (stateBits >> 24) + mps
+    cmp             r9d, 1
+    cmove           r3d, r8d                        ; if ((mstate ^ sig) == 1) nextState = sig
+    mov        byte [r1 + r7], r3b                  ; baseCtx[ctxSig] = (uint8_t)nextState
+
+    dec             r2d
+    jg             .loop                            ; repeat while --scanPosSigOff > 0
+
+.idx_zero:
+    xor             r2d, r2d
+    cmp        word r9m, 0                          ; NOTE(review): only the low 16 bits of subPosBase are compared
+    sete            r2b                             ; r2 = (subPosBase == 0)
+    add             r4d, r2d                        ; (numNonZero != 0) || (subPosBase == 0)
+    jz             .exit
+
+    dec             r2b                             ; r2d = (subPosBase == 0) ? 0 : 0xFF, i.e. posZeroMask
+    movd            r3d, xm2
+    and             r2d, r3d                        ; ctxSig = cnt & posZeroMask
+
+    movzx           r3d, byte [r1 + r2]             ; mstate = baseCtx[ctxSig]
+    mov             r4d, r5d                        ; r4 = sig (what is left of scanFlagMask after the loop shifts)
+    xor             r5d, r3d                        ; r5 = mstate ^ sig
+    and             r3b, 1                          ; mps = mstate & 1
+    add             r6d, [r0 + r5 * 4]              ; sum += x265_entropyStateBits[mstate ^ sig]
+    add             r3b, [r0 + r5 * 4 + 3]          ; nextState = (stateBits >> 24) + mps
+    cmp             r5b, 1
+    cmove           r3d, r4d                        ; if ((mstate ^ sig) == 1) nextState = sig
+    mov        byte [r1 + r2], r3b                  ; baseCtx[ctxSig] = (uint8_t)nextState
+
+.exit:
+%ifnidn eax,r6d
+    mov             eax, r6d
+%endif
+    and             eax, 0xFFFFFF                   ; return only the low 24 bits of sum
+    RET
 %endif ; ARCH_X86_64
 
 
diff -r c303685732c7 -r 178fe732b4b1 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Thu Nov 19 17:29:39 2015 -0600
+++ b/source/encoder/entropy.cpp	Fri Nov 20 16:03:24 2015 -0600
@@ -1566,7 +1566,7 @@
     uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint32_t c1 = 1;
     int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
-    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
+    ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE) + 1]);   // extra 2 bytes space for AVX2 assembly
     uint32_t numNonZero = 1;
     unsigned long lastNZPosInCG;
     unsigned long firstNZPosInCG;
diff -r c303685732c7 -r 178fe732b4b1 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Thu Nov 19 17:29:39 2015 -0600
+++ b/source/test/pixelharness.cpp	Fri Nov 20 16:03:24 2015 -0600
@@ -1590,8 +1590,8 @@
 bool PixelHarness::check_costCoeffNxN(costCoeffNxN_t ref, costCoeffNxN_t opt)
 {
     ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 3]);
-    ALIGN_VAR_32(uint16_t, ref_absCoeff[1 << MLS_CG_SIZE]);
-    ALIGN_VAR_32(uint16_t, opt_absCoeff[1 << MLS_CG_SIZE]);
+    ALIGN_VAR_32(uint16_t, ref_absCoeff[(1 << MLS_CG_SIZE)]);
+    ALIGN_VAR_32(uint16_t, opt_absCoeff[(1 << MLS_CG_SIZE) + 4]);
 
     memset(ref_absCoeff, 0xCD, sizeof(ref_absCoeff));
     memset(opt_absCoeff, 0xCD, sizeof(opt_absCoeff));
@@ -1617,6 +1617,12 @@
         ref_src[32 * 32 + i] = 0x1234;
     }
 
+    // Guard values (magic numbers) in the padding elements, used to detect out-of-bounds writes
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 0] = 0x0123;
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 1] = 0x4567;
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 2] = 0xBA98;
+    opt_absCoeff[(1 << MLS_CG_SIZE) + 3] = 0xFEDC;
+
     // generate CABAC context table
     uint8_t m_contextState_ref[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
     uint8_t m_contextState_opt[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
@@ -1707,8 +1713,8 @@
             continue;
 
         const uint32_t blkPosBase = scanTbl[subPosBase];
-        uint32_t ref_sum = ref(scanTblCG4x4, &ref_src[blkPosBase + i], trSize, ref_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)ref_baseCtx, offset, rand_scanPosSigOff, subPosBase);
-        uint32_t opt_sum = (uint32_t)checked(opt, scanTblCG4x4, &ref_src[blkPosBase + i], trSize, opt_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)opt_baseCtx, offset, rand_scanPosSigOff, subPosBase);
+        uint32_t ref_sum = ref(scanTblCG4x4, &ref_src[blkPosBase + i], (intptr_t)trSize, ref_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)ref_baseCtx, offset, rand_scanPosSigOff, subPosBase);
+        uint32_t opt_sum = (uint32_t)checked(opt, scanTblCG4x4, &ref_src[blkPosBase + i], (intptr_t)trSize, opt_absCoeff + numNonZero, rand_tabSigCtx, scanFlagMask, (uint8_t*)opt_baseCtx, offset, rand_scanPosSigOff, subPosBase);
 
         if (ref_sum != opt_sum)
             return false;
@@ -1716,7 +1722,13 @@
             return false;
 
         // NOTE: just first rand_numCoeff valid, but I check full buffer for confirm no overwrite bug
-        if (memcmp(ref_absCoeff, opt_absCoeff, sizeof(ref_absCoeff)))
+        if (memcmp(ref_absCoeff, opt_absCoeff, rand_numCoeff * sizeof(ref_absCoeff[0])))
+            return false;
+
+        // Check the guard values to detect an out-of-bounds write
+        if (   opt_absCoeff[(1 << MLS_CG_SIZE) + 1] != 0x4567
+            || opt_absCoeff[(1 << MLS_CG_SIZE) + 2] != 0xBA98
+            || opt_absCoeff[(1 << MLS_CG_SIZE) + 3] != 0xFEDC)
             return false;
 
         reportfail();