[x265] [PATCH 1 of 2] asm: new SSE2 primitive costC1C2Flag in codeCoeffNxN()

Min Chen chenm003 at 163.com
Fri Jun 12 22:51:56 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1434142304 25200
# Node ID e235ed1f42d71d12ce089e1cd586778ac45976b2
# Parent  4d2da861ec98105cfa4bf118235678b6491a1c93
asm: new SSE2 primitive costC1C2Flag in codeCoeffNxN()
---
 source/common/dct.cpp                |   55 ++++++++++++++
 source/common/primitives.h           |    3 +
 source/common/x86/asm-primitives.cpp |    3 +
 source/common/x86/pixel-util.h       |    1 +
 source/common/x86/pixel-util8.asm    |  134 ++++++++++++++++++++++++++++++++++
 source/encoder/entropy.cpp           |   54 ++-----------
 6 files changed, 205 insertions(+), 45 deletions(-)

diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/dct.cpp
--- a/source/common/dct.cpp	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/dct.cpp	Fri Jun 12 13:51:44 2015 -0700
@@ -920,6 +920,60 @@
     return sum;
 }
 
+/* C reference for costC1C2Flag: codes numC1Flag 'abs > 1' flags and, when the final c1 is 0, one 'abs > 2' flag; returns sum in bits [23:0], c1 in [27:26], firstC2Idx in [31:28]. */
+uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
+{
+    uint32_t sum = 0;                 // accumulated sbacGetEntropyBits() results
+    uint32_t c1 = 1;                  // index into baseCtxMod[] used for the next 'abs > 1' flag
+    uint32_t firstC2Idx = 8;          // 8 = no coefficient with abs > 1 found yet
+    uint32_t firstC2Flag = 2;         // 2 = not set yet; becomes 0 or 1 at the first abs > 1 coefficient
+    uint32_t c1Next = 0xFFFFFFFE;     // queue of 2-bit c1 values read from the low end: 2, 3, 3, ...
+
+    int idx = 0;
+    do                                // do/while: the body runs once even if numC1Flag <= 0
+    {
+        uint32_t symbol1 = absCoeff[idx] > 1;
+        uint32_t symbol2 = absCoeff[idx] > 2;
+        // Inlined equivalent of encodeBin(symbol1, baseCtxMod[c1]): update the context state in place and add its bits
+        {
+            const uint32_t mstate = baseCtxMod[c1];
+            baseCtxMod[c1] = sbacNext(mstate, symbol1);
+            sum += sbacGetEntropyBits(mstate, symbol1);
+        }
+
+        if (symbol1)
+            c1Next = 0;               // once a coefficient > 1 is seen, every following c1 is 0
+
+        if (symbol1 + firstC2Flag == 3)   // symbol1 == 1 while firstC2Flag is still the unset value 2
+            firstC2Flag = symbol2;
+
+        if (symbol1 + firstC2Idx == 9)    // symbol1 == 1 while firstC2Idx is still the unset value 8
+            firstC2Idx  = idx;
+
+        c1 = (c1Next & 3);            // take the next c1 from the queue
+        c1Next >>= 2;
+        X265_CHECK(c1 <= 3, "c1 check failure\n");
+        idx++;
+    }
+    while(idx < numC1Flag);
+
+    if (!c1)                          // c1 == 0 here: c1Next was cleared by a coefficient with abs > 1
+    {
+        X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
+
+        baseCtxMod += ctxOffset;      // ctxOffset is relative to the baseCtxMod pointer passed in
+
+        // Inlined equivalent of encodeBin(firstC2Flag, baseCtxMod[0])
+        {
+            const uint32_t mstate = baseCtxMod[0];
+            baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
+            sum += sbacGetEntropyBits(mstate, firstC2Flag);
+        }
+    }
+
+    return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);   // pack the three results into one 32-bit word
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace X265_NS {
@@ -956,5 +1010,6 @@
     p.findPosFirstLast = findPosFirstLast_c;
     p.costCoeffNxN = costCoeffNxN_c;
     p.costCoeffRemain = costCoeffRemain_c;
+    p.costC1C2Flag = costC1C2Flag_c;
 }
 }
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/primitives.h
--- a/source/common/primitives.h	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/primitives.h	Fri Jun 12 13:51:44 2015 -0700
@@ -188,6 +188,7 @@
 
 typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
+typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -315,6 +316,8 @@
 
     costCoeffNxN_t        costCoeffNxN;
     costCoeffRemain_t     costCoeffRemain;
+    costC1C2Flag_t        costC1C2Flag;
+
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Fri Jun 12 13:51:44 2015 -0700
@@ -2165,6 +2165,9 @@
         p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
 #if X86_64
         p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
+
+        // TODO: this passes the smoke test, but it still needs testbench coverage, so it is temporarily disabled
+        //p.costC1C2Flag = x265_costC1C2Flag_sse2;
 #endif
         p.idst4x4 = x265_idst4_sse2;
         p.dst4x4 = x265_dst4_sse2;
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/pixel-util.h	Fri Jun 12 13:51:44 2015 -0700
@@ -84,6 +84,7 @@
 
 uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
 uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, int idx);
+uint32_t x265_costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numNonZero, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/pixel-util8.asm	Fri Jun 12 13:51:44 2015 -0700
@@ -72,6 +72,7 @@
 cextern hmul_16p
 cextern trans8_shuf
 cextern_naked g_entropyStateBits
+cextern pb_movemask
 
 ;-----------------------------------------------------------------------------
 ; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -6646,3 +6647,136 @@
 
     mov         eax, r5d
     RET
+
+
+; uint32_t costC1C2Flag(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
+;idx = 0;
+;do
+;{
+;    uint32_t symbol1 = absCoeff[idx] > 1;
+;    uint32_t symbol2 = absCoeff[idx] > 2;
+;    {
+;        const uint32_t mstate = baseCtxMod[c1];
+;        baseCtxMod[c1] = sbacNext(mstate, symbol1);
+;        sum += sbacGetEntropyBits(mstate, symbol1);
+;    }
+;    if (symbol1)
+;        c1Next = 0;
+;    if (symbol1 + firstC2Flag == 3)
+;        firstC2Flag = symbol2;
+;    if (symbol1 + firstC2Idx == 9)
+;        firstC2Idx  = idx;
+;    c1 = (c1Next & 3);
+;    c1Next >>= 2;
+;    idx++;
+;}
+;while(idx < numC1Flag);
+;if (!c1)
+;{
+;    baseCtxMod += ctxOffset;
+;    {
+;        const uint32_t mstate = baseCtxMod[0];
+;        baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
+;        sum += sbacGetEntropyBits(mstate, firstC2Flag);
+;    }
+;}
+;return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
+
+
+; TODO: this needs more registers, so it is written for x64 only, but it should be easy to port to the 32-bit x86 platform
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal costC1C2Flag, 4,12,2
+
+    mova        m0, [r0]                        ; load 8 x uint16 absCoeff values (16 bytes)
+    packsswb    m0, m0                          ; words -> bytes with signed saturation; both halves hold the same 8 values
+
+    pcmpgtb     m1, m0, [pb_1]                  ; m1 = byte mask of 'X>1'
+    pcmpgtb     m0, [pb_2]                      ; m0 = byte mask of 'X>2'
+
+    ; get mask for 'X>1': bit i of r0d is symbol1 of coefficient i, consumed LSB-first by .loop
+    pmovmskb    r0d, m1
+    mov         r11d, r0d                       ; copy used to derive firstC2Idx
+
+    ; keep only the first numC1Flag flags: r6d = (1 << numC1Flag) - 1
+    xor         r6d, r6d
+    bts         r6d, r1d
+    dec         r6d
+    and         r11d, r6d
+
+    ; calculate firstC2Idx = index of the first coefficient with 'X>1'
+    or          r11d, 0x100                     ; sentinel at bit 8, so bsf yields the default 8 when no flag is set
+    bsf         r11d, r11d
+
+    lea         r5, [g_entropyStateBits]
+    xor         r6d, r6d                        ; sum = 0
+    mov         r4d, 0xFFFFFFF9                 ; queue of 2-bit c1 values, LSB first: 1, 2, 3, 3, ...
+
+    ; register mapping
+    ; r4d       - nextC1
+    ; r5        - g_entropyStateBits
+    ; r6d       - sum
+    ; r[7-10]   - tmp
+    ; r11d      - firstC2Idx (not used in loop)
+
+    ; process c1 flag
+.loop:
+    ; const uint32_t mstate = baseCtxMod[c1];
+    ; const uint32_t mps = mstate & 1;
+    ; const uint32_t stateBits = g_entropyStateBits[mstate ^ symbol1];
+    ; uint32_t nextState = (stateBits >> 24) + mps;
+    ; if ((mstate ^ symbol1) == 1)
+    ;     nextState = symbol1;
+    mov         r10d, r4d                       ; c1
+    and         r10d, 3
+    shr         r4d, 2                          ; advance the nextC1 queue
+
+    xor         r7d, r7d
+    shr         r0d, 1                          ; CF = symbol1 of the current coefficient
+    cmovc       r4d, r7d                        ; nextC1 <- 0 when symbol1 = 1
+    setc        r7b                             ; symbol1
+
+    movzx       r8d, byte [r2 + r10]            ; mstate = baseCtxMod[c1]
+    mov         r9d, r7d                        ; sig = symbol1
+    xor         r7d, r8d                        ; mstate ^ sig
+    and         r8d, 1                          ; mps = mstate & 1
+    add         r6d, [r5 + r7 * 4]              ; sum += g_entropyStateBits[mstate ^ sig] (bits 31:24 are masked off at .quit)
+    add         r8b, [r5 + r7 * 4 + 3]          ; nextState = (stateBits >> 24) + mps
+    cmp         r7b, 1                          ; if ((mstate ^ sig) == 1) nextState = sig;
+    cmove       r8d, r9d
+    mov    byte [r2 + r10], r8b                 ; baseCtxMod[c1] = nextState
+
+    dec         r1d                             ; loop while --numC1Flag > 0
+    jg         .loop
+
+    ; move the final c1 (low 2 bits of nextC1) to bits 31:30; skip the c2 flag when c1 != 0
+    shl         r4d, 30
+    jnz        .quit
+
+    ; move to c2 ctx (baseCtxMod += ctxOffset)
+    add         r2, r3
+
+    ; process c2 flag
+    pmovmskb    r8d, m0                         ; mask of 'X>2'
+    bt          r8d, r11d                       ; CF = 'X>2' flag of coefficient firstC2Idx
+    setc        r7b                             ; firstC2Flag (upper bits of r7d are zero: it held a byte-sized value)
+
+    movzx       r8d, byte [r2]                  ; mstate = baseCtxMod[0]
+    mov         r1d, r7d                        ; sig = firstC2Flag
+    xor         r7d, r8d                        ; mstate ^ sig
+    and         r8d, 1                          ; mps = mstate & 1
+    add         r6d, [r5 + r7 * 4]              ; sum += g_entropyStateBits[mstate ^ sig]
+    add         r8b, [r5 + r7 * 4 + 3]          ; nextState = (stateBits >> 24) + mps
+    cmp         r7b, 1                          ; if ((mstate ^ sig) == 1) nextState = sig;
+    cmove       r8d, r1d
+    mov    byte [r2], r8b                       ; baseCtxMod[0] = nextState
+
+.quit:
+    shrd        r4d, r11d, 4                    ; r4d = (firstC2Idx << 28) | (c1 << 26)
+%ifnidn r6d,eax
+    mov         eax, r6d
+%endif
+    and         eax, 0x00FFFFFF                 ; keep the low 24 bits of sum
+    or          eax, r4d                        ; merge c1 and firstC2Idx into the return value
+    RET
+%endif ; ARCH_X86_64
diff -r 4d2da861ec98 -r e235ed1f42d7 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Wed Jun 10 07:55:00 2015 -0700
+++ b/source/encoder/entropy.cpp	Fri Jun 12 13:51:44 2015 -0700
@@ -1741,7 +1741,7 @@
             bool signHidden = (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD);
             uint32_t ctxSet = ((subSet > 0) & bIsLuma) ? 2 : 0;
 
-            ctxSet += (c1 == 0);
+            ctxSet += !(c1 & 3);
 
             c1 = 1;
             uint8_t *baseCtxMod = &m_contextState[(bIsLuma ? 0 : NUM_ONE_FLAG_CTX_LUMA) + OFF_ONE_FLAG_CTX + 4 * ctxSet];
@@ -1749,55 +1749,19 @@
             uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);
             X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");
 
+            uint8_t baseCtxModX0[160], baseCtxModX1[160];
+            memcpy(baseCtxModX0, m_contextState, sizeof(m_contextState));
+            memcpy(baseCtxModX1, m_contextState, sizeof(m_contextState));
+
             uint32_t firstC2Idx = 8;
             uint32_t firstC2Flag = 2;
             uint32_t c1Next = 0xFFFFFFFE;
             if (!m_bitIf)
             {
-                uint32_t sum = 0;
-                // Fast RD path
-                idx = 0;
-                do
-                {
-                    uint32_t symbol1 = absCoeff[idx] > 1;
-                    uint32_t symbol2 = absCoeff[idx] > 2;
-                    //encodeBin(symbol1, baseCtxMod[c1]);
-                    {
-                        const uint32_t mstate = baseCtxMod[c1];
-                        baseCtxMod[c1] = sbacNext(mstate, symbol1);
-                        sum += sbacGetEntropyBits(mstate, symbol1);
-                    }
-
-                    if (symbol1)
-                        c1Next = 0;
-
-                    if (symbol1 + firstC2Flag == 3)
-                        firstC2Flag = symbol2;
-
-                    if (symbol1 + firstC2Idx == 9)
-                        firstC2Idx  = idx;
-
-                    c1 = (c1Next & 3);
-                    c1Next >>= 2;
-                    X265_CHECK(c1 <= 3, "c1 check failure\n");
-                    idx++;
-                }
-                while(idx < numC1Flag);
-
-                if (!c1)
-                {
-                    X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
-
-                    baseCtxMod = &m_contextState[(bIsLuma ? 0 : NUM_ABS_FLAG_CTX_LUMA) + OFF_ABS_FLAG_CTX + ctxSet];
-
-                    //encodeBin(firstC2Flag, baseCtxMod[0]);
-                    {
-                        const uint32_t mstate = baseCtxMod[0];
-                        baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
-                        sum += sbacGetEntropyBits(mstate, firstC2Flag);
-                    }
-                }
-                m_fracBits += (sum & 0xFFFFFF);
+                uint32_t sum = primitives.costC1C2Flag(absCoeff, numC1Flag, baseCtxMod, (bIsLuma ? 0 : NUM_ABS_FLAG_CTX_LUMA - NUM_ONE_FLAG_CTX_LUMA) + (OFF_ABS_FLAG_CTX - OFF_ONE_FLAG_CTX) - 3 * ctxSet);
+                c1 = ((sum >> 26) & 3);
+                firstC2Idx = (sum >> 28);
+                m_fracBits += sum & 0x00FFFFFF;
 
                 const int hiddenShift = (bHideFirstSign & signHidden) ? 1 : 0;
                 //encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - hiddenShift);



More information about the x265-devel mailing list