[x265] [PATCH 1 of 2] asm: new SSE2 primitive costC1C2Flag in codeCoeffNxN()
Min Chen
chenm003 at 163.com
Fri Jun 12 22:51:56 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1434142304 25200
# Node ID e235ed1f42d71d12ce089e1cd586778ac45976b2
# Parent 4d2da861ec98105cfa4bf118235678b6491a1c93
asm: new SSE2 primitive costC1C2Flag in codeCoeffNxN()
---
source/common/dct.cpp | 55 ++++++++++++++
source/common/primitives.h | 3 +
source/common/x86/asm-primitives.cpp | 3 +
source/common/x86/pixel-util.h | 1 +
source/common/x86/pixel-util8.asm | 134 ++++++++++++++++++++++++++++++++++
source/encoder/entropy.cpp | 54 ++-----------
6 files changed, 205 insertions(+), 45 deletions(-)
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/dct.cpp
--- a/source/common/dct.cpp Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/dct.cpp Fri Jun 12 13:51:44 2015 -0700
@@ -920,6 +920,60 @@
return sum;
}
+
+uint32_t costC1C2Flag_c(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
+{ // Sums the entropy bits of the (absCoeff > 1) flags for absCoeff[0..numC1Flag-1], plus one (absCoeff > 2) flag, updating the context bytes in place
+ uint32_t sum = 0; // accumulated sbacGetEntropyBits() results
+ uint32_t c1 = 1; // index into baseCtxMod used for the next symbol1
+ uint32_t firstC2Idx = 8; // 8 = no coefficient with symbol1 set seen yet
+ uint32_t firstC2Flag = 2; // 2 = unset; becomes symbol2 of the first coefficient with symbol1 set
+ uint32_t c1Next = 0xFFFFFFFE; // queue of upcoming c1 values, two bits each, read from the LSB: 2, 3, 3, ...
+
+ int idx = 0;
+ do // body runs before the bound is tested, so one coefficient is always processed
+ {
+ uint32_t symbol1 = absCoeff[idx] > 1;
+ uint32_t symbol2 = absCoeff[idx] > 2;
+ // cost-only stand-in for encodeBin(symbol1, baseCtxMod[c1]): advance the state, add the bit cost
+ {
+ const uint32_t mstate = baseCtxMod[c1];
+ baseCtxMod[c1] = sbacNext(mstate, symbol1);
+ sum += sbacGetEntropyBits(mstate, symbol1);
+ }
+
+ if (symbol1)
+ c1Next = 0; // every later c1 popped from the queue is now 0
+
+ if (symbol1 + firstC2Flag == 3) // only true when symbol1 == 1 and firstC2Flag is still 2 (unset)
+ firstC2Flag = symbol2;
+
+ if (symbol1 + firstC2Idx == 9) // only true when symbol1 == 1 and firstC2Idx is still 8 (unset)
+ firstC2Idx = idx;
+
+ c1 = (c1Next & 3); // pop the next context index
+ c1Next >>= 2;
+ X265_CHECK(c1 <= 3, "c1 check failure\n");
+ idx++;
+ }
+ while(idx < numC1Flag);
+
+ if (!c1) // c1 is 0 here whenever some coefficient had symbol1 set (c1Next was cleared)
+ {
+ X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
+
+ baseCtxMod += ctxOffset; // move to the context byte used for firstC2Flag
+
+ // cost-only stand-in for encodeBin(firstC2Flag, baseCtxMod[0])
+ {
+ const uint32_t mstate = baseCtxMod[0];
+ baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
+ sum += sbacGetEntropyBits(mstate, firstC2Flag);
+ }
+ }
+
+ return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28); // bits 0-23: sum, bits 26-27: c1, bits 28-31: firstC2Idx
+}
+
} // closing - anonymous file-static namespace
namespace X265_NS {
@@ -956,5 +1010,6 @@
p.findPosFirstLast = findPosFirstLast_c;
p.costCoeffNxN = costCoeffNxN_c;
p.costCoeffRemain = costCoeffRemain_c;
+ p.costC1C2Flag = costC1C2Flag_c;
}
}
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/primitives.h
--- a/source/common/primitives.h Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/primitives.h Fri Jun 12 13:51:44 2015 -0700
@@ -188,6 +188,7 @@
typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
+typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -315,6 +316,8 @@
costCoeffNxN_t costCoeffNxN;
costCoeffRemain_t costCoeffRemain;
+ costC1C2Flag_t costC1C2Flag;
+
/* There is one set of chroma primitives per color space. An encoder will
* have just a single color space and thus it will only ever use one entry
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Fri Jun 12 13:51:44 2015 -0700
@@ -2165,6 +2165,9 @@
p.cu[BLOCK_4x4].idct = x265_idct4_sse2;
#if X86_64
p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
+
+ // TODO: this passes the smoke test, but it still needs a testbench, so it is temporarily disabled
+ //p.costC1C2Flag = x265_costC1C2Flag_sse2;
#endif
p.idst4x4 = x265_idst4_sse2;
p.dst4x4 = x265_dst4_sse2;
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/pixel-util.h Fri Jun 12 13:51:44 2015 -0700
@@ -84,6 +84,7 @@
uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, int idx);
+uint32_t x265_costC1C2Flag_sse2(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset); // x86-64 only; same signature as costC1C2Flag_t
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 4d2da861ec98 -r e235ed1f42d7 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Jun 10 07:55:00 2015 -0700
+++ b/source/common/x86/pixel-util8.asm Fri Jun 12 13:51:44 2015 -0700
@@ -72,6 +72,7 @@
cextern hmul_16p
cextern trans8_shuf
cextern_naked g_entropyStateBits
+cextern pb_movemask
;-----------------------------------------------------------------------------
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
@@ -6646,3 +6647,136 @@
mov eax, r5d
RET
+
+
+; uint32_t costC1C2Flag(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset)
+;idx = 0;
+;do
+;{
+; uint32_t symbol1 = absCoeff[idx] > 1;
+; uint32_t symbol2 = absCoeff[idx] > 2;
+; {
+; const uint32_t mstate = baseCtxMod[c1];
+; baseCtxMod[c1] = sbacNext(mstate, symbol1);
+; sum += sbacGetEntropyBits(mstate, symbol1);
+; }
+; if (symbol1)
+; c1Next = 0;
+; if (symbol1 + firstC2Flag == 3)
+; firstC2Flag = symbol2;
+; if (symbol1 + firstC2Idx == 9)
+; firstC2Idx = idx;
+; c1 = (c1Next & 3);
+; c1Next >>= 2;
+; idx++;
+;}
+;while(idx < numC1Flag);
+;if (!c1)
+;{
+; baseCtxMod += ctxOffset;
+; {
+; const uint32_t mstate = baseCtxMod[0];
+; baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
+; sum += sbacGetEntropyBits(mstate, firstC2Flag);
+; }
+;}
+;return (sum & 0x00FFFFFF) + (c1 << 26) + (firstC2Idx << 28);
+
+
+; TODO: this needs more registers, so the code is written for x64 only, but it should be easy to port to the x86 platform
+%if ARCH_X86_64
+INIT_XMM sse2
+cglobal costC1C2Flag, 4,12,2 ; r0 = absCoeff, r1 = numC1Flag, r2 = baseCtxMod, r3 = ctxOffset
+
+ mova m0, [r0] ; load 8 x uint16 absCoeff (mova: absCoeff is assumed to be 16-byte aligned)
+ packsswb m0, m0 ; words -> signed-saturated bytes, duplicated in both halves
+
+ pcmpgtb m1, m0, [pb_1] ; m1 = byte mask of coeff > 1 (symbol1), presuming pb_1 is all 1s
+ pcmpgtb m0, [pb_2] ; m0 = byte mask of coeff > 2 (symbol2), presuming pb_2 is all 2s
+
+ ; get mask for 'X>1'
+ pmovmskb r0d, m1 ; r0d = symbol1 bits, consumed LSB-first by the loop below
+ mov r11d, r0d
+
+ ; clear unavailable coeff flags
+ xor r6d, r6d
+ bts r6d, r1d
+ dec r6d ; r6d = (1 << numC1Flag) - 1
+ and r11d, r6d
+
+ ; calculate firstC2Idx
+ or r11d, 0x100 ; sentinel bit 8: firstC2Idx defaults to 8 when no symbol1 bit is set
+ bsf r11d, r11d ; index of the lowest set bit
+
+ lea r5, [g_entropyStateBits]
+ xor r6d, r6d
+ mov r4d, 0xFFFFFFF9 ; two-bit queue of c1 values read from the LSB: 1, 2, 3, 3, ...
+
+ ; register mapping
+ ; r4d - queue of c1 values (current c1 in bits 0-1)
+ ; r5 - g_entropyStateBits
+ ; r6d - sum
+ ; r[7-10] - tmp
+ ; r11d - firstC2Idx (not use in loop)
+
+ ; process c1 flag
+.loop:
+ ; const uint32_t mstate = baseCtxMod[c1];
+ ; const uint32_t mps = mstate & 1;
+ ; const uint32_t stateBits = g_entropyStateBits[mstate ^ symbol1];
+ ; uint32_t nextState = (stateBits >> 24) + mps;
+ ; if ((mstate ^ symbol1) == 1)
+ ; nextState = symbol1;
+ mov r10d, r4d ; c1
+ and r10d, 3
+ shr r4d, 2 ; pop c1 from the queue
+
+ xor r7d, r7d
+ shr r0d, 1 ; CF = symbol1 of the current coefficient
+ cmovc r4d, r7d ; all remaining c1 values <- 0 when symbol1 = 1
+ setc r7b ; symbol1
+
+ movzx r8d, byte [r2 + r10] ; mstate = baseCtxMod[c1]
+ mov r9d, r7d ; sig = symbol1
+ xor r7d, r8d ; mstate ^ sig
+ and r8d, 1 ; mps = mstate & 1
+ add r6d, [r5 + r7 * 4] ; sum += g_entropyStateBits[mstate ^ sig] (top byte is masked off at exit)
+ add r8b, [r5 + r7 * 4 + 3] ; nextState = (stateBits >> 24) + mps
+ cmp r7b, 1 ; if ((mstate ^ sig) == 1) nextState = sig;
+ cmove r8d, r9d
+ mov byte [r2 + r10], r8b ; baseCtxMod[c1] = nextState
+
+ dec r1d
+ jg .loop
+
+ ; check and generate c1 flag
+ shl r4d, 30 ; keep only the final c1, now in bits 30-31; ZF is set when c1 == 0
+ jnz .quit ; c1 != 0: skip the c2 flag (mirrors 'if (!c1)')
+
+ ; move to c2 ctx
+ add r2, r3
+
+ ; process c2 flag
+ pmovmskb r8d, m0
+ bt r8d, r11d ; CF = symbol2 of coefficient firstC2Idx
+ setc r7b ; firstC2Flag (upper bits of r7d are already zero after the loop)
+
+ movzx r8d, byte [r2] ; mstate = baseCtxMod[0] (c2 context)
+ mov r1d, r7d ; sig = firstC2Flag
+ xor r7d, r8d ; mstate ^ sig
+ and r8d, 1 ; mps = mstate & 1
+ add r6d, [r5 + r7 * 4] ; sum += g_entropyStateBits[mstate ^ sig]
+ add r8b, [r5 + r7 * 4 + 3] ; nextState = (stateBits >> 24) + mps
+ cmp r7b, 1 ; if ((mstate ^ sig) == 1) nextState = sig;
+ cmove r8d, r1d
+ mov byte [r2], r8b ; baseCtxMod[0] = nextState
+
+.quit:
+ shrd r4d, r11d, 4 ; r4d = (firstC2Idx << 28) | (c1 << 26)
+%ifnidn r6d,eax
+ mov eax, r6d
+%endif
+ and eax, 0x00FFFFFF ; return value bits 0-23 = sum
+ or eax, r4d
+ RET
+%endif ; ARCH_X86_64
diff -r 4d2da861ec98 -r e235ed1f42d7 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp Wed Jun 10 07:55:00 2015 -0700
+++ b/source/encoder/entropy.cpp Fri Jun 12 13:51:44 2015 -0700
@@ -1741,7 +1741,7 @@
bool signHidden = (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD);
uint32_t ctxSet = ((subSet > 0) & bIsLuma) ? 2 : 0;
- ctxSet += (c1 == 0);
+ ctxSet += !(c1 & 3);
c1 = 1;
uint8_t *baseCtxMod = &m_contextState[(bIsLuma ? 0 : NUM_ONE_FLAG_CTX_LUMA) + OFF_ONE_FLAG_CTX + 4 * ctxSet];
@@ -1749,55 +1749,19 @@
uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);
X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");
+ uint8_t baseCtxModX0[160], baseCtxModX1[160];
+ memcpy(baseCtxModX0, m_contextState, sizeof(m_contextState));
+ memcpy(baseCtxModX1, m_contextState, sizeof(m_contextState));
+
uint32_t firstC2Idx = 8;
uint32_t firstC2Flag = 2;
uint32_t c1Next = 0xFFFFFFFE;
if (!m_bitIf)
{
- uint32_t sum = 0;
- // Fast RD path
- idx = 0;
- do
- {
- uint32_t symbol1 = absCoeff[idx] > 1;
- uint32_t symbol2 = absCoeff[idx] > 2;
- //encodeBin(symbol1, baseCtxMod[c1]);
- {
- const uint32_t mstate = baseCtxMod[c1];
- baseCtxMod[c1] = sbacNext(mstate, symbol1);
- sum += sbacGetEntropyBits(mstate, symbol1);
- }
-
- if (symbol1)
- c1Next = 0;
-
- if (symbol1 + firstC2Flag == 3)
- firstC2Flag = symbol2;
-
- if (symbol1 + firstC2Idx == 9)
- firstC2Idx = idx;
-
- c1 = (c1Next & 3);
- c1Next >>= 2;
- X265_CHECK(c1 <= 3, "c1 check failure\n");
- idx++;
- }
- while(idx < numC1Flag);
-
- if (!c1)
- {
- X265_CHECK((firstC2Flag <= 1), "firstC2FlagIdx check failure\n");
-
- baseCtxMod = &m_contextState[(bIsLuma ? 0 : NUM_ABS_FLAG_CTX_LUMA) + OFF_ABS_FLAG_CTX + ctxSet];
-
- //encodeBin(firstC2Flag, baseCtxMod[0]);
- {
- const uint32_t mstate = baseCtxMod[0];
- baseCtxMod[0] = sbacNext(mstate, firstC2Flag);
- sum += sbacGetEntropyBits(mstate, firstC2Flag);
- }
- }
- m_fracBits += (sum & 0xFFFFFF);
+ uint32_t sum = primitives.costC1C2Flag(absCoeff, numC1Flag, baseCtxMod, (bIsLuma ? 0 : NUM_ABS_FLAG_CTX_LUMA - NUM_ONE_FLAG_CTX_LUMA) + (OFF_ABS_FLAG_CTX - OFF_ONE_FLAG_CTX) - 3 * ctxSet);
+ c1 = ((sum >> 26) & 3);
+ firstC2Idx = (sum >> 28);
+ m_fracBits += sum & 0x00FFFFFF;
const int hiddenShift = (bHideFirstSign & signHidden) ? 1 : 0;
//encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - hiddenShift);
More information about the x265-devel
mailing list