[x265] [PATCH 6 of 6] asm: improve costCoeffRemain by bypassing uncoded coefficients
Min Chen
chenm003 at 163.com
Tue Jun 9 20:06:06 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1433872879 25200
# Node ID e5b6f0a984bdd8ab16b63fb1c11a508a444515ec
# Parent 134670771e0c1dd0800c3e9db0a1f9f69c467e36
asm: improve costCoeffRemain by bypassing uncoded coefficients
---
source/common/dct.cpp | 17 ++++----
source/common/primitives.h | 2 +-
source/common/x86/pixel-util.h | 2 +-
source/common/x86/pixel-util8.asm | 41 +++++++++----------
source/encoder/entropy.cpp | 82 ++++++++++++++++++++++++++++++++-----
5 files changed, 101 insertions(+), 43 deletions(-)
diff -r 134670771e0c -r e5b6f0a984bd source/common/dct.cpp
--- a/source/common/dct.cpp Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/dct.cpp Tue Jun 09 11:01:19 2015 -0700
@@ -874,19 +874,19 @@
return (sum & 0xFFFFFF);
}
-uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero)
+uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero, int idx)
{
uint32_t goRiceParam = 0;
- int firstCoeff2 = 1;
- uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
uint32_t sum = 0;
- int idx = 0;
+ int baseLevel = 3;
do
{
- int baseLevel = (baseLevelN & 3) | firstCoeff2;
- X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
- baseLevelN >>= 2;
+ if (idx >= C1FLAG_NUMBER)
+ baseLevel = 1;
+
+ // TODO: idx here is not the real coefficient index, so this check is disabled
+ //X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
int codeNumber = absCoeff[idx] - baseLevel;
if (codeNumber >= 0)
@@ -912,8 +912,7 @@
goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
}
- if (absCoeff[idx] >= 2)
- firstCoeff2 = 0;
+ baseLevel = 2;
idx++;
}
while(idx < numNonZero);
diff -r 134670771e0c -r e5b6f0a984bd source/common/primitives.h
--- a/source/common/primitives.h Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/primitives.h Tue Jun 09 11:01:19 2015 -0700
@@ -187,7 +187,7 @@
typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
-typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero);
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/x86/pixel-util.h Tue Jun 09 11:01:19 2015 -0700
@@ -83,7 +83,7 @@
uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
-uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero);
+uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero, int idx);
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 134670771e0c -r e5b6f0a984bd source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:15 2015 -0700
+++ b/source/common/x86/pixel-util8.asm Tue Jun 09 11:01:19 2015 -0700
@@ -6572,7 +6572,7 @@
;}
;while(idx < numNonZero);
-; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)
+; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero, int idx)
INIT_XMM sse4
cglobal costCoeffRemain, 0,7,1
; assign RCX to R3
@@ -6580,48 +6580,43 @@
%if WIN64
DECLARE_REG_TMP 3,1,2,0
mov t0, r0
+ mov r4d, r2d
%elif ARCH_X86_64
; *nix x64 didn't do anything
DECLARE_REG_TMP 0,1,2,3
+ mov r4d, r2d
%else ; X86_32
DECLARE_REG_TMP 6,3,2,1
mov t0, r0m
+ mov r4d, r2m
%endif
- mova m0, [t0]
- packsswb m0, [t0 + mmsize]
- pcmpgtb m0, [pb_1]
- pmovmskb r2d, m0
- bsf r2d, r2d
- lea r2d, [r2 * 2 + 1]
- xor r4d, r4d
- bts r4d, r2d
- dec r4d
- and r4d, 0x55555555
- or r4d, 0x5555AAAA
-
xor t3d, t3d
xor r5d, r5d
+ lea t0, [t0 + r4 * 2]
+ mov r2d, 3
+
; register mapping
- ; r4d - baseLevelN
- ; r2 - tmp
+ ; r2d - baseLevel & tmp
+ ; r4d - idx
; t3 - goRiceParam
- ; eax - tmp - absCoeff[idx]
+ ; eax - absCoeff[idx] & tmp
; r5 - sum
.loop:
+ mov eax, 1
+ cmp r4d, 8
+ cmovge r2d, eax
+
movzx eax, word [t0]
add t0, 2
- mov r2d, r4d
- and r2d, 3
- shr r4d, 2
sub eax, r2d ; codeNumber = absCoeff[idx] - baseLevel
jl .next
shr eax, t3b ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
- lea r2d, [eax - 3 + 1] ; CLZ(cidx, codeNumber + 1);
+ lea r2d, [rax - 3 + 1] ; CLZ(cidx, codeNumber + 1);
bsr r2d, r2d
add r2d, r2d ; codeNumber = (length + length)
@@ -6644,8 +6639,10 @@
add t3b, al
.next:
- dec dword r1m
- jnz .loop
+ inc r4d
+ mov r2d, 2
+ cmp r4d, r1m
+ jl .loop
mov eax, r5d
RET
diff -r 134670771e0c -r e5b6f0a984bd source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp Tue Jun 09 11:01:15 2015 -0700
+++ b/source/encoder/entropy.cpp Tue Jun 09 11:01:19 2015 -0700
@@ -1431,6 +1431,55 @@
encodeBin(cu.getCbf(absPartIdx, ttype, lowestTUDepth), m_contextState[OFF_QT_CBF_CTX + ctx]);
}
+#if CHECKED_BUILD || _DEBUG
+uint32_t costCoeffRemain_c0(uint16_t *absCoeff, int numNonZero) // reference version that always scans from idx 0; the caller compares its result with primitives.costCoeffRemain inside X265_CHECK
+{
+ uint32_t goRiceParam = 0;
+ int firstCoeff2 = 1; // stays 1 until a coefficient with absCoeff >= 2 has been processed
+ uint32_t baseLevelN = 0x5555AAAA; // packed 2-bit baseLevel entries, consumed from the low bits: eight entries of 2, then eight entries of 1
+
+ uint32_t sum = 0; // running total returned to the caller
+ int idx = 0;
+ do
+ {
+ int baseLevel = (baseLevelN & 3) | firstCoeff2; // first eight entries give 3 while firstCoeff2 is set and 2 afterwards; the next eight give 1
+ X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
+ baseLevelN >>= 2; // advance to the next packed entry
+ int codeNumber = absCoeff[idx] - baseLevel; // negative means this coefficient adds nothing to sum
+
+ if (codeNumber >= 0)
+ {
+ //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
+ uint32_t length = 0;
+
+ codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION; // may become negative again; the test below then skips the length computation
+ if (codeNumber >= 0)
+ {
+ {
+ unsigned long cidx;
+ CLZ(cidx, codeNumber + 1); // NOTE(review): presumably stores the index of the highest set bit (the asm version uses bsr here) - confirm against the CLZ macro
+ length = cidx;
+ }
+ X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
+
+ codeNumber = (length + length); // codeNumber is reused to hold twice the length
+ }
+ sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber); // when the inner test failed, codeNumber is still negative here and reduces the amount added
+
+ if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+ goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); // increments by one for 0..3 and leaves 4 unchanged
+ X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
+ }
+ if (absCoeff[idx] >= 2)
+ firstCoeff2 = 0; // later entries among the first eight now use baseLevel 2 instead of 3
+ idx++;
+ }
+ while(idx < numNonZero); // do/while: the body runs once even when numNonZero <= 0
+
+ return sum;
+}
+#endif // debug only code
+
void Entropy::codeCoeffNxN(const CUData& cu, const coeff_t* coeff, uint32_t absPartIdx, uint32_t log2TrSize, TextType ttype)
{
uint32_t trSize = 1 << log2TrSize;
@@ -1519,7 +1568,7 @@
uint8_t * const baseCtx = bIsLuma ? &m_contextState[OFF_SIG_FLAG_CTX] : &m_contextState[OFF_SIG_FLAG_CTX + NUM_SIG_FLAG_CTX_LUMA];
uint32_t c1 = 1;
int scanPosSigOff = scanPosLast - (lastScanSet << MLS_CG_SIZE) - 1;
- ALIGN_VAR_32(uint16_t, absCoeff[1 << MLS_CG_SIZE]);
+ ALIGN_VAR_32(uint16_t, absCoeff[(1 << MLS_CG_SIZE)]);
uint32_t numNonZero = 1;
unsigned long lastNZPosInCG;
unsigned long firstNZPosInCG;
@@ -1700,6 +1749,7 @@
uint32_t numC1Flag = X265_MIN(numNonZero, C1FLAG_NUMBER);
X265_CHECK(numC1Flag > 0, "numC1Flag check failure\n");
+ uint32_t firstC2Idx = 8;
uint32_t firstC2Flag = 2;
uint32_t c1Next = 0xFFFFFFFE;
if (!m_bitIf)
@@ -1720,9 +1770,13 @@
if (symbol1)
c1Next = 0;
+
if (symbol1 + firstC2Flag == 3)
firstC2Flag = symbol2;
+ if (symbol1 + firstC2Idx == 9)
+ firstC2Idx = idx;
+
c1 = (c1Next & 3);
c1Next >>= 2;
X265_CHECK(c1 <= 3, "c1 check failure\n");
@@ -1749,9 +1803,10 @@
//encodeBinsEP((coeffSigns >> hiddenShift), numNonZero - hiddenShift);
m_fracBits += (numNonZero - hiddenShift) << 15;
- if (!c1 || numNonZero > C1FLAG_NUMBER)
+ if (numNonZero > firstC2Idx)
{
- uint32_t sum = primitives.costCoeffRemain(absCoeff, numNonZero);
+ sum = primitives.costCoeffRemain(absCoeff, numNonZero, firstC2Idx);
+ X265_CHECK(sum == costCoeffRemain_c0(absCoeff, numNonZero), "costCoeffRemain check failure\n");
m_fracBits += ((uint64_t)sum << 15);
}
}
@@ -1771,6 +1826,9 @@
if (symbol1 + firstC2Flag == 3)
firstC2Flag = symbol2;
+ if (symbol1 + firstC2Idx == 9)
+ firstC2Idx = idx;
+
c1 = (c1Next & 3);
c1Next >>= 2;
X265_CHECK(c1 <= 3, "c1 check failure\n");
@@ -1793,15 +1851,17 @@
{
// Standard path
uint32_t goRiceParam = 0;
+ int baseLevel = 3;
+#if CHECKED_BUILD || _DEBUG
int firstCoeff2 = 1;
- uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
-
- idx = 0;
+#endif
+ idx = firstC2Idx;
do
{
- int baseLevel = (baseLevelN & 3) | firstCoeff2;
+ if (idx >= C1FLAG_NUMBER)
+ baseLevel = 1;
+ // TODO: the fast algorithm may have broken the logic of this check
X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
- baseLevelN >>= 2;
if (absCoeff[idx] >= baseLevel)
{
@@ -1810,8 +1870,10 @@
goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
}
- if (absCoeff[idx] >= 2)
- firstCoeff2 = 0;
+#if CHECKED_BUILD || _DEBUG
+ firstCoeff2 = 0;
+#endif
+ baseLevel = 2;
idx++;
}
while(idx < numNonZero);
More information about the x265-devel
mailing list