[x265] [PATCH 1 of 6] asm: SSE4 of costCoeffRemain in codeCoeffNxN()
Min Chen
chenm003 at 163.com
Tue Jun 9 20:06:01 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1433872858 25200
# Node ID d41bc83c21b9f1eba483ec4b5e298a33b6ee1c1e
# Parent b252468dde7ffca57da27575388d95ce538945d2
asm: SSE4 of costCoeffRemain in codeCoeffNxN()
---
source/common/dct.cpp | 49 +++++++++++++++
source/common/primitives.h | 2 +
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/pixel-util.h | 1 +
source/common/x86/pixel-util8.asm | 111 ++++++++++++++++++++++++++++++++++
source/encoder/entropy.cpp | 40 +------------
6 files changed, 166 insertions(+), 38 deletions(-)
diff -r b252468dde7f -r d41bc83c21b9 source/common/dct.cpp
--- a/source/common/dct.cpp Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/dct.cpp Tue Jun 09 11:00:58 2015 -0700
@@ -30,6 +30,7 @@
#include "common.h"
#include "primitives.h"
#include "contexts.h" // costCoeffNxN_c
+#include "threading.h" // CLZ
using namespace x265;
@@ -873,6 +874,53 @@
return (sum & 0xFFFFFF);
}
+uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero) // C version of the costCoeffRemain primitive: walks absCoeff[0 .. numNonZero-1] and returns the accumulated cost 'sum'
+{
+ uint32_t goRiceParam = 0; // Rice parameter; adapted after each coded coefficient and checked to stay <= 4
+ int firstCoeff2 = 1; // stays 1 until a coefficient >= 2 has been visited; OR-ed into baseLevel
+ uint32_t baseLevelN = 0x5555AAAA; // packed 2-bit base levels, consumed from the low bits: eight fields of 2 (0xAAAA), then eight fields of 1 (0x5555)
+
+ uint32_t sum = 0; // running total that is returned
+ int idx = 0;
+ do // do/while: absCoeff[0] is read unconditionally, so callers presumably pass numNonZero >= 1 -- TODO confirm
+ {
+ int baseLevel = (baseLevelN & 3) | firstCoeff2;
+ X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
+ baseLevelN >>= 2; // move on to the next coefficient's 2-bit field
+ int codeNumber = absCoeff[idx] - baseLevel; // negative means this coefficient adds nothing to sum
+
+ if (codeNumber >= 0)
+ {
+ // stands in for the call writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam); only a length is accumulated here
+ uint32_t length = 0;
+
+ codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION; // may go negative; that value is then kept and folded into the sum below
+ if (codeNumber >= 0)
+ {
+ {
+ unsigned long cidx;
+ CLZ(cidx, codeNumber + 1); // NOTE(review): the assembly version of this patch uses bsr here, i.e. index of the highest set bit -- confirm the macro matches
+ length = cidx;
+ }
+ X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
+
+ codeNumber = (length + length); // codeNumber is reused to carry 2 * length into the sum
+ }
+ sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
+
+ if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam)) // coefficient exceeds the threshold for the current Rice parameter
+ goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); // +1 while below 4; at 4 the (>> 2) term cancels the increment
+ X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
+ }
+ if (absCoeff[idx] >= 2)
+ firstCoeff2 = 0; // from the next coefficient on, baseLevel no longer gets the extra low bit
+ idx++;
+ }
+ while(idx < numNonZero);
+
+ return sum;
+}
+
} // closing - anonymous file-static namespace
namespace x265 {
@@ -908,5 +956,6 @@
p.scanPosLast = scanPosLast_c;
p.findPosFirstLast = findPosFirstLast_c;
p.costCoeffNxN = costCoeffNxN_c;
+ p.costCoeffRemain = costCoeffRemain_c;
}
}
diff -r b252468dde7f -r d41bc83c21b9 source/common/primitives.h
--- a/source/common/primitives.h Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/primitives.h Tue Jun 09 11:00:58 2015 -0700
@@ -187,6 +187,7 @@
typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero);
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -313,6 +314,7 @@
findPosFirstLast_t findPosFirstLast;
costCoeffNxN_t costCoeffNxN;
+ costCoeffRemain_t costCoeffRemain;
/* There is one set of chroma primitives per color space. An encoder will
* have just a single color space and thus it will only ever use one entry
diff -r b252468dde7f -r d41bc83c21b9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jun 09 11:00:58 2015 -0700
@@ -2113,6 +2113,8 @@
// TODO: it is passed smoke test, but we need testbench, so temporary disable
//p.costCoeffNxN = x265_costCoeffNxN_sse4;
#endif
+ // TODO: this has passed only a smoke test; a testbench case is still needed to verify it
+ p.costCoeffRemain = x265_costCoeffRemain_sse4;
}
if (cpuMask & X265_CPU_AVX)
{
diff -r b252468dde7f -r d41bc83c21b9 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/pixel-util.h Tue Jun 09 11:00:58 2015 -0700
@@ -83,6 +83,7 @@
uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero);
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r b252468dde7f -r d41bc83c21b9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Jun 09 11:00:58 2015 -0700
@@ -6538,3 +6538,114 @@
and eax, 0xFFFFFF
RET
%endif ; ARCH_X86_64
+
+
+;uint32_t goRiceParam = 0;
+;int firstCoeff2 = 1;
+;uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
+;idx = 0;
+;do
+;{
+; int baseLevel = (baseLevelN & 3) | firstCoeff2;
+; baseLevelN >>= 2;
+; int codeNumber = absCoeff[idx] - baseLevel;
+; if (codeNumber >= 0)
+; {
+; uint32_t length = 0;
+; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
+; if (codeNumber >= 0)
+; {
+; {
+; unsigned long cidx;
+; CLZ(cidx, codeNumber + 1);
+; length = cidx;
+; }
+; codeNumber = (length + length);
+; }
+; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
+; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+; goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
+; }
+; if (absCoeff[idx] >= 2)
+; firstCoeff2 = 0;
+; idx++;
+;}
+;while(idx < numNonZero);
+
+; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)
+INIT_XMM sse4
+cglobal costCoeffRemain, 0,7,1 ; NOTE(review): by x86inc convention this should mean 0 auto-loaded args, 7 GPRs, 1 XMM register -- confirm
+ ; temps are remapped per ABI, presumably so that t3 is RCX/ECX on every target: the variable shifts below take their count from t3b
+ ; eax is used directly as scratch below (original note: RAX is always r6 and free)
+ %if WIN64
+ DECLARE_REG_TMP 3,1,2,0
+ mov t0, r0 ; copy the absCoeff pointer into t0, since t3 aliases r0 in this mapping
+ %elif ARCH_X86_64
+ ; non-Windows x86-64: no register move is needed
+ DECLARE_REG_TMP 0,1,2,3
+ %else ; X86_32
+ DECLARE_REG_TMP 6,3,2,1
+ mov t0, r0m ; fetch the absCoeff pointer from its argument slot
+ %endif
+
+ mova m0, [t0] ; NOTE(review): loads a full vector regardless of numNonZero; mova presumably needs absCoeff aligned -- confirm
+ packsswb m0, [t0 + mmsize] ; pack with the next mmsize bytes of absCoeff into signed-saturated bytes, one per coefficient
+ pcmpgtb m0, [pb_1] ; byte mask of entries greater than pb_1 (presumably a vector of 1s, defined outside this hunk)
+ pmovmskb r2d, m0 ; one mask bit per coefficient
+ bsf r2d, r2d ; index of the first set bit; NOTE(review): bsf leaves its destination undefined when the mask is zero -- confirm that case is harmless
+ lea r2d, [r2 * 2 + 1] ; bit position just above the low bit of that coefficient's 2-bit field
+ xor r4d, r4d
+ bts r4d, r2d
+ dec r4d ; all bits below that position set
+ and r4d, 0x55555555 ; keep only the low bit of each 2-bit field
+ or r4d, 0x5555AAAA ; merge with the packed base levels: firstCoeff2 is pre-folded into baseLevelN
+
+ xor t3d, t3d ; goRiceParam = 0
+ xor r5d, r5d ; sum = 0
+
+ ; register mapping
+ ; r4d - baseLevelN
+ ; r2 - tmp
+ ; t3 - goRiceParam
+ ; eax - tmp - absCoeff[idx]
+ ; r5 - sum
+
+.loop:
+ movzx eax, word [t0] ; eax = absCoeff[idx]
+ add t0, 2 ; advance to the next 16-bit coefficient
+ mov r2d, r4d
+ and r2d, 3 ; r2d = baseLevel (low 2-bit field of baseLevelN)
+ shr r4d, 2 ; baseLevelN >>= 2
+ sub eax, r2d ; codeNumber = absCoeff[idx] - baseLevel
+ jl .next ; codeNumber < 0: nothing is added to sum
+
+ shr eax, t3b ; eax = (uint32_t)codeNumber >> goRiceParam (the subtraction of COEF_REMAIN_BIN_REDUCTION, 3, is done below)
+
+ lea r2d, [eax - 3 + 1] ; CLZ(cidx, codeNumber + 1);
+ bsr r2d, r2d ; length = index of the highest set bit; the result is only used when codeNumber >= 0 (cmovge below)
+ add r2d, r2d ; codeNumber = (length + length)
+
+ sub eax, 3 ; codeNumber -= COEF_REMAIN_BIN_REDUCTION, setting the flags for cmovge
+ cmovge eax, r2d ; codeNumber >= 0: use 2 * length; otherwise keep the negative codeNumber
+
+ lea eax, [3 + 1 + t3 + rax] ; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber)
+ add r5d, eax
+
+ ; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+ ; goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
+ cmp t3d, 4
+ setl al ; al = (goRiceParam < 4), so the increment below stops at 4
+
+ mov r2d, 3
+ shl r2d, t3b ; r2d = 3 << goRiceParam
+ cmp word [t0 - 2], r2w ; re-read absCoeff[idx]; t0 was already advanced
+ setg r2b ; signed compare, whereas the C version compares an unsigned 16-bit value
+ and al, r2b
+ add t3b, al ; goRiceParam += 1 only when both conditions hold
+
+.next:
+ dec dword r1m ; numNonZero is counted down in place
+ jnz .loop ; repeat until the counter reaches zero
+
+ mov eax, r5d ; return sum
+ RET
diff -r b252468dde7f -r d41bc83c21b9 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp Tue Jun 09 10:16:44 2015 +0530
+++ b/source/encoder/entropy.cpp Tue Jun 09 11:00:58 2015 -0700
@@ -1747,44 +1747,8 @@
if (!m_bitIf)
{
- uint32_t sum = 0;
- // FastRd path
- idx = 0;
- do
- {
- int baseLevel = (baseLevelN & 3) | firstCoeff2;
- X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
- baseLevelN >>= 2;
- int codeNumber = absCoeff[idx] - baseLevel;
-
- if (codeNumber >= 0)
- {
- //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
- uint32_t length = 0;
-
- codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
- if (codeNumber >= 0)
- {
- {
- unsigned long cidx;
- CLZ(cidx, codeNumber + 1);
- length = cidx;
- }
- X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
-
- codeNumber = (length + length);
- }
- sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
-
- if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
- goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
- X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
- }
- if (absCoeff[idx] >= 2)
- firstCoeff2 = 0;
- idx++;
- }
- while(idx < numNonZero);
+ // Fast RD path
+ uint32_t sum = primitives.costCoeffRemain(absCoeff, numNonZero);
m_fracBits += ((uint64_t)sum << 15);
}
else
More information about the x265-devel
mailing list