[x265] [PATCH] asm: ssse3 version of findPosFirstLast, 365c -> 75c
Min Chen
chenm003 at 163.com
Mon Apr 20 13:58:35 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1429531109 -28800
# Node ID e4ba0f96286048b8037201c0543675819f32ba0a
# Parent bc9631ab9d36475c7d625fc9272ee0ddd7f5bfd7
asm: ssse3 version of findPosFirstLast, 365c -> 75c
---
source/common/constants.cpp | 2 +-
source/common/dct.cpp | 32 ++++++++++++++++
source/common/primitives.h | 2 +
source/common/quant.cpp | 15 +------
source/common/x86/asm-primitives.cpp | 2 +
source/common/x86/pixel-util.h | 1 +
source/common/x86/pixel-util8.asm | 38 ++++++++++++++++++
source/test/pixelharness.cpp | 69 ++++++++++++++++++++++++++++++++++
source/test/pixelharness.h | 1 +
9 files changed, 149 insertions(+), 13 deletions(-)
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/constants.cpp
--- a/source/common/constants.cpp Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/constants.cpp Mon Apr 20 19:58:29 2015 +0800
@@ -324,7 +324,7 @@
4, 12, 20, 28, 5, 13, 21, 29, 6, 14, 22, 30, 7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
};
-const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4] =
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
{
{ 0, 4, 1, 8, 5, 2, 12, 9, 6, 3, 13, 10, 7, 14, 11, 15 },
{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 },
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/dct.cpp
--- a/source/common/dct.cpp Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/dct.cpp Mon Apr 20 19:58:29 2015 +0800
@@ -785,6 +785,37 @@
return scanPosLast - 1;
}
+/* Locate the first and the last non-zero coefficient of one coding group.
+ *
+ * dstCoeff - pointer to the top-left coefficient of the coding group
+ * trSize   - row stride of the coefficient block, counted in coefficients
+ * scanTbl  - 16-entry scan table; entry n holds the raster index
+ *            (y * MLS_CG_SIZE + x) inside the group of scan position n
+ *
+ * Returns (lastNZPosInCG << 16) | firstNZPosInCG, both as scan positions.
+ * The group is expected to contain at least one non-zero coefficient. When it
+ * does not, the low half is SCAN_SET_SIZE and the high half is not meaningful. */
+uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+{
+    int n;
+
+    // backward scan: stops on the last non-zero coefficient in scan order
+    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        if (dstCoeff[idxY * trSize + idxX])
+            break;
+    }
+
+    X265_CHECK(n >= 0, "non-zero coeff scan failuare!\n");
+
+    uint32_t lastNZPosInCG = (uint32_t)n;
+
+    // forward scan: bounded, so an all-zero group can never index past the
+    // end of scanTbl even if the check above does not stop execution
+    for (n = 0; n < SCAN_SET_SIZE; n++)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        if (dstCoeff[idxY * trSize + idxX])
+            break;
+    }
+
+    uint32_t firstNZPosInCG = (uint32_t)n;
+
+    return ((lastNZPosInCG << 16) | firstNZPosInCG);
+}
+
} // closing - anonymous file-static namespace
namespace x265 {
@@ -818,5 +849,6 @@
p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
p.findPosLast = findPosLast_c;
+ p.findPosFirstLast = findPosFirstLast_c;
}
}
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/primitives.h
--- a/source/common/primitives.h Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/primitives.h Mon Apr 20 19:58:29 2015 +0800
@@ -181,6 +181,7 @@
typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]); // returns (last << 16) | first non-zero scan position of one coding group
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -293,6 +294,7 @@
findPosLast_t findPosLast;
+ findPosFirstLast_t findPosFirstLast;
/* There is one set of chroma primitives per color space. An encoder will
* have just a single color space and thus it will only ever use one entry
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/quant.cpp
--- a/source/common/quant.cpp Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/quant.cpp Mon Apr 20 19:58:29 2015 +0800
@@ -1138,19 +1138,10 @@
/* measure distance between first and last non-zero coef in this
* coding group */
- for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
- if (dstCoeff[codeParams.scan[n + subPos]])
- break;
+ const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
+ int firstNZPosInCG = (uint16_t)posFirstLast;
+ int lastNZPosInCG = posFirstLast >> 16;
- X265_CHECK(n >= 0, "non-zero coeff scan failuare!\n");
-
- int lastNZPosInCG = n;
-
- for (n = 0;; n++)
- if (dstCoeff[codeParams.scan[n + subPos]])
- break;
-
- int firstNZPosInCG = n;
if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
{
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp Mon Apr 20 19:58:29 2015 +0800
@@ -1015,6 +1015,7 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = x265_filterPixelToShort_4x2_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = x265_filterPixelToShort_8x2_ssse3;
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = x265_filterPixelToShort_8x6_ssse3;
+ p.findPosFirstLast = x265_findPosFirstLast_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -1498,6 +1499,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = x265_filterPixelToShort_32x32_ssse3;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_ssse3;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_ssse3;
+ p.findPosFirstLast = x265_findPosFirstLast_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/x86/pixel-util.h Mon Apr 20 19:58:29 2015 +0800
@@ -79,6 +79,7 @@
void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]); // SSSE3 assembly version of findPosFirstLast (pixel-util8.asm)
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/x86/pixel-util8.asm Mon Apr 20 19:58:29 2015 +0800
@@ -5659,3 +5659,41 @@
lea eax, [r11d - 1]
RET
%endif
+
+
+;-----------------------------------------------------------------------------
+; uint32_t findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]) -> (last << 16) | first
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal findPosFirstLast, 3,3,3 ; r0 = dstCoeff, r1 = trSize, r2 = scanTbl; result in eax
+ ; convert stride from int16_t units to bytes
+ add r1d, r1d ; r1 = trSize * 2
+
+ ; load the 16-entry scan table and narrow it from words to bytes
+ mova m0, [r2] ; entries 0..7 (mova and the packuswb memory operand need scanTbl 16-byte aligned)
+ packuswb m0, [r2 + mmsize] ; entries 8..15 appended: m0 = 16 byte indices for pshufb
+
+ ; load the 4 rows x 4 coeffs of the coding group
+ movh m1, [r0] ; row 0 -> low 8 bytes of m1
+ movhps m1, [r0 + r1] ; row 1 -> high 8 bytes of m1
+ movh m2, [r0 + r1 * 2] ; row 2 -> low 8 bytes of m2
+ lea r1, [r1 * 3] ; r1 = 3 * byte stride
+ movhps m2, [r0 + r1] ; row 3 -> high 8 bytes of m2
+ packsswb m1, m2 ; signed-saturate words to bytes: a non-zero coeff stays non-zero
+
+ ; build a zero mask: byte = 0xFF where the coeff is zero
+ pxor m2, m2 ; m2 = 0
+ pcmpeqb m1, m2
+
+ ; reorder by the scan table: byte n now describes scan position n
+ pshufb m1, m0
+
+ ; get First and Last pos
+ xor eax, eax ; bsf below writes only ax, so clear the upper half first
+ pmovmskb r0d, m1 ; bit n set = coeff at scan position n is zero
+ not r0w ; invert: bit n set = coeff at scan position n is non-zero
+ bsr r1w, r0w ; last = index of highest set bit (undefined if the mask is 0)
+ bsf ax, r0w ; first = index of lowest set bit (undefined if the mask is 0)
+ shl r1d, 16
+ or eax, r1d ; eax = (last << 16) | first
+ RET
diff -r bc9631ab9d36 -r e4ba0f962860 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Apr 20 12:41:16 2015 +0800
+++ b/source/test/pixelharness.cpp Mon Apr 20 19:58:29 2015 +0800
@@ -1266,6 +1266,53 @@
return true;
}
+/* Compare an optimized findPosFirstLast primitive against the reference one
+ * over ITERS coding groups taken from a random coefficient buffer, each with
+ * a random scan type and a random row stride.
+ * Returns false as soon as the two results differ, true otherwise. */
+bool PixelHarness::check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt)
+{
+    ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 2]);
+
+    for (int i = 0; i < 32 * 32; i++)
+    {
+        ref_src[i] = rand() & SHORT_MAX;
+    }
+
+    // extra test area all of 0x1234
+    for (int i = 0; i < ITERS * 2; i++)
+    {
+        ref_src[32 * 32 + i] = 0x1234;
+    }
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int rand_scan_type = rand() % NUM_SCAN_TYPE;
+        int rand_scan_size = (rand() % NUM_SCAN_SIZE) + 2;
+        // row stride in coefficients; the all-zero probe below must walk the
+        // group with the very same stride that is handed to ref and opt
+        const int trSize = (1 << rand_scan_size);
+        coeff_t *rand_src = ref_src + i;
+
+        const uint16_t* const scanTbl = g_scan4x4[rand_scan_type];
+
+        // look for any non-zero coefficient inside the coding group
+        int j;
+        for (j = 0; j < SCAN_SET_SIZE; j++)
+        {
+            const uint32_t idxY = j / MLS_CG_SIZE;
+            const uint32_t idxX = j % MLS_CG_SIZE;
+            if (rand_src[idxY * trSize + idxX]) break;
+        }
+
+        // fill one coeff when all coeff group are zero
+        if (j >= SCAN_SET_SIZE)
+            rand_src[0] = 0x0BAD;
+
+        uint32_t ref_scanPos = ref(rand_src, trSize, scanTbl);
+        uint32_t opt_scanPos = (int)checked(opt, rand_src, trSize, scanTbl);
+
+        if (ref_scanPos != opt_scanPos)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
{
if (opt.pu[part].satd)
@@ -1804,6 +1851,15 @@
}
}
+ if (opt.findPosFirstLast)
+ {
+ if (!check_findPosFirstLast(ref.findPosFirstLast, opt.findPosFirstLast))
+ {
+ printf("findPosFirstLast failed!\n");
+ return false;
+ }
+ }
+
return true;
}
@@ -2180,4 +2236,17 @@
memset(coefBuf + 32 * 31, 1, 32 * sizeof(coeff_t));
REPORT_SPEEDUP(opt.findPosLast, ref.findPosLast, g_scanOrder[SCAN_DIAG][NUM_SCAN_SIZE - 1], coefBuf, (uint16_t*)sbuf1, (uint16_t*)sbuf2, (uint8_t*)psbuf1, 32);
}
+
+ if (opt.findPosFirstLast)
+ {
+ HEADER0("findPosFirstLast");
+ coeff_t coefBuf[32 * MLS_CG_SIZE];
+ memset(coefBuf, 0, sizeof(coefBuf));
+ // every CG can't be all zeros!
+ coefBuf[3 + 0 * 32] = 0x0BAD;
+ coefBuf[3 + 1 * 32] = 0x0BAD;
+ coefBuf[3 + 2 * 32] = 0x0BAD;
+ coefBuf[3 + 3 * 32] = 0x0BAD;
+ REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, 32, g_scan4x4[SCAN_DIAG]);
+ }
}
diff -r bc9631ab9d36 -r e4ba0f962860 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Apr 20 12:41:16 2015 +0800
+++ b/source/test/pixelharness.h Mon Apr 20 19:58:29 2015 +0800
@@ -109,6 +109,7 @@
bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
bool check_calSign(sign_t ref, sign_t opt);
bool check_findPosLast(findPosLast_t ref, findPosLast_t opt);
+ bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
public:
More information about the x265-devel
mailing list