[x265] [PATCH] added calSign primitive, improved 2316.99 -> 233.63 (9.92x) over C code
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Dec 30 15:08:08 CET 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1419948480 -19800
# Node ID c107c70e154b18e50b7f29f08724a0d923859cfb
# Parent 10a16175a843d46df9c84523edce2d61d20761fa
added calSign primitive, improved 2316.99 -> 233.63 (9.92x) over C code
The calSign primitive will be used to optimize the various switch cases of the SAO algorithm
diff -r 10a16175a843 -r c107c70e154b source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/loopfilter.cpp Tue Dec 30 19:38:00 2014 +0530
@@ -28,6 +28,18 @@
#define PIXEL_MIN 0
#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
+/* Return -1, 0 or +1 for negative, zero or positive x (TODO: this is a dup, make common) */
+inline int8_t signOf(int x)
+{
+    return (int8_t)((x > 0) - (x < 0)); /* avoids negating x and shifting a negative value */
+}
+/* C reference: dst[x] = sign of (src1[x] - src2[x]) for every x in [0, endX) */
+void calSign(int8_t *dst, pixel *src1, pixel *src2, int endX)
+{
+    for (int x = 0; x < endX; x++)
+        dst[x] = signOf(src1[x] - src2[x]);
+}
+
void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
{
int x;
@@ -49,5 +61,6 @@
void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
{
p.saoCuOrgE0 = processSaoCUE0;
+ p.sign = calSign;
}
}
diff -r 10a16175a843 -r c107c70e154b source/common/primitives.h
--- a/source/common/primitives.h Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/primitives.h Tue Dec 30 19:38:00 2014 +0530
@@ -191,6 +191,7 @@
typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*sign_t)(int8_t *dst, pixel *src1, pixel *src2, int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -240,6 +241,7 @@
ssim_4x4x2_core_t ssim_4x4x2_core;
ssim_end4_t ssim_end_4;
+ sign_t sign;
saoCuOrgE0_t saoCuOrgE0;
downscale_t frameInitLowres;
diff -r 10a16175a843 -r c107c70e154b source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 30 19:38:00 2014 +0530
@@ -1642,6 +1642,7 @@
}
if (cpuMask & X265_CPU_SSE4)
{
+ p.sign = x265_calSign_sse4;
p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
LUMA_ADDAVG(_sse4);
diff -r 10a16175a843 -r c107c70e154b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/x86/loopfilter.asm Tue Dec 30 19:38:00 2014 +0530
@@ -83,3 +83,33 @@
sub r2d, 16
jnz .loop
RET
+
+;============================================================================================================
+; void calSign(int8_t *dst, pixel *src1, pixel *src2, int endX)
+;============================================================================================================
+INIT_XMM sse4
+cglobal calSign, 4, 4, 7
+    ; dst[x] = -1, 0 or +1 for src1[x] <, == or > src2[x]; 16 byte-sized pixels are handled per iteration.
+    ; NOTE(review): endX must be a non-zero multiple of 16 (0..15 wraps the counter, a remainder is skipped)
+    mova        m1, [pb_128]        ; sign-bit mask, presumably 0x80 in every byte (defined elsewhere)
+    mova        m0, [pb_1]
+    shr         r3d, 4              ; r3d = number of 16-byte iterations
+.loop:
+    movu        m2, [r1]            ; m2 = src1[x]
+    movu        m3, [r2]            ; m3 = src2[x]
+
+    pxor        m4, m2, m1          ; bias both inputs so the signed pcmpgtb orders them as unsigned
+    pxor        m5, m3, m1
+    pcmpgtb     m6, m4, m5          ; m6 = 0xFF where src1[x] > src2[x]
+    pcmpgtb     m5, m4              ; m5 = 0xFF (-1) where src1[x] < src2[x]
+    pand        m6, m0              ; m6 = +1 where src1[x] > src2[x]
+    por         m6, m5              ; m6 = +1, -1 or 0
+
+    movu        [r0], m6
+
+    add         r0, 16
+    add         r1, 16
+    add         r2, 16
+    dec         r3d
+    jnz         .loop
+    RET
diff -r 10a16175a843 -r c107c70e154b source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Dec 23 13:07:09 2014 +0530
+++ b/source/common/x86/loopfilter.h Tue Dec 30 19:38:00 2014 +0530
@@ -25,5 +25,6 @@
#define X265_LOOPFILTER_H
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_calSign_sse4(int8_t *dst, pixel *src1, pixel *src2, int endX);
#endif // ifndef X265_LOOPFILTER_H
diff -r 10a16175a843 -r c107c70e154b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Dec 23 13:07:09 2014 +0530
+++ b/source/test/pixelharness.cpp Tue Dec 30 19:38:00 2014 +0530
@@ -856,6 +856,33 @@
return true;
}
+/* Run the reference and optimized sign primitives on the same inputs and compare their outputs */
+bool PixelHarness::check_calSign(sign_t ref, sign_t opt)
+{
+    ALIGN_VAR_16(int8_t, ref_dest[64 * 64]);
+    ALIGN_VAR_16(int8_t, opt_dest[64 * 64]);
+
+    /* same fill in both buffers; the whole buffers are compared below, not just the first width bytes */
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    for (int i = 0, offset = 0; i < ITERS; i++, offset += INCR)
+    {
+        /* random width: 16, 32, 48 or 64 */
+        const int width = 16 * (rand() % 4 + 1);
+
+        ref(ref_dest, pbuf2 + offset, pbuf3 + offset, width);
+        checked(opt, opt_dest, pbuf2 + offset, pbuf3 + offset, width);
+
+        if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)) != 0)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1386,6 +1413,15 @@
}
}
+ if (opt.sign)
+ {
+ if (!check_calSign(ref.sign, opt.sign))
+ {
+ printf("calSign failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuOrgE0)
{
if (!check_saoCuOrgE0_t(ref.saoCuOrgE0, opt.saoCuOrgE0))
@@ -1712,6 +1748,12 @@
REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4);
}
+ if (opt.sign)
+ {
+ HEADER0("calSign");
+ REPORT_SPEEDUP(opt.sign, ref.sign, psbuf1, pbuf1, pbuf2, 64);
+ }
+
if (opt.saoCuOrgE0)
{
HEADER0("SAO_EO_0");
diff -r 10a16175a843 -r c107c70e154b source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Dec 23 13:07:09 2014 +0530
+++ b/source/test/pixelharness.h Tue Dec 30 19:38:00 2014 +0530
@@ -93,6 +93,7 @@
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);
+ bool check_calSign(sign_t ref, sign_t opt);
public:
More information about the x265-devel
mailing list