[x265] [PATCH 2 of 3] asm: sse4 code for saoCuStatsE0, improved 250341c->147284c
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jul 7 11:35:36 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1436251628 -19800
# Tue Jul 07 12:17:08 2015 +0530
# Node ID 235930aae11da04863e3fb13905e2d1d95e3dc0a
# Parent e0166f09f332af72a83eb059d878044db15f59bd
asm: sse4 code for saoCuStatsE0, improved 250341c->147284c
diff -r e0166f09f332 -r 235930aae11d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 12:17:08 2015 +0530
@@ -2498,6 +2498,7 @@
#if X86_64
p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
+ p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.asm Tue Jul 07 12:17:08 2015 +0530
@@ -2043,3 +2043,119 @@
jnz .loopH
RET
%endif
+
+;-----------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;
+; Gathers SAO statistics for the left/right-neighbour edge pattern over endY rows of endX pixels each:
+;   edgeType            = sign(rec[x] - rec[x - 1]) + sign(rec[x] - rec[x + 1]) + 2      (0..4)
+;   tmp_count[edgeType] += 1
+;   tmp_stats[edgeType] += fenc[x] - rec[x]
+; The temporaries live in the 32-byte stack area (5 x 16-bit counts at [rsp], 5 x 32-bit sums at [rsp + 10])
+; and are added to count[] / stats[] once at the end, reordered through s_eoTable = {1, 2, 0, 3, 4}.
+;
+; Register use: r0 = fenc, r1 = rec, r2 = stride, r3 = endX, r4 = rows left, r5 = pixels left in the row,
+;               r6/r7 = scratch, m0 = signRight of the previous 16 pixels (byte 15 feeds the next signLeft).
+;
+; Things a caller should know:
+;   - rec[-1] is read at the start of every row, and each pass loads 16 bytes from fenc and 17 from rec, so
+;     bytes beyond endX are read (only the first endX pixels of a row are accumulated).
+;   - Counts are kept in 16-bit words until the final merge.
+;   - Pixels are compared and subtracted as bytes.
+;   - NOTE(review): the pointer fix-up in .next looks correct only when endX is not a multiple of 16. For
+;     endX % 16 == 0 the last 16-pixel pass leaves through "jz .next" before "add r0q, 16" runs, so the row
+;     advance would be stride - 16. The harness in this patch only uses endX in 59..63 -- confirm that no
+;     caller passes a multiple of 16.
+;-----------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE0, 5,8,8, 0-32
+ mov r3d, r3m ; 32-bit write clears the upper half of r3; .next uses r3 as a 64-bit operand
+
+ ; clear internal temporary buffer: tmp_count (5 words) at [rsp + 0], tmp_stats (5 dwords) at [rsp + 10]
+ pxor m0, m0
+ mova [rsp], m0
+ mova [rsp + mmsize], m0
+ mova m4, [pb_128] ; presumably 16 x 0x80 (defined outside this hunk): bias for the byte compares, negative mask for psignb
+ mova m5, [hmul_16p + 16] ; pmaddubsw multipliers; presumably (+1, -1) byte pairs so each word becomes fenc - rec
+ mova m6, [pb_2] ; presumably 16 x 2: the "+ 2" of edgeType
+ xor r7d, r7d ; r7 = 0 before the byte-sized writes to r7b below
+
+.loopH:
+ mov r5d, r3d ; r5 = pixels left in this row (endX)
+
+ ; calculate signLeft of the first pixel of the row, stored negated (psignb in .loopL flips it back)
+ mov r7b, [r1] ; rec[0]
+ sub r7b, [r1 - 1] ; flags of the unsigned compare rec[0] vs rec[-1]
+ seta r7b ; 1 if rec[0] > rec[-1]
+ setb r6b ; 1 if rec[0] < rec[-1] (same flags, set* does not modify them)
+ sub r7b, r6b ; sign(rec[0] - rec[-1])
+ neg r7b ; -sign(rec[0] - rec[-1]), i.e. what signRight of pixel -1 would be
+ pinsrb m0, r7d, 15 ; byte 15 is the one palignr shifts in as left neighbour of pixel 0
+
+.loopL:
+ movu m7, [r1] ; rec[x .. x + 15]
+ movu m2, [r1 + 1] ; rec[x + 1 .. x + 16]
+
+ pxor m1, m7, m4 ; bias both operands so the signed pcmpgtb orders them as unsigned pixels
+ pxor m3, m2, m4
+ pcmpgtb m2, m1, m3 ; 0xFF where rec[x] > rec[x + 1]
+ pcmpgtb m3, m1 ; 0xFF (-1) where rec[x] < rec[x + 1]
+ pand m2, [pb_1] ; +1 where rec[x] > rec[x + 1] (pb_1 presumably 16 x 1)
+ por m2, m3 ; signRight
+
+ palignr m3, m2, m0, 15 ; bytes = { m0[15], m2[0..14] }: signRight of each pixel's left neighbour
+ psignb m3, m4 ; signLeft = -signRight[x - 1] (every m4 byte presumably negative, so all bytes are negated)
+
+ mova m0, m2 ; keep signRight; its byte 15 is the carry into the next 16 pixels
+ paddb m2, m3
+ paddb m2, m6 ; edgeType = signRight + signLeft + 2
+
+ ; stats[edgeType]
+ movu m3, [r0] ; fenc[0-15]
+ punpckhbw m1, m3, m7 ; (fenc, rec) byte pairs of pixels 8..15
+ punpcklbw m3, m7 ; (fenc, rec) byte pairs of pixels 0..7
+ pmaddubsw m1, m5 ; words: fenc - rec for pixels 8..15 (given the multipliers noted above)
+ pmaddubsw m3, m5 ; words: fenc - rec for pixels 0..7
+
+ ; scalar accumulation of the 16 pixels of this pass, unrolled; leaves early once endX pixels are done
+%assign x 0
+%rep 16
+ pextrb r7d, m2, x ; edgeType of pixel x (zero-extended)
+
+%if (x < 8)
+ pextrw r6d, m3, (x % 8)
+%else
+ pextrw r6d, m1, (x % 8)
+%endif
+ movsx r6d, r6w ; the difference is a signed 16-bit value
+ inc word [rsp + r7 * 2] ; tmp_count[edgeType]++
+ add [rsp + 5 * 2 + r7 * 4], r6d ; tmp_stats[edgeType] += (fenc[x] - rec[x])
+ dec r5d
+ jz .next ; row complete
+%assign x x+1
+%endrep
+
+ add r0q, 16 ; full pass consumed: move on to the next 16 pixels of the row
+ add r1q, 16
+ jmp .loopL
+
+.next:
+ mov r6d, r3d
+ and r6d, 15 ; endX % 16
+
+ sub r6, r3 ; (endX % 16) - endX
+ add r6, r2 ; + stride: undoes the 16-byte steps taken in this row and adds one row pitch
+ add r0, r6 ; fenc -> next row (see NOTE(review) in the header for endX % 16 == 0)
+ add r1, r6 ; rec  -> next row
+
+ dec r4d
+ jnz .loopH
+
+ ; sum to global buffer
+ mov r1, r5m ; r1 = stats (argument 5)
+ mov r0, r6m ; r0 = count (argument 6)
+
+ ; s_eoTable = {1, 2, 0, 3, 4}: temporary entry i is added to output slot s_eoTable[i]
+ movzx r5d, word [rsp + 0 * 2]
+ add [r0 + 1 * 4], r5d
+ movzx r6d, word [rsp + 1 * 2]
+ add [r0 + 2 * 4], r6d
+ movzx r5d, word [rsp + 2 * 2]
+ add [r0 + 0 * 4], r5d
+ movzx r6d, word [rsp + 3 * 2]
+ add [r0 + 3 * 4], r6d
+ movzx r5d, word [rsp + 4 * 2]
+ add [r0 + 4 * 4], r5d
+
+ mov r6d, [rsp + 5 * 2 + 0 * 4] ; same remapping for the difference sums
+ add [r1 + 1 * 4], r6d
+ mov r5d, [rsp + 5 * 2 + 1 * 4]
+ add [r1 + 2 * 4], r5d
+ mov r6d, [rsp + 5 * 2 + 2 * 4]
+ add [r1 + 0 * 4], r6d
+ mov r5d, [rsp + 5 * 2 + 3 * 4]
+ add [r1 + 3 * 4], r5d
+ mov r6d, [rsp + 5 * 2 + 4 * 4]
+ add [r1 + 4 * 4], r6d
+ RET
+%endif
diff -r e0166f09f332 -r 235930aae11d source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Jul 07 11:14:35 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue Jul 07 12:17:08 2015 +0530
@@ -36,6 +36,7 @@
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+ void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r e0166f09f332 -r 235930aae11d source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Jul 07 11:14:35 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Jul 07 12:17:08 2015 +0530
@@ -1053,6 +1053,42 @@
return true;
}
+bool PixelHarness::check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt)
+{
+    // Compares the optimized saoCuStatsE0 primitive against the C reference: both start from the
+    // same accumulator contents and must leave identical stats[] and count[] behind.
+    enum { NUM_EDGETYPE = 5 };
+
+    int32_t refStats[NUM_EDGETYPE], refCount[NUM_EDGETYPE]; // updated by the reference
+    int32_t optStats[NUM_EDGETYPE], optCount[NUM_EDGETYPE]; // updated by the primitive under test
+
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        // Seed the accumulators with random values. The dynamic range is not realistic, but it is
+        // good enough to verify that the asm code adds to what is already there.
+        for (int edge = 0; edge < NUM_EDGETYPE; edge++)
+        {
+            const int statSeed = rand();
+            refStats[edge] = statSeed;
+            optStats[edge] = statSeed;
+
+            const int countSeed = rand();
+            refCount[edge] = countSeed;
+            optCount[edge] = countSeed;
+        }
+
+        // random geometry for this iteration (drawn in this order: stride, width, height)
+        const intptr_t stride = 16 * (rand() % 4 + 1);
+        const int endX = MAX_CU_SIZE - (rand() % 5) - 1;
+        const int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        // the "+ 1" keeps one pixel in front of the pointers that are handed to the primitives
+        ref(pbuf2 + offset + 1, pbuf3 + offset + 1, stride, endX, endY, refStats, refCount);
+        checked(opt, pbuf2 + offset + 1, pbuf3 + offset + 1, stride, endX, endY, optStats, optCount);
+
+        if (memcmp(refStats, optStats, sizeof(refStats)))
+            return false;
+        if (memcmp(refCount, optCount, sizeof(refCount)))
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
{
enum { NUM_EDGETYPE = 5 };
@@ -2139,6 +2175,15 @@
}
}
+ if (opt.saoCuStatsE0)
+ {
+ if (!check_saoCuStatsE0_t(ref.saoCuStatsE0, opt.saoCuStatsE0))
+ {
+ printf("saoCuStatsE0 failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuStatsE2)
{
if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2578,6 +2623,13 @@
REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
}
+ if (opt.saoCuStatsE0)
+ {
+     // E0 accumulates into five edge-type slots only (the 33 used before was copied from the
+     // band-offset benchmark); start from zero so the primitives never add to indeterminate values.
+     int32_t stats[5] = { 0 }, count[5] = { 0 };
+     HEADER0("saoCuStatsE0");
+     // saoCuStatsE0 reads rec[-1] at the start of every row, so pass pointers one pixel into the
+     // buffers, exactly as check_saoCuStatsE0_t does.
+     REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2 + 1, pbuf3 + 1, 64, 60, 61, stats, count);
+ }
+
if (opt.saoCuStatsE2)
{
int32_t stats[5], count[5];
diff -r e0166f09f332 -r 235930aae11d source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Jul 07 11:14:35 2015 +0530
+++ b/source/test/pixelharness.h Tue Jul 07 12:17:08 2015 +0530
@@ -101,6 +101,7 @@
bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
+ bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
More information about the x265-devel mailing list