[x265] [PATCH] asm: new SSE4 primitive on saoStatsE2
Min Chen
chenm003 at 163.com
Wed May 27 01:34:28 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1432682219 25200
# Node ID c75ee6c1f0834505a6fd07a1ac9d487f798a44a0
# Parent 8ddc790790a46de9ceadea388f6271acdb3012ed
asm: new SSE4 primitive on saoStatsE2
---
source/common/primitives.h | 2 +
source/common/x86/asm-primitives.cpp | 1 +
source/common/x86/loopfilter.h | 1 +
source/common/x86/pixel-util8.asm | 167 +++++++++++++++++++++++++++++++++-
source/encoder/sao.cpp | 69 ++++++++------
source/test/pixelharness.cpp | 74 +++++++++++++++
source/test/pixelharness.h | 1 +
7 files changed, 285 insertions(+), 30 deletions(-)
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/primitives.h
--- a/source/common/primitives.h Tue May 26 10:33:56 2015 +0530
+++ b/source/common/primitives.h Tue May 26 16:16:59 2015 -0700
@@ -174,6 +174,7 @@
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); // upBufft: scratch sign row, swapped with upBuff1 after each row
typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
@@ -292,6 +293,7 @@
saoCuOrgE3_t saoCuOrgE3[2];
saoCuOrgB0_t saoCuOrgB0;
+ saoCuStatsE2_t saoCuStatsE2;
saoCuStatsE3_t saoCuStatsE3;
downscale_t frameInitLowres;
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue May 26 16:16:59 2015 -0700
@@ -1871,6 +1871,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = x265_filterPixelToShort_6x16_sse4;
#if X86_64
+ p.saoCuStatsE2 = x265_saoCuStatsE2_sse4;
p.saoCuStatsE3 = x265_saoCuStatsE3_sse4;
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue May 26 16:16:59 2015 -0700
@@ -39,6 +39,7 @@
void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+void x265_saoCuStatsE2_sse4(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count);
void x265_saoCuStatsE3_sse4(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 8ddc790790a4 -r c75ee6c1f083 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue May 26 10:33:56 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Tue May 26 16:16:59 2015 -0700
@@ -6054,6 +6054,172 @@
RET
+;void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
+;{
+; X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
+; X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
+; int x, y;
+; int32_t tmp_stats[SAO::NUM_EDGETYPE];
+; int32_t tmp_count[SAO::NUM_EDGETYPE];
+; memset(tmp_stats, 0, sizeof(tmp_stats));
+; memset(tmp_count, 0, sizeof(tmp_count));
+; for (y = 0; y < endY; y++)
+; {
+; upBufft[0] = signOf(rec[stride] - rec[-1]);
+; for (x = 0; x < endX; x++)
+; {
+; int signDown = signOf2(rec[x], rec[x + stride + 1]);
+; X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
+; uint32_t edgeType = signDown + upBuff1[x] + 2;
+; upBufft[x + 1] = (int8_t)(-signDown);
+; tmp_stats[edgeType] += (fenc[x] - rec[x]);
+; tmp_count[edgeType]++;
+; }
+; std::swap(upBuff1, upBufft);
+; rec += stride;
+; fenc += stride;
+; }
+; for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+; {
+; stats[SAO::s_eoTable[x]] += tmp_stats[x];
+; count[SAO::s_eoTable[x]] += tmp_count[x];
+; }
+;}
+
+%if ARCH_X86_64
+; TODO: x64 only because temporary registers r7 and r8 are needed; should be easy to port to x86
+INIT_XMM sse4
+cglobal saoCuStatsE2, 5,9,8,0-32 ; Stack is 32 bytes: 5 word counts at rsp then 5 dword stats at rsp + 10
+ mov r5d, r5m ; r5 = endX
+
+ ; clear internal temporary buffer
+ pxor m0, m0
+ mova [rsp], m0
+ mova [rsp + mmsize], m0
+ mova m0, [pb_128] ; XORed into the pixels before the signed byte compares below
+ mova m5, [pb_1] ; mask that turns a compare result into +1
+ mova m6, [pb_2] ; the +2 bias of edgeType
+
+.loopH:
+ ; TODO: merge into below SIMD
+ ; upBufft[0] = signOf(rec[stride] - rec[-1])
+ mov r6b, [r1 + r2]
+ sub r6b, [r1 - 1]
+ seta r6b ; 1 when rec[stride] > rec[-1] as unsigned bytes
+ setb r7b ; 1 when rec[stride] < rec[-1]
+ sub r6b, r7b
+ mov [r4], r6b
+
+ ; backup the 8 bytes starting at upBufft[endX + 1]: the 16-byte stores below may run past endX
+ movh m7, [r4 + r5 + 1]
+
+ mov r6d, r5d ; r6d = pixels left in this row
+.loopW:
+ movu m1, [r1] ; 16 rec pixels of this row
+ movu m2, [r1 + r2 + 1] ; their down-right neighbours
+
+ ; signDown
+ pxor m1, m0
+ pxor m2, m0
+ pcmpgtb m3, m1, m2 ; rec > down
+ pand m3, m5 ; becomes +1
+ pcmpgtb m2, m1 ; down > rec gives 0xFF which is -1
+ por m2, m3 ; m2 = signDown
+ pxor m3, m3
+ psubb m3, m2 ; m3 = -signDown
+
+ ; edgeType = signDown + upBuff1[x] + 2
+ movu m4, [r3]
+ paddb m4, m6
+ paddb m2, m4
+
+ ; upBufft[x + 1] = -signDown (becomes upBuff1 of the next row after the swap)
+ movu [r4 + 1], m3
+
+ ; stats[edgeType]
+ pxor m1, m0 ; undo the bias so m1 holds the rec pixels again
+ movu m3, [r0] ; 16 fenc pixels
+ punpckhbw m4, m3, m1 ; interleave fenc and rec bytes of pixels 8..15
+ punpcklbw m3, m1 ; same for pixels 0..7
+ pmaddubsw m3, [hmul_16p + 16] ; presumably +1 and -1 pairs so each word is fenc - rec (confirm constant)
+ pmaddubsw m4, [hmul_16p + 16]
+
+ ; 16 pixels, accumulated one at a time
+%assign x 0
+%rep 16
+ pextrb r7d, m2, x ; r7 = edgeType of this pixel
+ inc word [rsp + r7 * 2] ; tmp_count[edgeType]++
+
+ %if (x < 8)
+ pextrw r8d, m3, (x % 8)
+ %else
+ pextrw r8d, m4, (x % 8)
+ %endif
+ movsx r8d, r8w ; sign-extend the 16-bit difference
+ add [rsp + 5 * 2 + r7 * 4], r8d ; tmp_stats[edgeType] += fenc - rec
+
+ dec r6d
+ jz .next ; row done and the pointers are NOT advanced for this last group
+%assign x x+1
+%endrep
+
+ add r0, 16
+ add r1, 16
+ add r3, 16
+ add r4, 16
+ jmp .loopW
+
+.next:
+ xchg r3, r4 ; std::swap(upBuff1, upBufft)
+
+ ; rewind by the bytes .loopW really advanced: (endX - 1) & ~15, which stays correct when endX is a multiple of 16
+ lea r6d, [r5 - 1]
+ and r6d, ~15
+
+ ; move to next row
+ neg r6 ; must be a 64-bit negate because the offset is negative
+ add r3, r6
+ add r4, r6
+ add r6, r2
+ add r0, r6
+ add r1, r6
+
+ ; restore unavailable pixels
+ movh [r3 + r5 + 1], m7
+
+ dec byte r6m ; endY is counted down in its argument slot
+ jg .loopH
+
+ ; sum to global buffer
+ mov r1, r7m ; r1 = stats
+ mov r0, r8m ; r0 = count
+
+ ; s_eoTable = {1,2,0,3,4}
+ movzx r6d, word [rsp + 0 * 2]
+ add [r0 + 1 * 4], r6d
+ movzx r6d, word [rsp + 1 * 2]
+ add [r0 + 2 * 4], r6d
+ movzx r6d, word [rsp + 2 * 2]
+ add [r0 + 0 * 4], r6d
+ movzx r6d, word [rsp + 3 * 2]
+ add [r0 + 3 * 4], r6d
+ movzx r6d, word [rsp + 4 * 2]
+ add [r0 + 4 * 4], r6d
+
+ mov r6d, [rsp + 5 * 2 + 0 * 4]
+ add [r1 + 1 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 1 * 4]
+ add [r1 + 2 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 2 * 4]
+ add [r1 + 0 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 3 * 4]
+ add [r1 + 3 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 4 * 4]
+ add [r1 + 4 * 4], r6d
+ RET
+%endif ; ARCH_X86_64
+
+
;void saoStatE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
;{
; memset(tmp_stats, 0, sizeof(tmp_stats));
@@ -6080,7 +6246,6 @@
;}
%if ARCH_X86_64
-; TODO: x64 only because I need temporary register r7,r8, easy portab to x86
INIT_XMM sse4
cglobal saoCuStatsE3, 4,9,8,0-32 ; Stack: 5 of stats and 5 of count
mov r4d, r4m
diff -r 8ddc790790a4 -r c75ee6c1f083 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Tue May 26 10:33:56 2015 +0530
+++ b/source/encoder/sao.cpp Tue May 26 16:16:59 2015 -0700
@@ -866,35 +866,7 @@
primitives.sign(&upBuff1[startX], &rec[startX], &rec[startX - stride - 1], (endX - startX));
- memset(tmp_stats, 0, sizeof(tmp_stats));
- memset(tmp_count, 0, sizeof(tmp_count));
-
- for (y = startY; y < endY; y++)
- {
- upBufft[startX] = signOf(rec[startX + stride] - rec[startX - 1]);
- for (x = startX; x < endX; x++)
- {
- int signDown = signOf2(rec[x], rec[x + stride + 1]);
- X265_CHECK(signDown == signOf(rec[x] - rec[x + stride + 1]), "signDown check failure\n");
- uint32_t edgeType = signDown + upBuff1[x] + 2;
- upBufft[x + 1] = (int8_t)(-signDown);
- tmp_stats[edgeType] += (fenc[x] - rec[x]);
- tmp_count[edgeType]++;
- }
-
- std::swap(upBuff1, upBufft);
-
- rec += stride;
- fenc += stride;
- }
-
- stats = m_offsetOrg[plane][SAO_EO_2];
- count = m_count[plane][SAO_EO_2];
- for (x = 0; x < NUM_EDGETYPE; x++)
- {
- stats[s_eoTable[x]] += tmp_stats[x];
- count[s_eoTable[x]] += tmp_count[x];
- }
+ primitives.saoCuStatsE2(fenc0 + startX + startY * stride, rec0 + startX + startY * stride, stride, upBuff1 + startX, upBufft + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_2], m_count[plane][SAO_EO_2]);
}
// SAO_EO_3: // dir: 45
@@ -1642,6 +1614,44 @@
}
// NOTE: must put in namespace x265 since we need class SAO
+void saoCuStatsE2_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count)
+{
+    // Gathers SAO edge statistics against the down-right neighbour for an endX x endY area:
+    // per edge type, the sum of (fenc - rec) and the pixel count are added to stats[] / count[].
+    X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
+    X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
+
+    int32_t diffSum[SAO::NUM_EDGETYPE] = { 0 }; // indexed by raw edge type, remapped at the end
+    int32_t hits[SAO::NUM_EDGETYPE] = { 0 };
+
+    for (int row = 0; row < endY; row++, rec += stride, fenc += stride)
+    {
+        const pixel *diag = rec + stride + 1; // down-right neighbours of this row
+        upBufft[0] = signOf(rec[stride] - rec[-1]); // first up-sign of the next row
+        for (int col = 0; col < endX; col++)
+        {
+            const int signDown = signOf2(rec[col], diag[col]);
+            X265_CHECK(signDown == signOf(rec[col] - diag[col]), "signDown check failure\n");
+            const uint32_t edgeType = signDown + upBuff1[col] + 2;
+            upBufft[col + 1] = (int8_t)(-signDown); // seen from the down-right pixel the sign flips
+            diffSum[edgeType] += (fenc[col] - rec[col]);
+            hits[edgeType]++;
+        }
+
+        // rotate the buffers: the signs just written are the up-signs of the next row
+        int8_t *const written = upBufft;
+        upBufft = upBuff1;
+        upBuff1 = written;
+    }
+
+    // merge the local totals into the caller's arrays through the s_eoTable remapping
+    for (int type = 0; type < SAO::NUM_EDGETYPE; type++)
+    {
+        stats[SAO::s_eoTable[type]] += diffSum[type];
+        count[SAO::s_eoTable[type]] += hits[type];
+    }
+}
+
void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
@@ -1684,6 +1694,7 @@
void setupSaoPrimitives_c(EncoderPrimitives &p)
{
// TODO: move other sao functions to here
+ p.saoCuStatsE2 = saoCuStatsE2_c;
p.saoCuStatsE3 = saoCuStatsE3_c;
}
}
diff -r 8ddc790790a4 -r c75ee6c1f083 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue May 26 10:33:56 2015 +0530
+++ b/source/test/pixelharness.cpp Tue May 26 16:16:59 2015 -0700
@@ -1016,6 +1016,60 @@
return true;
}
+bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
+{ // Runs ref and opt on identical random inputs; returns false on the first mismatch of any output buffer
+ enum { NUM_EDGETYPE = 5 }; // element count of the stats/count arrays used here
+ int32_t stats_ref[NUM_EDGETYPE];
+ int32_t stats_vec[NUM_EDGETYPE];
+
+ int32_t count_ref[NUM_EDGETYPE];
+ int32_t count_vec[NUM_EDGETYPE];
+
+ int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1; // working pointers start one byte into each array
+ int8_t _upBufft_ref[MAX_CU_SIZE + 2], *upBufft_ref = _upBufft_ref + 1;
+ int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1;
+ int8_t _upBufft_vec[MAX_CU_SIZE + 2], *upBufft_vec = _upBufft_vec + 1;
+
+ int j = 0; // NOTE(review): only incremented below, never used to offset the pixel buffers
+
+ // NOTE: run more iterations than usual because the asm is NOT an exact match of the C code; the upBuff* output may differ
+ for (int i = 0; i < ITERS * 10; i++)
+ {
+ // initialize input data to random values; the dynamic range is wrong but good enough to verify the asm code
+ for (int x = 0; x < NUM_EDGETYPE; x++)
+ {
+ stats_ref[x] = stats_vec[x] = rand();
+ count_ref[x] = count_vec[x] = rand();
+ }
+
+ // initial signs, each in {-1, 0, 1}
+ for (int x = 0; x < MAX_CU_SIZE + 2; x++)
+ {
+ _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1;
+ _upBufft_ref[x] = _upBufft_vec[x] = (rand() % 3) - 1;
+ }
+
+ intptr_t stride = 16 * (rand() % 4 + 1); // 16, 32, 48 or 64
+ int endX = MAX_CU_SIZE - (rand() % 5) - 1; // MAX_CU_SIZE - 5 .. MAX_CU_SIZE - 1; NOTE(review): only the five largest widths are exercised
+ int endY = MAX_CU_SIZE - (rand() % 4) - 1; // MAX_CU_SIZE - 4 .. MAX_CU_SIZE - 1
+
+ ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, upBufft_ref, endX, endY, stats_ref, count_ref); // + 1 presumably because the kernel reads rec[-1] - confirm
+ checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, upBufft_vec, endX, endY, stats_vec, count_vec);
+
+ // TODO: stop checking upBuff*: the trailing output bytes may differ, and the buffers could move into a stack temporary in future
+ if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
+ || memcmp(_upBufft_ref, _upBufft_vec, sizeof(_upBufft_ref))
+ || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
+ || memcmp(count_ref, count_vec, sizeof(count_ref)))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt)
{
enum { NUM_EDGETYPE = 5 };
@@ -1894,6 +1948,15 @@
}
}
+ if (opt.saoCuStatsE2)
+ {
+ if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
+ {
+ printf("saoCuStatsE2 failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuStatsE3)
{
if (!check_saoCuStatsE3_t(ref.saoCuStatsE3, opt.saoCuStatsE3))
@@ -2304,6 +2367,17 @@
REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
}
+ if (opt.saoCuStatsE2)
+ {
+ int32_t stats[5], count[5];
+ int8_t upBuff1[MAX_CU_SIZE + 2];
+ int8_t upBufft[MAX_CU_SIZE + 2];
+ memset(upBuff1, 1, sizeof(upBuff1));
+ memset(upBufft, -1, sizeof(upBufft));
+ HEADER0("saoCuStatsE2");
+ REPORT_SPEEDUP(opt.saoCuStatsE2, ref.saoCuStatsE2, pbuf2, pbuf3, 64, upBuff1 + 1, upBufft + 1, 60, 61, stats, count);
+ }
+
if (opt.saoCuStatsE3)
{
int8_t upBuff1[MAX_CU_SIZE + 2];
diff -r 8ddc790790a4 -r c75ee6c1f083 source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue May 26 10:33:56 2015 +0530
+++ b/source/test/pixelharness.h Tue May 26 16:16:59 2015 -0700
@@ -100,6 +100,7 @@
bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
+ bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
More information about the x265-devel
mailing list