[x265] [PATCH 3 of 3] asm: sse4 code for saoCuStatsE1, improved 320369c->151086c
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jul 7 11:35:37 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1436252372 -19800
# Tue Jul 07 12:29:32 2015 +0530
# Node ID 25a8323b886f480347f4b0813f7ded18e579704a
# Parent 235930aae11da04863e3fb13905e2d1d95e3dc0a
asm: sse4 code for saoCuStatsE1, improved 320369c->151086c
diff -r 235930aae11d -r 25a8323b886f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 12:29:32 2015 +0530
@@ -2499,6 +2499,7 @@
#if X86_64
p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
p.saoCuStatsE0 = PFX(saoCuStatsE0_sse4);
+ p.saoCuStatsE1 = PFX(saoCuStatsE1_sse4);
p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.asm Tue Jul 07 12:29:32 2015 +0530
@@ -2159,3 +2159,122 @@
add [r1 + 4 * 4], r6d
RET
%endif
+
+;-------------------------------------------------------------------------------------------------------------------------------------------
+; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+;-------------------------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE1, 4,11,9,0-32 ; Stack (32 bytes): 5 word counters at [rsp], then 5 dword stats sums at [rsp + 5 * 2]
+ mov r4d, r4m ; r4d = endX (5th argument in the signature comment above)
+ mov r5d, r5m ; r5d = endY; note r5 is overwritten at .loopH, the row count is decremented directly in r5m
+
+ ; clear internal temporary buffer
+ pxor m0, m0
+ mova [rsp], m0
+ mova [rsp + mmsize], m0
+ mova m0, [pb_128] ; xor mask used around the signed byte compares (constant defined elsewhere, presumably 0x80 per byte)
+ mova m5, [pb_1] ; constant defined elsewhere, presumably 1 per byte
+ mova m6, [pb_2] ; constant defined elsewhere, presumably 2 per byte
+ mova m8, [hmul_16p + 16] ; pmaddubsw multiplier; presumably alternating +1/-1 bytes - TODO confirm against the table definition
+ movh m7, [r3 + r4] ; save the upBuff1 bytes starting at index endX; written back after the loops
+
+.loopH:
+ mov r6d, r4d ; r6d = pixels left in this row, starts at endX
+ mov r9, r0 ; r9 = fenc cursor for this row
+ mov r10, r1 ; r10 = rec cursor for this row
+ mov r5, r3 ; r5 = upBuff1 cursor, restarted from the buffer base every row
+
+.loopW:
+ movu m1, [r10] ; 16 rec bytes of the current row
+ movu m2, [r10 + r2] ; 16 rec bytes one stride further
+
+ ; signDown
+ pxor m1, m0 ; xor both operands with pb_128 before the signed compares (presumably to get unsigned ordering)
+ pxor m2, m0
+ pcmpgtb m3, m1, m2 ; m3 = 0xFF in bytes where current > next-row
+ pand m3, m5 ; reduce those bytes to the pb_1 value
+ pcmpgtb m2, m1 ; m2 = 0xFF (-1) in bytes where next-row > current
+ por m2, m3 ; m2 = merged per-byte sign result (signDown)
+ pxor m3, m3
+ psubb m3, m2 ; -signDown
+
+ ; edgeType
+ movu m4, [r5] ; 16 bytes of upBuff1 at the current column
+ paddb m4, m6 ; upBuff1 + pb_2
+ paddb m2, m4 ; m2 = signDown + upBuff1 + pb_2, used below as the per-pixel table index
+
+ ; update upBuff1
+ movu [r5], m3 ; stores all 16 bytes, even when fewer than 16 pixels remain in the row
+
+ ; stats[edgeType]
+ pxor m1, m0 ; undo the earlier xor to recover the original rec bytes
+ movu m3, [r9] ; 16 fenc bytes
+ punpckhbw m4, m3, m1 ; interleave fenc/rec bytes of the high 8 pixels
+ punpcklbw m3, m1 ; interleave fenc/rec bytes of the low 8 pixels
+ pmaddubsw m3, m8 ; one signed word per (fenc, rec) byte pair; presumably fenc - rec given the multiplier
+ pmaddubsw m4, m8
+
+ ; 16 pixels
+%assign x 0
+%rep 16
+ pextrb r7d, m2, x ; r7 = edgeType index of pixel x
+ inc word [rsp + r7 * 2] ; bump the 16-bit counter for this edgeType
+
+ %if (x < 8)
+ pextrw r8d, m3, (x % 8)
+ %else
+ pextrw r8d, m4, (x % 8)
+ %endif
+ movsx r8d, r8w ; sign-extend the extracted word to 32 bits
+ add [rsp + 5 * 2 + r7 * 4], r8d ; accumulate into the 32-bit sum for this edgeType
+
+ dec r6d
+ jz .next ; row is finished once endX pixels have been consumed
+%assign x x+1
+%endrep
+
+ add r9, 16
+ add r10, 16
+ add r5, 16
+ jmp .loopW
+
+.next:
+ ; advance fenc and rec to the next row (the upBuff1 cursor is reloaded from r3 at .loopH)
+ add r0, r2
+ add r1, r2
+
+ dec byte r5m ; row counter kept in the endY argument slot. NOTE(review): requires r5m to be a memory operand; if x86inc maps r5m to a register on some ABI it would alias r5 (the upBuff1 cursor) - confirm
+ jg .loopH
+
+ ; restore unavailable pixels
+ movh [r3 + r4], m7 ; write back the bytes saved before the loops. NOTE(review): movh is presumably an 8-byte move, while the 16-byte store above can reach further past endX - confirm callers' endX alignment
+
+ ; sum to global buffer
+ mov r1, r6m ; r1 = stats (7th argument)
+ mov r0, r7m ; r0 = count (8th argument)
+
+ ; s_eoTable = {1,2,0,3,4}
+ movzx r6d, word [rsp + 0 * 2] ; local word counters are added into count[] at the remapped index
+ add [r0 + 1 * 4], r6d
+ movzx r6d, word [rsp + 1 * 2]
+ add [r0 + 2 * 4], r6d
+ movzx r6d, word [rsp + 2 * 2]
+ add [r0 + 0 * 4], r6d
+ movzx r6d, word [rsp + 3 * 2]
+ add [r0 + 3 * 4], r6d
+ movzx r6d, word [rsp + 4 * 2]
+ add [r0 + 4 * 4], r6d
+
+ mov r6d, [rsp + 5 * 2 + 0 * 4] ; local dword sums are added into stats[] with the same remapping
+ add [r1 + 1 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 1 * 4]
+ add [r1 + 2 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 2 * 4]
+ add [r1 + 0 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 3 * 4]
+ add [r1 + 3 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 4 * 4]
+ add [r1 + 4 * 4], r6d
+ RET
+%endif ; ARCH_X86_64
diff -r 235930aae11d -r 25a8323b886f source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Jul 07 12:17:08 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue Jul 07 12:29:32 2015 +0530
@@ -37,6 +37,7 @@
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+ void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 235930aae11d -r 25a8323b886f source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Jul 07 12:17:08 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Jul 07 12:29:32 2015 +0530
@@ -1089,6 +1089,52 @@
return true;
}
+bool PixelHarness::check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt) // Runs ref and opt on identical random inputs; returns false on the first mismatch in upBuff1, stats or count.
+{
+ enum { NUM_EDGETYPE = 5 };
+ int32_t stats_ref[NUM_EDGETYPE];
+ int32_t stats_vec[NUM_EDGETYPE];
+
+ int32_t count_ref[NUM_EDGETYPE];
+ int32_t count_vec[NUM_EDGETYPE];
+
+ int8_t _upBuff1_ref[MAX_CU_SIZE + 2], *upBuff1_ref = _upBuff1_ref + 1; // the pointer handed to ref starts one element into the backing array
+ int8_t _upBuff1_vec[MAX_CU_SIZE + 2], *upBuff1_vec = _upBuff1_vec + 1; // same layout for the optimized primitive
+
+ int j = 0; // NOTE(review): advanced by INCR every iteration but never used to offset the input pointers below
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ // initialize the accumulators to random values; the dynamic range is wrong, but good enough to verify our asm code
+ for (int x = 0; x < NUM_EDGETYPE; x++)
+ {
+ stats_ref[x] = stats_vec[x] = rand();
+ count_ref[x] = count_vec[x] = rand();
+ }
+
+ // initial sign
+ for (int x = 0; x < MAX_CU_SIZE + 2; x++)
+ _upBuff1_ref[x] = _upBuff1_vec[x] = (rand() % 3) - 1; // each entry is -1, 0 or 1
+
+ intptr_t stride = 16 * (rand() % 4 + 1); // 16, 32, 48 or 64
+ int endX = MAX_CU_SIZE - (rand() % 5); // MAX_CU_SIZE - 4 .. MAX_CU_SIZE
+ int endY = MAX_CU_SIZE - (rand() % 4) - 1; // MAX_CU_SIZE - 4 .. MAX_CU_SIZE - 1
+
+ ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+ checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+
+ if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref)) // compares the whole backing arrays, including the elements before and after the working range
+ || memcmp(stats_ref, stats_vec, sizeof(stats_ref))
+ || memcmp(count_ref, count_vec, sizeof(count_ref)))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
+
bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
{
enum { NUM_EDGETYPE = 5 };
@@ -2184,6 +2230,15 @@
}
}
+ if (opt.saoCuStatsE1)
+ {
+ if (!check_saoCuStatsE1_t(ref.saoCuStatsE1, opt.saoCuStatsE1))
+ {
+ printf("saoCuStatsE1 failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuStatsE2)
{
if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2630,6 +2685,15 @@
REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
}
+ if (opt.saoCuStatsE1)
+ {
+ int32_t stats[5], count[5];
+ int8_t upBuff1[MAX_CU_SIZE + 2];
+ memset(upBuff1, 1, sizeof(upBuff1));
+ HEADER0("saoCuStatsE1");
+ REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, pbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
+ }
+
if (opt.saoCuStatsE2)
{
int32_t stats[5], count[5];
diff -r 235930aae11d -r 25a8323b886f source/test/pixelharness.h
--- a/source/test/pixelharness.h Tue Jul 07 12:17:08 2015 +0530
+++ b/source/test/pixelharness.h Tue Jul 07 12:29:32 2015 +0530
@@ -102,6 +102,7 @@
bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
bool check_saoCuStatsE0_t(saoCuStatsE0_t ref, saoCuStatsE0_t opt);
+ bool check_saoCuStatsE1_t(saoCuStatsE1_t ref, saoCuStatsE1_t opt);
bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
More information about the x265-devel mailing list