[x265] [PATCH 1 of 3] asm: sse4 code for saoCuStatsBO, improved 185378c->131279c
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Tue Jul 7 11:35:35 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1436247875 -19800
# Tue Jul 07 11:14:35 2015 +0530
# Node ID e0166f09f332af72a83eb059d878044db15f59bd
# Parent 523540864864752baea88ba0ac78cf292364bf7e
asm: sse4 code for saoCuStatsBO, improved 185378c->131279c
diff -r 523540864864 -r e0166f09f332 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jul 06 14:12:55 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 07 11:14:35 2015 +0530
@@ -2497,6 +2497,7 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = PFX(filterPixelToShort_6x16_sse4);
#if X86_64
+ p.saoCuStatsBO = PFX(saoCuStatsBO_sse4);
p.saoCuStatsE2 = PFX(saoCuStatsE2_sse4);
p.saoCuStatsE3 = PFX(saoCuStatsE3_sse4);
diff -r 523540864864 -r e0166f09f332 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Mon Jul 06 14:12:55 2015 +0530
+++ b/source/common/x86/loopfilter.asm Tue Jul 07 11:14:35 2015 +0530
@@ -29,6 +29,7 @@
SECTION_RODATA 32
pb_31: times 32 db 31
+pb_124: times 32 db 124
pb_15: times 32 db 15
pb_movemask_32: times 32 db 0x00
times 32 db 0xFF
@@ -41,6 +42,8 @@
cextern pw_1023
cextern pb_movemask
cextern pw_1
+cextern hmul_16p
+cextern pb_4
;============================================================================================================
@@ -1984,3 +1987,59 @@
.end:
RET
%endif
+
+;--------------------------------------------------------------------------------------------------------------------------
+; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+;--------------------------------------------------------------------------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsBO, 7,12,6 ; x86inc: 7 args (r0=fenc r1=rec r2=stride r3=endX r4=endY r5=stats r6=count), 12 GPRs, 6 XMM regs
+ mova m3, [hmul_16p + 16] ; pmaddubsw multipliers; presumably +1/-1 byte pairs (table defined elsewhere - confirm)
+ mova m4, [pb_124] ; byte mask 124 (0x7C): keeps bits 2..6 of every byte
+ mova m5, [pb_4] ; presumably bytes of 4, i.e. sizeof(int32_t) for the "+1" class bias (extern - confirm)
+ xor r7d, r7d ; clear the index register; every pextrb below rewrites r7d anyway
+
+.loopH:
+ mov r10, r0 ; r10 = fenc cursor for this row
+ mov r11, r1 ; r11 = rec cursor for this row
+ mov r9d, r3d ; r9d = pixels still to process in this row (starts at endX)
+.loopL:
+ movu m1, [r11] ; 16 rec bytes (always a full 16-byte load, even if fewer pixels remain)
+ movu m0, [r10] ; 16 fenc bytes
+
+ punpckhbw m2, m0, m1 ; m2 = interleaved fenc/rec bytes 8..15
+ punpcklbw m0, m1 ; m0 = interleaved fenc/rec bytes 0..7
+ psrlw m1, 1 ; word shift by 1; bits leaking across byte lanes are cleared by the 124 mask below
+ pmaddubsw m2, m3 ; 16-bit diffs for pixels 8..15 (fenc - rec per the stats comment below)
+ pmaddubsw m0, m3 ; 16-bit diffs for pixels 0..7
+ pand m1, m4 ; (rec >> 1) & 124 == (rec >> 3) * 4
+ paddb m1, m5 ; + 4 -> byte offset of int32 slot classIdx = 1 + (rec >> 3)
+
+%assign x 0
+%rep 16
+ pextrb r7d, m1, x ; r7 = byte offset of this pixel's class slot (zero-extended)
+
+%if (x < 8)
+ pextrw r8d, m0, (x % 8)
+%else
+ pextrw r8d, m2, (x % 8)
+%endif
+ movsx r8d, r8w ; pextrw zero-extends; restore the sign of the 16-bit difference
+ inc dword [r6 + r7] ; count[classIdx]++
+ add [r5 + r7], r8d ; stats[classIdx] += (fenc[x] - rec[x]);
+ dec r9d ; one pixel consumed; endX is decremented before it is tested, so it must be >= 1
+ jz .next ; row finished: leave the unrolled block, possibly mid-vector
+%assign x x+1
+%endrep
+
+ add r10, 16 ; advance both cursors to the next 16 pixels of the row
+ add r11, 16
+ jmp .loopL
+
+.next:
+ add r0, r2 ; step fenc and rec to the next row (same stride for both)
+ add r1, r2
+ dec r4d ; rows remaining (endY); decremented before it is tested, so it must be >= 1
+ jnz .loopH
+ RET
+%endif
diff -r 523540864864 -r e0166f09f332 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Mon Jul 06 14:12:55 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue Jul 07 11:14:35 2015 +0530
@@ -35,6 +35,7 @@
void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
+ void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 523540864864 -r e0166f09f332 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jul 06 14:12:55 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Jul 07 11:14:35 2015 +0530
@@ -1017,6 +1017,42 @@
return true;
}
+bool PixelHarness::check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt)
+{
+    // Band-offset statistics use classIdx = 1 + (rec[x] >> 3), hence 33 accumulator slots.
+    enum { NUM_EDGETYPE = 33 };
+    int32_t stats_ref[NUM_EDGETYPE], count_ref[NUM_EDGETYPE];
+    int32_t stats_vec[NUM_EDGETYPE], count_vec[NUM_EDGETYPE];
+    for (int iter = 0, offset = 0; iter < ITERS; iter++, offset += INCR)
+    {
+        // Seed both accumulator sets with identical random values; the dynamic range
+        // is unrealistic, but it is good enough to verify the asm code.
+        for (int cls = 0; cls < NUM_EDGETYPE; cls++)
+        {
+            const int32_t statSeed = rand();
+            const int32_t countSeed = rand();
+            stats_ref[cls] = stats_vec[cls] = statSeed;
+            count_ref[cls] = count_vec[cls] = countSeed;
+        }
+
+        // Random geometry for this round; rand() is consumed in the order stride, endX, endY.
+        const intptr_t stride = 16 * (rand() % 4 + 1);
+        const int endX = MAX_CU_SIZE - (rand() % 5);
+        const int endY = MAX_CU_SIZE - (rand() % 4) - 1;
+
+        ref(pbuf2 + offset + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+        checked(opt, pbuf2 + offset + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+
+        // Both the accumulated differences and the per-class counts must match exactly.
+        const bool statsDiffer = memcmp(stats_ref, stats_vec, sizeof(stats_ref)) != 0;
+        const bool countsDiffer = memcmp(count_ref, count_vec, sizeof(count_ref)) != 0;
+        if (statsDiffer || countsDiffer)
+            return false;
+
+        reportfail();
+    }
+    return true;
+}
+
bool PixelHarness::check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt)
{
enum { NUM_EDGETYPE = 5 };
@@ -2094,6 +2130,15 @@
}
}
+ if (opt.saoCuStatsBO)
+ {
+ if (!check_saoCuStatsBO_t(ref.saoCuStatsBO, opt.saoCuStatsBO))
+ {
+ printf("saoCuStatsBO failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuStatsE2)
{
if (!check_saoCuStatsE2_t(ref.saoCuStatsE2, opt.saoCuStatsE2))
@@ -2526,6 +2571,13 @@
REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
}
+ if (opt.saoCuStatsBO)
+ {
+ int32_t stats[33], count[33];
+ HEADER0("saoCuStatsBO");
+ REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
+ }
+
if (opt.saoCuStatsE2)
{
int32_t stats[5], count[5];
diff -r 523540864864 -r e0166f09f332 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Jul 06 14:12:55 2015 +0530
+++ b/source/test/pixelharness.h Tue Jul 07 11:14:35 2015 +0530
@@ -100,6 +100,7 @@
bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
+ bool check_saoCuStatsBO_t(saoCuStatsBO_t ref, saoCuStatsBO_t opt);
bool check_saoCuStatsE2_t(saoCuStatsE2_t ref, saoCuStatsE2_t opt);
bool check_saoCuStatsE3_t(saoCuStatsE3_t ref, saoCuStatsE3_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
More information about the x265-devel
mailing list