[x265] [PATCH 1 of 2] asm: new SSE4 primitive for saoStatsE3
Min Chen
chenm003 at 163.com
Sat May 23 00:52:07 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1432332676 25200
# Node ID 3ce8e58d2d1cc58527be38bfe69ee42d82c3ccee
# Parent 234bc93bd51698801fad77cc861177ed019f5113
asm: new SSE4 primitive for saoStatsE3
---
source/common/primitives.cpp | 2 +
source/common/primitives.h | 5 +
source/common/x86/asm-primitives.cpp | 2 +
source/common/x86/loopfilter.h | 1 +
source/common/x86/pixel-util8.asm | 146 ++++++++++++++++++++++++++++++++++
source/encoder/sao.cpp | 74 ++++++++++-------
source/encoder/sao.h | 4 +-
7 files changed, 203 insertions(+), 31 deletions(-)
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/primitives.cpp
--- a/source/common/primitives.cpp Thu May 21 16:34:48 2015 +0530
+++ b/source/common/primitives.cpp Fri May 22 15:11:16 2015 -0700
@@ -56,6 +56,7 @@
void setupFilterPrimitives_c(EncoderPrimitives &p);
void setupIntraPrimitives_c(EncoderPrimitives &p);
void setupLoopFilterPrimitives_c(EncoderPrimitives &p);
+void setupSaoPrimitives_c(EncoderPrimitives &p);
void setupCPrimitives(EncoderPrimitives &p)
{
@@ -64,6 +65,7 @@
setupFilterPrimitives_c(p); // ipfilter.cpp
setupIntraPrimitives_c(p); // intrapred.cpp
setupLoopFilterPrimitives_c(p); // loopfilter.cpp
+ setupSaoPrimitives_c(p); // sao.cpp
}
void setupAliasPrimitives(EncoderPrimitives &p)
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/primitives.h
--- a/source/common/primitives.h Thu May 21 16:34:48 2015 +0530
+++ b/source/common/primitives.h Fri May 22 15:11:16 2015 -0700
@@ -173,6 +173,9 @@
typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+
+typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -289,6 +292,8 @@
saoCuOrgE3_t saoCuOrgE3[2];
saoCuOrgB0_t saoCuOrgB0;
+ saoCuStatsE3_t saoCuStatsE3;
+
downscale_t frameInitLowres;
cutree_propagate_cost propagateCost;
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri May 22 15:11:16 2015 -0700
@@ -1797,6 +1797,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].p2s = x265_filterPixelToShort_6x16_sse4;
#if X86_64
+ p.saoCuStatsE3 = x265_saoCuStatsE3_sse4;
+
ALL_LUMA_CU(psy_cost_pp, psyCost_pp, sse4);
ALL_LUMA_CU(psy_cost_ss, psyCost_ss, sse4);
#endif
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/loopfilter.h Fri May 22 15:11:16 2015 -0700
@@ -39,6 +39,7 @@
void x265_saoCuOrgE3_32_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
+void x265_saoCuStatsE3_sse4(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
void x265_calSign_avx2(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu May 21 16:34:48 2015 +0530
+++ b/source/common/x86/pixel-util8.asm Fri May 22 15:11:16 2015 -0700
@@ -53,6 +53,7 @@
cextern pw_1
cextern pw_0_15
cextern pb_1
+cextern pb_128
cextern pw_00ff
cextern pw_1023
cextern pw_3fff
@@ -6051,3 +6052,148 @@
shl r1d, 16
or eax, r1d
RET
+
+
+;void saoCuStatsE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+;{
+; memset(tmp_stats, 0, sizeof(tmp_stats));
+; memset(tmp_count, 0, sizeof(tmp_count));
+; for (y = 0; y < endY; y++)
+; {
+; for (x = 0; x < endX; x++)
+; {
+; int signDown = signOf2(rec[x], rec[x + stride - 1]);
+; uint32_t edgeType = signDown + upBuff1[x] + 2;
+; upBuff1[x - 1] = (int8_t)(-signDown);
+; tmp_stats[edgeType] += (fenc[x] - rec[x]);
+; tmp_count[edgeType]++;
+; }
+; upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+; rec += stride;
+; fenc += stride;
+; }
+; for (x = 0; x < NUM_EDGETYPE; x++)
+; {
+; stats[s_eoTable[x]] += tmp_stats[x];
+; count[s_eoTable[x]] += tmp_count[x];
+; }
+;}
+
+%if ARCH_X86_64
+; TODO: x64 only because temporary registers r7 and r8 are needed; should be easy to port to x86
+INIT_XMM sse4
+cglobal saoCuStatsE3, 4,9,8,0-32 ; Stack: 5 of stats and 5 of count
+ mov r4d, r4m
+ mov r5d, r5m
+
+ ; clear internal temporary buffer
+ pxor m0, m0
+ mova [rsp], m0
+ mova [rsp + mmsize], m0
+ mova m0, [pb_128]
+ mova m5, [pb_1]
+ mova m6, [pb_2]
+ movh m7, [r3 + r4]
+
+.loopH:
+ mov r6d, r4d
+
+.loopW:
+ movu m1, [r1]
+ movu m2, [r1 + r2 - 1]
+
+ ; signDown
+ pxor m1, m0
+ pxor m2, m0
+ pcmpgtb m3, m1, m2
+ pand m3, m5
+ pcmpgtb m2, m1
+ por m2, m3
+ pxor m3, m3
+ psubb m3, m2
+
+ ; edgeType
+ movu m4, [r3]
+ paddb m4, m6
+ paddb m2, m4
+
+ ; update upBuff1
+ movu [r3 - 1], m3
+
+ ; stats[edgeType]
+ pxor m1, m0
+ movu m3, [r0]
+ punpckhbw m4, m3, m1
+ punpcklbw m3, m1
+ pmaddubsw m3, [hmul_16p + 16]
+ pmaddubsw m4, [hmul_16p + 16]
+
+ ; 16 pixels
+%assign x 0
+%rep 16
+ pextrb r7d, m2, x
+ inc word [rsp + r7 * 2]
+
+ %if (x < 8)
+ pextrw r8d, m3, (x % 8)
+ %else
+ pextrw r8d, m4, (x % 8)
+ %endif
+ movsx r8d, r8w
+ add [rsp + 5 * 2 + r7 * 4], r8d
+
+ dec r6d
+ jz .next
+%assign x x+1
+%endrep
+
+ add r0, 16
+ add r1, 16
+ add r3, 16
+ jmp .loopW
+
+.next:
+ ; restore pointer upBuff1
+ mov r6d, r4d
+ and r6d, 15
+
+ ; move to next row
+ sub r6, r4
+ add r3, r6
+ add r6, r2
+ add r0, r6
+ add r1, r6
+ dec r5d
+ jg .loopH
+
+ ; restore unavailable pixels
+ movh [r3 + r4], m7
+
+ ; sum to global buffer
+ mov r1, r6m
+ mov r0, r7m
+
+ ; s_eoTable = {1,2,0,3,4}
+ movzx r6d, word [rsp + 0 * 2]
+ add [r0 + 1 * 4], r6d
+ movzx r6d, word [rsp + 1 * 2]
+ add [r0 + 2 * 4], r6d
+ movzx r6d, word [rsp + 2 * 2]
+ add [r0 + 0 * 4], r6d
+ movzx r6d, word [rsp + 3 * 2]
+ add [r0 + 3 * 4], r6d
+ movzx r6d, word [rsp + 4 * 2]
+ add [r0 + 4 * 4], r6d
+
+ mov r6d, [rsp + 5 * 2 + 0 * 4]
+ add [r1 + 1 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 1 * 4]
+ add [r1 + 2 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 2 * 4]
+ add [r1 + 0 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 3 * 4]
+ add [r1 + 3 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 4 * 4]
+ add [r1 + 4 * 4], r6d
+ RET
+%endif ; ARCH_X86_64
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Thu May 21 16:34:48 2015 +0530
+++ b/source/encoder/sao.cpp Fri May 22 15:11:16 2015 -0700
@@ -57,7 +57,6 @@
{
return (count * offset - offsetOrg * 2) * offset;
}
-
} // end anonymous namespace
@@ -925,35 +924,7 @@
primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
- memset(tmp_stats, 0, sizeof(tmp_stats));
- memset(tmp_count, 0, sizeof(tmp_count));
-
- for (y = startY; y < endY; y++)
- {
- for (x = startX; x < endX; x++)
- {
- int signDown = signOf2(rec[x], rec[x + stride - 1]);
- X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
-
- uint32_t edgeType = signDown + upBuff1[x] + 2;
- upBuff1[x - 1] = (int8_t)(-signDown);
- tmp_stats[edgeType] += (fenc[x] - rec[x]);
- tmp_count[edgeType]++;
- }
-
- upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
-
- rec += stride;
- fenc += stride;
- }
-
- stats = m_offsetOrg[plane][SAO_EO_3];
- count = m_count[plane][SAO_EO_3];
- for (x = 0; x < NUM_EDGETYPE; x++)
- {
- stats[s_eoTable[x]] += tmp_stats[x];
- count[s_eoTable[x]] += tmp_count[x];
- }
+ primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0 + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
}
}
}
@@ -1669,4 +1640,47 @@
}
}
}
+
+// NOTE: this must live inside namespace x265 because it needs access to class SAO
+void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+{
+ int x, y;
+ int32_t tmp_stats[SAO::NUM_EDGETYPE];
+ int32_t tmp_count[SAO::NUM_EDGETYPE];
+
+ memset(tmp_stats, 0, sizeof(tmp_stats));
+ memset(tmp_count, 0, sizeof(tmp_count));
+
+ for (y = 0; y < endY; y++)
+ {
+ for (x = 0; x < endX; x++)
+ {
+ int signDown = signOf2(rec[x], rec[x + stride - 1]);
+ X265_CHECK(signDown == signOf(rec[x] - rec[x + stride - 1]), "signDown check failure\n");
+
+ uint32_t edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x - 1] = (int8_t)(-signDown);
+ tmp_stats[edgeType] += (fenc[x] - rec[x]);
+ tmp_count[edgeType]++;
+ }
+
+ upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+
+ rec += stride;
+ fenc += stride;
+ }
+
+ for (x = 0; x < SAO::NUM_EDGETYPE; x++)
+ {
+ stats[SAO::s_eoTable[x]] += tmp_stats[x];
+ count[SAO::s_eoTable[x]] += tmp_count[x];
+ }
}
+
+void setupSaoPrimitives_c(EncoderPrimitives &p)
+{
+ // TODO: move other sao functions to here
+ p.saoCuStatsE3 = saoCuStatsE3_c;
+}
+}
+
diff -r 234bc93bd516 -r 3ce8e58d2d1c source/encoder/sao.h
--- a/source/encoder/sao.h Thu May 21 16:34:48 2015 +0530
+++ b/source/encoder/sao.h Fri May 22 15:11:16 2015 -0700
@@ -52,7 +52,7 @@
class SAO
{
-protected:
+public:
enum { SAO_MAX_DEPTH = 4 };
enum { SAO_BO_BITS = 5 };
@@ -68,6 +68,8 @@
typedef int32_t (PerClass[MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
typedef int32_t (PerPlane[NUM_PLANE][MAX_NUM_SAO_TYPE][MAX_NUM_SAO_CLASS]);
+protected:
+
/* allocated per part */
PerClass* m_count;
PerClass* m_offset;
More information about the x265-devel
mailing list