[x265] [PATCH 14 of 16] improve saoCuStatsE3 by using the precomputed (fenc - rec) difference
Min Chen
chenm003 at 163.com
Wed Oct 7 00:55:25 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167722 18000
# Node ID 8fdd1b8fd4529b6966ab787f6c624f6056f77593
# Parent 1b7be2d88bce205ad41a04f8b5874f22cd8763e7
improve saoCuStatsE3 by using the precomputed (fenc - rec) difference
---
source/common/primitives.h | 2 +-
source/common/x86/loopfilter.asm | 139 +++++++++++++++++++++++++++++++++++
source/common/x86/loopfilter.h | 2 +-
source/common/x86/pixel-util8.asm | 144 -------------------------------------
source/encoder/sao.cpp | 8 +-
source/test/pixelharness.cpp | 6 +-
6 files changed, 148 insertions(+), 153 deletions(-)
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/primitives.h
--- a/source/common/primitives.h Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/primitives.h Tue Oct 06 16:42:02 2015 -0500
@@ -180,7 +180,7 @@
typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE2_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE3_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/x86/loopfilter.asm Tue Oct 06 16:42:02 2015 -0500
@@ -2404,3 +2404,142 @@
%endif ; ARCH_X86_64
+
+
+;void saoCuStatsE3(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+;{
+; memset(tmp_stats, 0, sizeof(tmp_stats));
+; memset(tmp_count, 0, sizeof(tmp_count));
+; for (y = startY; y < endY; y++)
+; {
+; for (x = startX; x < endX; x++)
+; {
+; int signDown = signOf2(rec[x], rec[x + stride - 1]);
+; uint32_t edgeType = signDown + upBuff1[x] + 2;
+; upBuff1[x - 1] = (int8_t)(-signDown);
+; tmp_stats[edgeType] += diff[x];
+; tmp_count[edgeType]++;
+; }
+; upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+; rec += stride;
+; diff += MAX_CU_SIZE;
+; }
+; for (x = 0; x < NUM_EDGETYPE; x++)
+; {
+; stats[s_eoTable[x]] += tmp_stats[x];
+; count[s_eoTable[x]] += tmp_count[x];
+; }
+;}
+
+%if ARCH_X86_64
+INIT_XMM sse4
+cglobal saoCuStatsE3, 4,9,8,0-32 ; Stack (32 bytes): 5 x 16-bit tmp_count at [rsp], then 5 x 32-bit tmp_stats at [rsp + 5 * 2]
+ mov r4d, r4m ; r4d = endX
+ mov r5d, r5m ; r5d = endY
+
+ ; clear internal temporary buffer (both 16-byte halves of the 32-byte stack area)
+ pxor m0, m0
+ mova [rsp], m0
+ mova [rsp + mmsize], m0
+ mova m0, [pb_128] ; bias mask XORed into pixels before the signed byte compares (presumably 0x80 in every byte)
+ mova m5, [pb_1] ; used to turn the 0xFF compare mask into +1 (presumably 1 in every byte)
+ mova m6, [pb_2] ; the "+ 2" term of edgeType (presumably 2 in every byte)
+ movh m7, [r3 + r4] ; save the bytes at upBuff1 + endX; written back after the last row
+
+.loopH:
+ mov r6d, r4d ; r6d = pixels left to process in this row
+
+.loopW:
+ movu m1, [r1] ; 16 bytes at rec[x]
+ movu m2, [r1 + r2 - 1] ; 16 bytes at rec[x + stride - 1]
+
+ ; signDown
+ pxor m1, m0
+ pxor m2, m0
+ pcmpgtb m3, m1, m2 ; 0xFF where rec[x] > rec[x + stride - 1]
+ pand m3, m5 ; -> +1 in those lanes
+ pcmpgtb m2, m1 ; 0xFF (-1) where rec[x + stride - 1] > rec[x]
+ por m2, m3 ; m2 = signDown per byte
+ pxor m3, m3
+ psubb m3, m2 ; m3 = -signDown
+
+ ; edgeType
+ movu m4, [r3] ; 16 bytes at upBuff1[x]
+ paddb m4, m6
+ paddb m2, m4 ; m2 = signDown + upBuff1[x] + 2
+
+ ; update upBuff1
+ movu [r3 - 1], m3 ; upBuff1[x - 1] = -signDown
+
+ ; stats[edgeType]
+ pxor m1, m0 ; undo the bias on m1 (m1 is not read again before the next reload)
+
+ ; 16 pixels, one per unrolled step
+%assign x 0
+%rep 16
+ pextrb r7d, m2, x ; r7 = edgeType of pixel x
+ inc word [rsp + r7 * 2] ; tmp_count[edgeType]++
+
+ movsx r8d, word [r0 + x * 2] ; sign-extended diff[x]
+ add [rsp + 5 * 2 + r7 * 4], r8d ; tmp_stats[edgeType] += diff[x]
+
+ dec r6d
+ jz .next ; row finished, leave before the pointer advance below
+%assign x x+1
+%endrep
+
+ add r0, 16*2 ; diff advances 16 int16_t
+ add r1, 16
+ add r3, 16
+ jmp .loopW
+
+.next:
+ ; restore pointer upBuff1
+ mov r6d, r4d
+ and r6d, ~15 ; endX rounded down to a multiple of 16
+ neg r6 ; must be a 64-bit negate: the negative value is added to pointers below
+
+ ; move to next row (NOTE(review): the rewind assumes endX is not a multiple of 16 - confirm with callers)
+
+ ; move back to start point
+ add r3, r6
+
+ ; adjust with stride
+ lea r0, [r0 + (r6 + 64) * 2] ; rewind, then advance one diff row; 64 = MAX_CU_SIZE
+ add r1, r2
+ add r1, r6
+
+ dec r5d
+ jg .loopH
+
+ ; restore unavailable pixels
+ movh [r3 + r4], m7
+
+ ; sum to global buffer
+ mov r1, r6m ; r1 = stats
+ mov r0, r7m ; r0 = count
+
+ ; s_eoTable = {1,2,0,3,4}
+ movzx r6d, word [rsp + 0 * 2]
+ add [r0 + 1 * 4], r6d
+ movzx r6d, word [rsp + 1 * 2]
+ add [r0 + 2 * 4], r6d
+ movzx r6d, word [rsp + 2 * 2]
+ add [r0 + 0 * 4], r6d
+ movzx r6d, word [rsp + 3 * 2]
+ add [r0 + 3 * 4], r6d
+ movzx r6d, word [rsp + 4 * 2]
+ add [r0 + 4 * 4], r6d
+
+ mov r6d, [rsp + 5 * 2 + 0 * 4]
+ add [r1 + 1 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 1 * 4]
+ add [r1 + 2 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 2 * 4]
+ add [r1 + 0 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 3 * 4]
+ add [r1 + 3 * 4], r6d
+ mov r6d, [rsp + 5 * 2 + 4 * 4]
+ add [r1 + 4 * 4], r6d
+ RET
+%endif ; ARCH_X86_64
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/x86/loopfilter.h Tue Oct 06 16:42:02 2015 -0500
@@ -40,7 +40,7 @@
void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
- void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+ void PFX(saoCuStatsE3_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
DECL_SAO(sse4);
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Oct 06 16:41:59 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Tue Oct 06 16:42:02 2015 -0500
@@ -6702,150 +6702,6 @@
RET
-;void saoStatE3(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
-;{
-; memset(tmp_stats, 0, sizeof(tmp_stats));
-; memset(tmp_count, 0, sizeof(tmp_count));
-; for (y = startY; y < endY; y++)
-; {
-; for (x = startX; x < endX; x++)
-; {
-; int signDown = signOf2(rec[x], rec[x + stride - 1]);
-; uint32_t edgeType = signDown + upBuff1[x] + 2;
-; upBuff1[x - 1] = (int8_t)(-signDown);
-; tmp_stats[edgeType] += (fenc[x] - rec[x]);
-; tmp_count[edgeType]++;
-; }
-; upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
-; rec += stride;
-; fenc += stride;
-; }
-; for (x = 0; x < NUM_EDGETYPE; x++)
-; {
-; stats[s_eoTable[x]] += tmp_stats[x];
-; count[s_eoTable[x]] += tmp_count[x];
-; }
-;}
-
-%if ARCH_X86_64
-INIT_XMM sse4
-cglobal saoCuStatsE3, 4,9,8,0-32 ; Stack: 5 of stats and 5 of count
- mov r4d, r4m
- mov r5d, r5m
-
- ; clear internal temporary buffer
- pxor m0, m0
- mova [rsp], m0
- mova [rsp + mmsize], m0
- mova m0, [pb_128]
- mova m5, [pb_1]
- mova m6, [pb_2]
- movh m7, [r3 + r4]
-
-.loopH:
- mov r6d, r4d
-
-.loopW:
- movu m1, [r1]
- movu m2, [r1 + r2 - 1]
-
- ; signDown
- pxor m1, m0
- pxor m2, m0
- pcmpgtb m3, m1, m2
- pand m3, m5
- pcmpgtb m2, m1
- por m2, m3
- pxor m3, m3
- psubb m3, m2
-
- ; edgeType
- movu m4, [r3]
- paddb m4, m6
- paddb m2, m4
-
- ; update upBuff1
- movu [r3 - 1], m3
-
- ; stats[edgeType]
- pxor m1, m0
- movu m3, [r0]
- punpckhbw m4, m3, m1
- punpcklbw m3, m1
- pmaddubsw m3, [hmul_16p + 16]
- pmaddubsw m4, [hmul_16p + 16]
-
- ; 16 pixels
-%assign x 0
-%rep 16
- pextrb r7d, m2, x
- inc word [rsp + r7 * 2]
-
- %if (x < 8)
- pextrw r8d, m3, (x % 8)
- %else
- pextrw r8d, m4, (x % 8)
- %endif
- movsx r8d, r8w
- add [rsp + 5 * 2 + r7 * 4], r8d
-
- dec r6d
- jz .next
-%assign x x+1
-%endrep
-
- add r0, 16
- add r1, 16
- add r3, 16
- jmp .loopW
-
-.next:
- ; restore pointer upBuff1
- mov r6d, r4d
- and r6d, 15
-
- ; move to next row
- sub r6, r4
- add r3, r6
- add r6, r2
- add r0, r6
- add r1, r6
- dec r5d
- jg .loopH
-
- ; restore unavailable pixels
- movh [r3 + r4], m7
-
- ; sum to global buffer
- mov r1, r6m
- mov r0, r7m
-
- ; s_eoTable = {1,2,0,3,4}
- movzx r6d, word [rsp + 0 * 2]
- add [r0 + 1 * 4], r6d
- movzx r6d, word [rsp + 1 * 2]
- add [r0 + 2 * 4], r6d
- movzx r6d, word [rsp + 2 * 2]
- add [r0 + 0 * 4], r6d
- movzx r6d, word [rsp + 3 * 2]
- add [r0 + 3 * 4], r6d
- movzx r6d, word [rsp + 4 * 2]
- add [r0 + 4 * 4], r6d
-
- mov r6d, [rsp + 5 * 2 + 0 * 4]
- add [r1 + 1 * 4], r6d
- mov r6d, [rsp + 5 * 2 + 1 * 4]
- add [r1 + 2 * 4], r6d
- mov r6d, [rsp + 5 * 2 + 2 * 4]
- add [r1 + 0 * 4], r6d
- mov r6d, [rsp + 5 * 2 + 3 * 4]
- add [r1 + 3 * 4], r6d
- mov r6d, [rsp + 5 * 2 + 4 * 4]
- add [r1 + 4 * 4], r6d
- RET
-%endif ; ARCH_X86_64
-
-
; uint32_t costCoeffNxN(uint16_t *scan, coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, uint8_t *tabSigCtx, uint16_t scanFlagMask, uint8_t *baseCtx, int offset, int subPosBase)
;for (int i = 0; i < MLS_CG_SIZE; i++)
;{
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Tue Oct 06 16:41:59 2015 -0500
+++ b/source/encoder/sao.cpp Tue Oct 06 16:42:02 2015 -0500
@@ -830,7 +830,7 @@
primitives.sign(&upBuff1[startX - 1], &rec[startX - 1], &rec[startX - 1 - stride + 1], (endX - startX + 1));
- primitives.saoCuStatsE3(fenc0 + startX + startY * stride, rec0 + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
+ primitives.saoCuStatsE3(diff + startX + startY * MAX_CU_SIZE, rec0 + startX + startY * stride, stride, upBuff1 + startX, endX - startX, endY - startY, m_offsetOrg[plane][SAO_EO_3], m_count[plane][SAO_EO_3]);
}
}
}
@@ -1667,7 +1667,7 @@
}
}
-void saoCuStatsE3_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE3_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
X265_CHECK(endX < MAX_CU_SIZE, "endX check failure\n");
X265_CHECK(endY < MAX_CU_SIZE, "endY check failure\n");
@@ -1689,14 +1689,14 @@
uint32_t edgeType = signDown + upBuff1[x] + 2;
upBuff1[x - 1] = (int8_t)(-signDown);
- tmp_stats[edgeType] += (fenc[x] - rec[x]);
+ tmp_stats[edgeType] += diff[x];
tmp_count[edgeType]++;
}
upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
rec += stride;
- fenc += stride;
+ diff += MAX_CU_SIZE;
}
for (x = 0; x < SAO::NUM_EDGETYPE; x++)
diff -r 1b7be2d88bce -r 8fdd1b8fd452 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Oct 06 16:41:59 2015 -0500
+++ b/source/test/pixelharness.cpp Tue Oct 06 16:42:02 2015 -0500
@@ -1245,8 +1245,8 @@
int endX = MAX_CU_SIZE - (rand() % 5) - 1;
int endY = MAX_CU_SIZE - (rand() % 4) - 1;
- ref(pbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
- checked(opt, pbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+ ref(sbuf2, pbuf3, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+ checked(opt, sbuf2, pbuf3, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
|| memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -2886,7 +2886,7 @@
int32_t stats[5], count[5];
memset(upBuff1, 1, sizeof(upBuff1));
HEADER0("saoCuStatsE3");
- REPORT_SPEEDUP(opt.saoCuStatsE3, ref.saoCuStatsE3, pbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
+ REPORT_SPEEDUP(opt.saoCuStatsE3, ref.saoCuStatsE3, sbuf2, pbuf3, 64, upBuff1 + 1, 60, 61, stats, count);
}
if (opt.planecopy_sp)
More information about the x265-devel
mailing list