[x265] [PATCH 11 of 16] improve saoCuStatsE0 by using the precomputed (fenc - rec) difference
Min Chen
chenm003 at 163.com
Wed Oct 7 00:55:22 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167714 18000
# Node ID 72d345dcf13e3c715f767ff624f5ff47043e01a6
# Parent 5f448e155870fdbb34242fa51c5a4eeebd71ebc0
improve saoCuStatsE0 by use prepare (fenc - frec)
---
source/common/primitives.h | 2 +-
source/common/x86/loopfilter.asm | 77 +++++++++++++++++---------------------
source/common/x86/loopfilter.h | 2 +-
source/encoder/sao.cpp | 10 +++--
source/test/pixelharness.cpp | 6 +-
5 files changed, 45 insertions(+), 52 deletions(-)
diff -r 5f448e155870 -r 72d345dcf13e source/common/primitives.h
--- a/source/common/primitives.h Tue Oct 06 16:41:52 2015 -0500
+++ b/source/common/primitives.h Tue Oct 06 16:41:54 2015 -0500
@@ -177,7 +177,7 @@
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
diff -r 5f448e155870 -r 72d345dcf13e source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Oct 06 16:41:52 2015 -0500
+++ b/source/common/x86/loopfilter.asm Tue Oct 06 16:41:54 2015 -0500
@@ -26,6 +26,7 @@
;*****************************************************************************/
%include "x86inc.asm"
+%include "x86util.asm"
SECTION_RODATA 32
pb_31: times 32 db 31
@@ -2030,23 +2031,29 @@
%endif
;-----------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE0(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE0(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
;-----------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse4
-cglobal saoCuStatsE0, 5,9,8, 0-32
+cglobal saoCuStatsE0, 3,10,6, 0-32
mov r3d, r3m
- mov r8, r5mp
+ mov r4d, r4m
+ mov r9, r5mp
; clear internal temporary buffer
pxor m0, m0
mova [rsp], m0
mova [rsp + mmsize], m0
mova m4, [pb_128]
- mova m5, [hmul_16p + 16]
- mova m6, [pb_2]
+ mova m5, [pb_2]
xor r7d, r7d
+ ; correct stride for diff[] and rec
+ mov r6d, r3d
+ and r6d, ~15
+ sub r2, r6
+ lea r8, [(r6 - 64) * 2] ; 64 = MAX_CU_SIZE
+
.loopH:
mov r5d, r3d
@@ -2060,62 +2067,46 @@
pinsrb m0, r7d, 15
.loopL:
- movu m7, [r1]
+ movu m3, [r1]
movu m2, [r1 + 1]
- pxor m1, m7, m4
- pxor m3, m2, m4
- pcmpgtb m2, m1, m3
- pcmpgtb m3, m1
- pand m2, [pb_1]
- por m2, m3 ; signRight
+ pxor m1, m3, m4
+ pxor m2, m4
+ pcmpgtb m3, m1, m2
+ pcmpgtb m2, m1
+ pand m3, [pb_1]
+ por m2, m3 ; signRight
palignr m3, m2, m0, 15
- psignb m3, m4 ; signLeft
+ psignb m3, m4 ; signLeft
mova m0, m2
paddb m2, m3
- paddb m2, m6 ; edgeType
+ paddb m2, m5 ; edgeType
; stats[edgeType]
- movu m3, [r0] ; fenc[0-15]
- punpckhbw m1, m3, m7
- punpcklbw m3, m7
- pmaddubsw m1, m5
- pmaddubsw m3, m5
-
%assign x 0
%rep 16
pextrb r7d, m2, x
-%if (x < 8)
- pextrw r6d, m3, (x % 8)
-%else
- pextrw r6d, m1, (x % 8)
-%endif
- movsx r6d, r6w
+ movsx r6d, word [r0 + x * 2]
inc word [rsp + r7 * 2] ; tmp_count[edgeType]++
add [rsp + 5 * 2 + r7 * 4], r6d ; tmp_stats[edgeType] += (fenc[x] - rec[x])
dec r5d
- jz .next
+ jz .next
%assign x x+1
%endrep
- add r0q, 16
- add r1q, 16
- jmp .loopL
+ add r0, 16*2
+ add r1, 16
+ jmp .loopL
.next:
- mov r6d, r3d
- and r6d, 15
-
- sub r6, r3
- add r6, r2
- add r0, r6
- add r1, r6
+ sub r0, r8
+ add r1, r2
dec r4d
- jnz .loopH
+ jnz .loopH
; sum to global buffer
mov r0, r6mp
@@ -2133,15 +2124,15 @@
add [r0 + 4 * 4], r5d
mov r6d, [rsp + 5 * 2 + 0 * 4]
- add [r8 + 1 * 4], r6d
+ add [r9 + 1 * 4], r6d
mov r5d, [rsp + 5 * 2 + 1 * 4]
- add [r8 + 2 * 4], r5d
+ add [r9 + 2 * 4], r5d
mov r6d, [rsp + 5 * 2 + 2 * 4]
- add [r8 + 0 * 4], r6d
+ add [r9 + 0 * 4], r6d
mov r5d, [rsp + 5 * 2 + 3 * 4]
- add [r8 + 3 * 4], r5d
+ add [r9 + 3 * 4], r5d
mov r6d, [rsp + 5 * 2 + 4 * 4]
- add [r8 + 4 * 4], r6d
+ add [r9 + 4 * 4], r6d
RET
%endif
diff -r 5f448e155870 -r 72d345dcf13e source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Oct 06 16:41:52 2015 -0500
+++ b/source/common/x86/loopfilter.h Tue Oct 06 16:41:54 2015 -0500
@@ -37,7 +37,7 @@
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
- void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+ void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
diff -r 5f448e155870 -r 72d345dcf13e source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Tue Oct 06 16:41:52 2015 -0500
+++ b/source/encoder/sao.cpp Tue Oct 06 16:41:54 2015 -0500
@@ -752,7 +752,7 @@
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth - skipR + plane_offset;
- primitives.saoCuStatsE0(fenc0 + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
+ primitives.saoCuStatsE0(diff + startX, rec0 + startX, stride, endX - startX, ctuHeight - skipB + plane_offset, m_offsetOrg[plane][SAO_EO_0], m_count[plane][SAO_EO_0]);
}
// SAO_EO_1: // dir: |
@@ -1559,12 +1559,14 @@
}
}
-void saoCuStatsE0_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE0_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
{
int x, y;
int32_t tmp_stats[SAO::NUM_EDGETYPE];
int32_t tmp_count[SAO::NUM_EDGETYPE];
+ X265_CHECK(endX <= MAX_CU_SIZE, "endX too big\n");
+
memset(tmp_stats, 0, sizeof(tmp_stats));
memset(tmp_count, 0, sizeof(tmp_count));
@@ -1579,11 +1581,11 @@
signLeft = -signRight;
X265_CHECK(edgeType <= 4, "edgeType check failure\n");
- tmp_stats[edgeType] += (fenc[x] - rec[x]);
+ tmp_stats[edgeType] += diff[x];
tmp_count[edgeType]++;
}
- fenc += stride;
+ diff += MAX_CU_SIZE;
rec += stride;
}
diff -r 5f448e155870 -r 72d345dcf13e source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Oct 06 16:41:52 2015 -0500
+++ b/source/test/pixelharness.cpp Tue Oct 06 16:41:54 2015 -0500
@@ -1098,8 +1098,8 @@
int endX = MAX_CU_SIZE - (rand() % 5) - 1;
int endY = MAX_CU_SIZE - (rand() % 4) - 1;
- ref(pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
- checked(opt, pbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
+ ref(sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_ref, count_ref);
+ checked(opt, sbuf2 + j + 1, pbuf3 + j + 1, stride, endX, endY, stats_vec, count_vec);
if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
return false;
@@ -2857,7 +2857,7 @@
{
int32_t stats[33], count[33];
HEADER0("saoCuStatsE0");
- REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, pbuf2, pbuf3, 64, 60, 61, stats, count);
+ REPORT_SPEEDUP(opt.saoCuStatsE0, ref.saoCuStatsE0, sbuf2, pbuf3, 64, 60, 61, stats, count);
}
if (opt.saoCuStatsE1)
More information about the x265-devel mailing list