[x265] [PATCH 12 of 16] improve saoCuStatsE1 by using the precomputed (fenc - rec) difference
Min Chen
chenm003 at 163.com
Wed Oct 7 00:55:23 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167717 18000
# Node ID becaacf5b72bfc13860ce183e1ff411b5820163d
# Parent 72d345dcf13e3c715f767ff624f5ff47043e01a6
improve saoCuStatsE1 by using the precomputed (fenc - rec) difference
---
source/common/primitives.h | 2 +-
source/common/x86/loopfilter.asm | 31 +++++++++----------------------
source/common/x86/loopfilter.h | 2 +-
source/encoder/sao.cpp | 10 ++++------
source/test/pixelharness.cpp | 6 +++---
5 files changed, 18 insertions(+), 33 deletions(-)
diff -r 72d345dcf13e -r becaacf5b72b source/common/primitives.h
--- a/source/common/primitives.h Tue Oct 06 16:41:54 2015 -0500
+++ b/source/common/primitives.h Tue Oct 06 16:41:57 2015 -0500
@@ -178,7 +178,7 @@
typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE0_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
-typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsE1_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE3_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
diff -r 72d345dcf13e -r becaacf5b72b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Oct 06 16:41:54 2015 -0500
+++ b/source/common/x86/loopfilter.asm Tue Oct 06 16:41:57 2015 -0500
@@ -2137,11 +2137,11 @@
%endif
;-------------------------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
;-------------------------------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse4
-cglobal saoCuStatsE1, 4,12,9,0-32 ; Stack: 5 of stats and 5 of count
+cglobal saoCuStatsE1, 4,12,8,0-32 ; Stack: 5 of stats and 5 of count
mov r5d, r5m
mov r4d, r4m
mov r11d, r5d
@@ -2153,7 +2153,6 @@
mova m0, [pb_128]
mova m5, [pb_1]
mova m6, [pb_2]
- mova m8, [hmul_16p + 16]
movh m7, [r3 + r4]
.loopH:
@@ -2170,11 +2169,11 @@
pxor m1, m0
pxor m2, m0
pcmpgtb m3, m1, m2
+ pcmpgtb m2, m1
pand m3, m5
- pcmpgtb m2, m1
por m2, m3
pxor m3, m3
- psubb m3, m2 ; -signDown
+ psubb m3, m2 ; -signDown
; edgeType
movu m4, [r11]
@@ -2184,26 +2183,14 @@
; update upBuff1
movu [r11], m3
- ; stats[edgeType]
- pxor m1, m0
- movu m3, [r9]
- punpckhbw m4, m3, m1
- punpcklbw m3, m1
- pmaddubsw m3, m8
- pmaddubsw m4, m8
-
; 16 pixels
%assign x 0
%rep 16
pextrb r7d, m2, x
inc word [rsp + r7 * 2]
- %if (x < 8)
- pextrw r8d, m3, (x % 8)
- %else
- pextrw r8d, m4, (x % 8)
- %endif
- movsx r8d, r8w
+ ; stats[edgeType]
+ movsx r8d, word [r9 + x * 2]
add [rsp + 5 * 2 + r7 * 4], r8d
dec r6d
@@ -2211,14 +2198,14 @@
%assign x x+1
%endrep
- add r9, 16
+ add r9, 16*2
add r10, 16
add r11, 16
- jmp .loopW
+ jmp .loopW
.next:
; restore pointer upBuff1
- add r0, r2
+ add r0, 64*2 ; MAX_CU_SIZE
add r1, r2
dec r5d
diff -r 72d345dcf13e -r becaacf5b72b source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Oct 06 16:41:54 2015 -0500
+++ b/source/common/x86/loopfilter.h Tue Oct 06 16:41:57 2015 -0500
@@ -38,7 +38,7 @@
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE0_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
- void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
+ void PFX(saoCuStatsE1_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE3_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(calSign_ ## cpu)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 72d345dcf13e -r becaacf5b72b source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Tue Oct 06 16:41:54 2015 -0500
+++ b/source/encoder/sao.cpp Tue Oct 06 16:41:57 2015 -0500
@@ -763,7 +763,6 @@
skipR = 4;
}
- fenc = fenc0;
rec = rec0;
startY = !tpely;
@@ -771,13 +770,12 @@
endY = (bpely == picHeight) ? ctuHeight - 1 : ctuHeight - skipB + plane_offset;
if (!tpely)
{
- fenc += stride;
rec += stride;
}
primitives.sign(upBuff1, rec, &rec[- stride], ctuWidth);
- primitives.saoCuStatsE1(fenc0 + startY * stride, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
+ primitives.saoCuStatsE1(diff + startY * MAX_CU_SIZE, rec0 + startY * stride, stride, upBuff1, endX, endY - startY, m_offsetOrg[plane][SAO_EO_1], m_count[plane][SAO_EO_1]);
}
// SAO_EO_2: // dir: 135
@@ -1596,7 +1594,7 @@
}
}
-void saoCuStatsE1_c(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsE1_c(const int16_t *diff, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count)
{
X265_CHECK(endX <= MAX_CU_SIZE, "endX check failure\n");
X265_CHECK(endY <= MAX_CU_SIZE, "endY check failure\n");
@@ -1617,10 +1615,10 @@
uint32_t edgeType = signDown + upBuff1[x] + 2;
upBuff1[x] = (int8_t)(-signDown);
- tmp_stats[edgeType] += (fenc[x] - rec[x]);
+ tmp_stats[edgeType] += diff[x];
tmp_count[edgeType]++;
}
- fenc += stride;
+ diff += MAX_CU_SIZE;
rec += stride;
}
diff -r 72d345dcf13e -r becaacf5b72b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Oct 06 16:41:54 2015 -0500
+++ b/source/test/pixelharness.cpp Tue Oct 06 16:41:57 2015 -0500
@@ -1142,8 +1142,8 @@
int endX = MAX_CU_SIZE - (rand() % 5);
int endY = MAX_CU_SIZE - (rand() % 4) - 1;
- ref(pbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
- checked(opt, pbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
+ ref(sbuf2 + 1, pbuf3 + 1, stride, upBuff1_ref, endX, endY, stats_ref, count_ref);
+ checked(opt, sbuf2 + 1, pbuf3 + 1, stride, upBuff1_vec, endX, endY, stats_vec, count_vec);
if ( memcmp(_upBuff1_ref, _upBuff1_vec, sizeof(_upBuff1_ref))
|| memcmp(stats_ref, stats_vec, sizeof(stats_ref))
@@ -2866,7 +2866,7 @@
int8_t upBuff1[MAX_CU_SIZE + 2];
memset(upBuff1, 1, sizeof(upBuff1));
HEADER0("saoCuStatsE1");
- REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, pbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
+ REPORT_SPEEDUP(opt.saoCuStatsE1, ref.saoCuStatsE1, sbuf2, pbuf3, 64, upBuff1 + 1,60, 61, stats, count);
}
if (opt.saoCuStatsE2)
More information about the x265-devel
mailing list