[x265] [PATCH 10 of 16] prepare on (fenc - frec) and improve saoCuStatsBO
Min Chen
chenm003 at 163.com
Wed Oct 7 00:55:21 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1444167712 18000
# Node ID 5f448e155870fdbb34242fa51c5a4eeebd71ebc0
# Parent 5429d2f26ebfa2245e1a754a4355caf5c7f13c27
prepare on (fenc - frec) and improve saoCuStatsBO
---
source/common/primitives.h | 2 +-
source/common/x86/loopfilter.asm | 37 +++++++++++--------------------------
source/common/x86/loopfilter.h | 2 +-
source/encoder/sao.cpp | 22 ++++++++++++++++++----
source/test/pixelharness.cpp | 6 +++---
5 files changed, 34 insertions(+), 35 deletions(-)
diff -r 5429d2f26ebf -r 5f448e155870 source/common/primitives.h
--- a/source/common/primitives.h Tue Oct 06 16:41:50 2015 -0500
+++ b/source/common/primitives.h Tue Oct 06 16:41:52 2015 -0500
@@ -176,7 +176,7 @@
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
-typedef void (*saoCuStatsBO_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
+typedef void (*saoCuStatsBO_t)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE0_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE1_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count);
typedef void (*saoCuStatsE2_t)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBuff, int endX, int endY, int32_t *stats, int32_t *count);
diff -r 5429d2f26ebf -r 5f448e155870 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Oct 06 16:41:50 2015 -0500
+++ b/source/common/x86/loopfilter.asm Tue Oct 06 16:41:52 2015 -0500
@@ -1989,14 +1989,12 @@
%endif
;--------------------------------------------------------------------------------------------------------------------------
-; saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+; saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
;--------------------------------------------------------------------------------------------------------------------------
%if ARCH_X86_64
INIT_XMM sse4
-cglobal saoCuStatsBO, 7,12,6
- mova m3, [hmul_16p + 16]
- mova m4, [pb_124]
- mova m5, [pb_4]
+cglobal saoCuStatsBO, 7,12,2
+ mova m0, [pb_124]
xor r7d, r7d
.loopH:
@@ -2005,42 +2003,29 @@
mov r9d, r3d
.loopL:
movu m1, [r11]
- movu m0, [r10]
-
- punpckhbw m2, m0, m1
- punpcklbw m0, m1
- psrlw m1, 1 ; rec[x] >> boShift
- pmaddubsw m2, m3
- pmaddubsw m0, m3
- pand m1, m4
- paddb m1, m5
+ psrlw m1, 1 ; rec[x] >> boShift
+ pand m1, m0
%assign x 0
%rep 16
pextrb r7d, m1, x
-
-%if (x < 8)
- pextrw r8d, m0, (x % 8)
-%else
- pextrw r8d, m2, (x % 8)
-%endif
- movsx r8d, r8w
- inc dword [r6 + r7] ; count[classIdx]++
- add [r5 + r7], r8d ; stats[classIdx] += (fenc[x] - rec[x]);
+ movsx r8d, word [r10 + x*2] ; diff[x]
+ inc dword [r6 + r7 + 4] ; count[classIdx]++
+ add [r5 + r7 + 4], r8d ; stats[classIdx] += (fenc[x] - rec[x]);
dec r9d
jz .next
%assign x x+1
%endrep
- add r10, 16
+ add r10, 16*2
add r11, 16
jmp .loopL
.next:
- add r0, r2
+ add r0, 64*2 ; MAX_CU_SIZE
add r1, r2
dec r4d
- jnz .loopH
+ jnz .loopH
RET
%endif
diff -r 5429d2f26ebf -r 5f448e155870 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Oct 06 16:41:50 2015 -0500
+++ b/source/common/x86/loopfilter.h Tue Oct 06 16:41:52 2015 -0500
@@ -36,7 +36,7 @@
void PFX(saoCuOrgE3_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgE3_32_ ## cpu)(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX); \
void PFX(saoCuOrgB0_ ## cpu)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride); \
- void PFX(saoCuStatsBO_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
+ void PFX(saoCuStatsBO_ ## cpu)(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE0_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE1_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int endX, int endY, int32_t *stats, int32_t *count); \
void PFX(saoCuStatsE2_ ## cpu)(const pixel *fenc, const pixel *rec, intptr_t stride, int8_t *upBuff1, int8_t *upBufft, int endX, int endY, int32_t *stats, int32_t *count); \
diff -r 5429d2f26ebf -r 5f448e155870 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Tue Oct 06 16:41:50 2015 -0500
+++ b/source/encoder/sao.cpp Tue Oct 06 16:41:52 2015 -0500
@@ -712,6 +712,20 @@
int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
+ ALIGN_VAR_32(int16_t, diff[MAX_CU_SIZE * MAX_CU_SIZE]);
+
+ // Calculate (fenc - frec) and put into diff[]
+ // WARNING: *) May read beyond bounds on video when width or height is NOT a multiple of cuSize
+ // *) You MUST handle ColorSpace other than 420 yourself!
+ //primitives.cu[g_maxLog2CUSize - 2 - (plane != 0)].sub_ps(diff, MAX_CU_SIZE, fenc0, rec0, stride, stride);
+ for(int y = 0; y < ctuHeight; y++)
+ {
+ for(int x = 0; x < ctuWidth; x++)
+ {
+ diff[y * MAX_CU_SIZE + x] = (fenc0[y * stride + x] - rec0[y * stride + x]);
+ }
+ }
+
// SAO_BO:
{
if (m_param->bSaoNonDeblocked)
@@ -723,7 +737,7 @@
endX = (rpelx == picWidth) ? ctuWidth : ctuWidth - skipR + plane_offset;
endY = (bpely == picHeight) ? ctuHeight : ctuHeight - skipB + plane_offset;
- primitives.saoCuStatsBO(fenc0, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
+ primitives.saoCuStatsBO(diff, rec0, stride, endX, endY, m_offsetOrg[plane][SAO_BO], m_count[plane][SAO_BO]);
}
{
@@ -1526,7 +1540,7 @@
}
// NOTE: must put in namespace X265_NS since we need class SAO
-void saoCuStatsBO_c(const pixel *fenc, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
+void saoCuStatsBO_c(const int16_t *diff, const pixel *rec, intptr_t stride, int endX, int endY, int32_t *stats, int32_t *count)
{
int x, y;
const int boShift = X265_DEPTH - SAO_BO_BITS;
@@ -1536,11 +1550,11 @@
for (x = 0; x < endX; x++)
{
int classIdx = 1 + (rec[x] >> boShift);
- stats[classIdx] += (fenc[x] - rec[x]);
+ stats[classIdx] += diff[x];
count[classIdx]++;
}
- fenc += stride;
+ diff += MAX_CU_SIZE;
rec += stride;
}
}
diff -r 5429d2f26ebf -r 5f448e155870 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Oct 06 16:41:50 2015 -0500
+++ b/source/test/pixelharness.cpp Tue Oct 06 16:41:52 2015 -0500
@@ -1062,8 +1062,8 @@
int endX = MAX_CU_SIZE - (rand() % 5);
int endY = MAX_CU_SIZE - (rand() % 4) - 1;
- ref(pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
- checked(opt, pbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
+ ref(sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_ref, count_ref);
+ checked(opt, sbuf2 + j + 1, pbuf3 + 1, stride, endX, endY, stats_vec, count_vec);
if (memcmp(stats_ref, stats_vec, sizeof(stats_ref)) || memcmp(count_ref, count_vec, sizeof(count_ref)))
return false;
@@ -2850,7 +2850,7 @@
{
int32_t stats[33], count[33];
HEADER0("saoCuStatsBO");
- REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, pbuf2, pbuf3, 64, 60, 61, stats, count);
+ REPORT_SPEEDUP(opt.saoCuStatsBO, ref.saoCuStatsBO, sbuf2, pbuf3, 64, 60, 61, stats, count);
}
if (opt.saoCuStatsE0)
More information about the x265-devel
mailing list