[x265] [PATCH] sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately
Divya Manivannan
divya at multicorewareinc.com
Wed Apr 22 08:44:37 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429684176 -19800
# Wed Apr 22 11:59:36 2015 +0530
# Node ID 584211b333ac9640d81423b3f60a18956425e27c
# Parent 86268e498680951069c48b681eef830b0aa37873
sao: remove saoCuOrgE3_2Rows function and modify saoCuOrgE3 primitive to handle width=16 separately
diff -r 86268e498680 -r 584211b333ac source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/loopfilter.cpp Wed Apr 22 11:59:36 2015 +0530
@@ -122,25 +122,6 @@
}
}
-void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
-{
- int8_t signDown;
- int8_t edgeType;
-
- for (int y = 0; y < 2; y++)
- {
- for (int x = startX + 1; x < endX; x++)
- {
- signDown = signOf(rec[x] - rec[x + stride]);
- edgeType = signDown + upBuff1[x] + 2;
- upBuff1[x - 1] = -signDown;
- rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
- }
- upBuff1[endX - 1] = upBuff[y];
- rec += stride + 1;
- }
-}
-
void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
{
#define SAO_BO_BITS 5
@@ -164,8 +145,8 @@
p.saoCuOrgE1 = processSaoCUE1;
p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
p.saoCuOrgE2 = processSaoCUE2;
- p.saoCuOrgE3 = processSaoCUE3;
- p.saoCuOrgE3_2Rows = processSaoCUE3_2Rows;
+ p.saoCuOrgE3[0] = processSaoCUE3;
+ p.saoCuOrgE3[1] = processSaoCUE3;
p.saoCuOrgB0 = processSaoCUB0;
p.sign = calSign;
}
diff -r 86268e498680 -r 584211b333ac source/common/primitives.h
--- a/source/common/primitives.h Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/primitives.h Wed Apr 22 11:59:36 2015 +0530
@@ -172,7 +172,6 @@
typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
-typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
@@ -278,8 +277,7 @@
saoCuOrgE0_t saoCuOrgE0;
saoCuOrgE1_t saoCuOrgE1, saoCuOrgE1_2Rows;
saoCuOrgE2_t saoCuOrgE2;
- saoCuOrgE3_t saoCuOrgE3;
- saoCuOrgE3_2Rows_t saoCuOrgE3_2Rows;
+ saoCuOrgE3_t saoCuOrgE3[2];
saoCuOrgB0_t saoCuOrgB0;
downscale_t frameInitLowres;
diff -r 86268e498680 -r 584211b333ac source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Apr 22 11:59:36 2015 +0530
@@ -1519,8 +1519,8 @@
p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
- p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
- p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
+ p.saoCuOrgE3[0] = x265_saoCuOrgE3_sse4;
+ p.saoCuOrgE3[1] = x265_saoCuOrgE3_sse4;
p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
LUMA_ADDAVG(sse4);
@@ -1728,7 +1728,7 @@
p.saoCuOrgE0 = x265_saoCuOrgE0_avx2;
p.saoCuOrgE1 = x265_saoCuOrgE1_avx2;
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_avx2;
- p.saoCuOrgE3 = x265_saoCuOrgE3_avx2;
+ p.saoCuOrgE3[0] = x265_saoCuOrgE3_avx2;
p.saoCuOrgB0 = x265_saoCuOrgB0_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r 86268e498680 -r 584211b333ac source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.asm Wed Apr 22 11:59:36 2015 +0530
@@ -582,135 +582,6 @@
movhps [r1 + r5 - 1], xm7
RET
-;=============================================================================================================================
-;void saoCuOrgE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
-;=============================================================================================================================
-INIT_XMM sse4
-cglobal saoCuOrgE3_2Rows, 3, 7, 8
- mov r3d, r3m
- mov r4d, r4m
- movu m5, [r2]
- mov r2d, r5m
- mov r6, r6m
-
- movh m7, [r0 + r2]
- movhps m7, [r1 + r2 - 1]
-
- inc r4d
- add r0, r4
- add r1, r4
-
- sub r2d, r4d
- pxor m0, m0 ; m0 = 0
- mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
-
-.loop:
- movu m1, [r0] ; m1 = pRec[x]
- movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
-
- psubusb m3, m2, m1
- psubusb m4, m1, m2
- pcmpeqb m3, m0
- pcmpeqb m4, m0
- pcmpeqb m2, m1
-
- pabsb m3, m3
- por m4, m3
- pandn m2, m4 ; m2 = iSignDown
-
- movu m3, [r1] ; m3 = m_iUpBuff1
-
- paddb m3, m2
- paddb m3, m6 ; m3 = uiEdgeType
-
- pshufb m4, m5, m3
-
- psubb m3, m0, m2
- movu [r1 - 1], m3
-
- pmovzxbw m2, m1
- punpckhbw m1, m0
- pmovsxbw m3, m4
- punpckhbw m4, m4
- psraw m4, 8
-
- paddw m2, m3
- paddw m1, m4
- packuswb m2, m1
- movu [r0], m2
-
- add r0, 16
- add r1, 16
- sub r2, 16
- jg .loop
-
- add r0, r2
- add r1, r2
- movh [r0], m7
- movhps [r1 - 1], m7
-
- mov r5d, r5m
- mov r2b, byte[r6]
- mov byte[r1 - 1], r2b
-
- sub r0, r5
- lea r0, [r0 + r3 + 1]
-
- movh m7, [r0 + r5]
- movhps m7, [r1 - 1]
-
- sub r1, r5
- add r0, r4
- add r1, r4
- sub r5d, r4d
-
-.loop1:
- movu m1, [r0] ; m1 = pRec[x]
- movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
-
- psubusb m3, m2, m1
- psubusb m4, m1, m2
- pcmpeqb m3, m0
- pcmpeqb m4, m0
- pcmpeqb m2, m1
-
- pabsb m3, m3
- por m4, m3
- pandn m2, m4 ; m2 = iSignDown
-
- movu m3, [r1] ; m3 = m_iUpBuff1
-
- paddb m3, m2
- paddb m3, m6 ; m3 = uiEdgeType
-
- pshufb m4, m5, m3
-
- psubb m3, m0, m2
- movu [r1 - 1], m3
-
- pmovzxbw m2, m1
- punpckhbw m1, m0
- pmovsxbw m3, m4
- punpckhbw m4, m4
- psraw m4, 8
-
- paddw m2, m3
- paddw m1, m4
- packuswb m2, m1
- movu [r0], m2
-
- add r0, 16
- add r1, 16
- sub r5, 16
- jg .loop1
-
- movh [r0 + r5], m7
- movhps [r1 + r5 - 1], m7
-
- mov r2b, byte[r6 + 1]
- mov byte[r1 + r5 - 1], r2b
- RET
-
;=====================================================================================
; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
;=====================================================================================
diff -r 86268e498680 -r 584211b333ac source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Wed Apr 22 00:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.h Wed Apr 22 11:59:36 2015 +0530
@@ -34,7 +34,6 @@
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
-void x265_saoCuOrgE3_2Rows_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 86268e498680 -r 584211b333ac source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/encoder/sao.cpp Wed Apr 22 11:59:36 2015 +0530
@@ -516,41 +516,20 @@
if (rpelx == picWidth)
upBuff1[ctuWidth - 1] = lastSign;
- int diff = endY - startY;
- for (y = 0; y < (diff >> 1); y++)
- {
- int8_t signDown, signDown0, upBuff[2];
- int edgeType1;
-
- signDown = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
- edgeType1 = signDown + upBuff1[startX] + 2;
- rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType1]];
-
- signDown = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
- signDown0 = signOf(rec[startX + 1] - rec[startX + stride]);
- edgeType1 = signDown - signDown0 + 2;
- upBuff1[startX - 1] = -signDown;
-
- upBuff[0] = signOf(rec[endX - 1 + stride] - rec[endX]);
- upBuff[1] = signOf(rec[endX - 1 + 2 * stride] - rec[endX + stride]);
-
- primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, upBuff);
-
- rec[startX + stride] = m_clipTable[rec[startX + stride] + m_offsetEo[edgeType1]];
-
- rec += 2 * stride;
- }
- if (diff & 1)
- {
- int8_t signDown1 = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
- int edgeType = signDown1 + upBuff1[startX] + 2;
- upBuff1[startX - 1] = -signDown1;
- rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType]];
-
- primitives.saoCuOrgE3(rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
-
- upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
- }
+ for (y = startY; y < endY; y++)
+ {
+ x = startX;
+ int8_t signDown = signOf(rec[x] - tmpL[y + 1]);
+ int edgeType = signDown + upBuff1[x] + 2;
+ upBuff1[x - 1] = -signDown;
+ rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+
+ primitives.saoCuOrgE3[endX > 16](rec, upBuff1, m_offsetEo, stride - 1, startX, endX);
+
+ upBuff1[endX - 1] = signOf(rec[endX - 1 + stride] - rec[endX]);
+
+ rec += stride;
+ }
}
break;
diff -r 86268e498680 -r 584211b333ac source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Wed Apr 22 00:00:39 2015 -0500
+++ b/source/test/pixelharness.cpp Wed Apr 22 11:59:36 2015 +0530
@@ -66,7 +66,7 @@
sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
psbuf1[i] = psbuf4[i] = (rand() % 65) - 32; // range is between -32 to 32
- psbuf2[i] = psbuf5[i] = psbuf6[i] = psbuf7[i] = (rand() % 3) - 1; // possible values {-1,0,1}
+ psbuf2[i] = psbuf5[i] = (rand() % 3) - 1; // possible values {-1,0,1}
psbuf3[i] = (rand() % 129) - 128;
sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
}
@@ -1011,34 +1011,34 @@
return true;
}
-bool PixelHarness::check_saoCuOrgE3_2Rows_t(saoCuOrgE3_2Rows_t ref, saoCuOrgE3_2Rows_t opt)
-{
- ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
- ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
-
- memset(ref_dest, 0xCD, sizeof(ref_dest));
- memset(opt_dest, 0xCD, sizeof(opt_dest));
-
- int j = 0;
-
- for (int i = 0; i < ITERS; i++)
- {
- int stride = 16 * (rand() % 4 + 1);
- int start = rand() % 2;
- int end = (16 * (rand() % 4 + 1)) - rand() % 2;
-
- ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end, psbuf6 + j);
- checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end, psbuf7 + j);
-
- if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
- return false;
-
- reportfail();
- j += INCR;
- }
-
- return true;
-}
+bool PixelHarness::check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt)
+{
+ ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+ ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+ memset(ref_dest, 0xCD, sizeof(ref_dest));
+ memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+ int j = 0;
+
+ for (int i = 0; i < ITERS; i++)
+ {
+ int stride = 32 * (rand() % 2 + 1);
+ int start = rand() % 2;
+ int end = (32 * (rand() % 2 + 1)) - rand() % 2;
+
+ ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end);
+ checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end);
+
+ if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
+ return false;
+
+ reportfail();
+ j += INCR;
+ }
+
+ return true;
+}
bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
{
@@ -1788,20 +1788,20 @@
}
}
- if (opt.saoCuOrgE3)
+ if (opt.saoCuOrgE3[0])
{
- if (!check_saoCuOrgE3_t(ref.saoCuOrgE3, opt.saoCuOrgE3))
+ if (!check_saoCuOrgE3_t(ref.saoCuOrgE3[0], opt.saoCuOrgE3[0]))
{
- printf("SAO_EO_3 failed\n");
+ printf("SAO_EO_3[0] failed\n");
return false;
}
}
- if (opt.saoCuOrgE3_2Rows)
+ if (opt.saoCuOrgE3[1])
{
- if (!check_saoCuOrgE3_2Rows_t(ref.saoCuOrgE3_2Rows, opt.saoCuOrgE3_2Rows))
+ if (!check_saoCuOrgE3_32_t(ref.saoCuOrgE3[1], opt.saoCuOrgE3[1]))
{
- printf("SAO_EO_3_2Rows failed\n");
+ printf("SAO_EO_3[1] failed\n");
return false;
}
}
@@ -2192,16 +2192,16 @@
REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
}
- if (opt.saoCuOrgE3)
+ if (opt.saoCuOrgE3[0])
{
- HEADER0("SAO_EO_3");
- REPORT_SPEEDUP(opt.saoCuOrgE3, ref.saoCuOrgE3, pbuf1, psbuf2, psbuf1, 64, 0, 64);
+ HEADER0("SAO_EO_3[0]");
+ REPORT_SPEEDUP(opt.saoCuOrgE3[0], ref.saoCuOrgE3[0], pbuf1, psbuf2, psbuf1, 64, 0, 64);
}
- if (opt.saoCuOrgE3_2Rows)
+ if (opt.saoCuOrgE3[1])
{
- HEADER0("SAO_EO_3_2Rows");
- REPORT_SPEEDUP(opt.saoCuOrgE3_2Rows, ref.saoCuOrgE3_2Rows, pbuf1, psbuf2, psbuf1, 64, 0, 64, psbuf6);
+ HEADER0("SAO_EO_3[1]");
+ REPORT_SPEEDUP(opt.saoCuOrgE3[1], ref.saoCuOrgE3[1], pbuf1, psbuf2, psbuf1, 64, 0, 64);
}
if (opt.saoCuOrgB0)
diff -r 86268e498680 -r 584211b333ac source/test/pixelharness.h
--- a/source/test/pixelharness.h Wed Apr 22 00:00:39 2015 -0500
+++ b/source/test/pixelharness.h Wed Apr 22 11:59:36 2015 +0530
@@ -51,8 +51,6 @@
int8_t psbuf3[BUFFSIZE];
int8_t psbuf4[BUFFSIZE];
int8_t psbuf5[BUFFSIZE];
- int8_t psbuf6[BUFFSIZE];
- int8_t psbuf7[BUFFSIZE];
int16_t sbuf1[BUFFSIZE];
int16_t sbuf2[BUFFSIZE];
@@ -100,7 +98,7 @@
bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
- bool check_saoCuOrgE3_2Rows_t(saoCuOrgE3_2Rows_t ref, saoCuOrgE3_2Rows_t opt);
+ bool check_saoCuOrgE3_32_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
More information about the x265-devel
mailing list