[x265] [PATCH] saoCuOrgE2: asm code
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Jan 6 12:06:13 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1420541256 -19800
# Node ID 382dc33423b4d18ff7babbe8f97cbba58f77876b
# Parent feebd0ecda691aeaf9265c7cb20897169df6866a
saoCuOrgE2: asm code
diff -r feebd0ecda69 -r 382dc33423b4 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Mon Jan 05 18:57:20 2015 +0530
+++ b/source/common/loopfilter.cpp Tue Jan 06 16:17:36 2015 +0530
@@ -57,6 +57,19 @@
}
}
+// C implementation of the saoCuOrgE2 primitive. Filters `width` pixels starting at rec, in place.
+//   rec      - pixels to filter; rec[x + stride + 1] is read as the comparison neighbour
+//   bufft    - receives the negated neighbour sign of pixel x at index x + 1
+//   buff1    - per-pixel sign values that are added to the neighbour sign to form the table index
+//   offsetEo - offset table indexed by (sign + buff1[x] + 2)
+//   width    - number of pixels to process
+//   stride   - distance in pixels between rec and the row holding the neighbour
+void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
+{
+    for (int col = 0; col < width; col++)
+    {
+        const int8_t downSign = signOf(rec[col] - rec[col + stride + 1]);
+        const int category = downSign + buff1[col] + 2;
+
+        // the negated sign is stored one slot to the right
+        bufft[col + 1] = -downSign;
+
+        // apply the offset, then clamp the result to [0, PIXEL_MAX]
+        const short sum = rec[col] + offsetEo[category];
+        int clipped = sum;
+        if (sum < 0)
+            clipped = 0;
+        else if (sum > (PIXEL_MAX))
+            clipped = (PIXEL_MAX);
+
+        rec[col] = (pixel)clipped;
+    }
+}
+
void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
{
#define SAO_BO_BITS 5
@@ -81,6 +94,7 @@
void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
{
p.saoCuOrgE0 = processSaoCUE0;
+ p.saoCuOrgE2 = processSaoCUE2;
p.saoCuOrgB0 = processSaoCUB0;
p.sign = calSign;
}
diff -r feebd0ecda69 -r 382dc33423b4 source/common/primitives.h
--- a/source/common/primitives.h Mon Jan 05 18:57:20 2015 +0530
+++ b/source/common/primitives.h Tue Jan 06 16:17:36 2015 +0530
@@ -191,6 +191,7 @@
typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
@@ -244,6 +245,7 @@
sign_t sign;
saoCuOrgE0_t saoCuOrgE0;
+ saoCuOrgE2_t saoCuOrgE2;
saoCuOrgB0_t saoCuOrgB0;
downscale_t frameInitLowres;
diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Jan 05 18:57:20 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Jan 06 16:17:36 2015 +0530
@@ -1650,6 +1650,7 @@
{
p.sign = x265_calSign_sse4;
p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
+ p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
LUMA_ADDAVG(_sse4);
diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Mon Jan 05 18:57:20 2015 +0530
+++ b/source/common/x86/loopfilter.asm Tue Jan 06 16:17:36 2015 +0530
@@ -35,6 +35,7 @@
cextern pb_1
cextern pb_128
cextern pb_2
+cextern pw_2
;============================================================================================================
@@ -85,6 +86,58 @@
jnz .loop
RET
+;======================================================================================================================================================
+; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
+;======================================================================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
+ ; 16 pixels per iteration, lcuWidth >> 4 iterations; lcuWidth must be a non-zero multiple of 16 (a zero count would wrap at dec/jnz)
+ mov r6, 16 ; r6 = pointer step per iteration
+ mov r5d, r5m ; r5 = stride (sixth argument, loaded as 32 bits)
+ pxor m0, m0 ; m0 = 0
+ mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ mova m7, [pb_128] ; m7 = bias for the byte compares, presumably 16 bytes of 128 (constant defined elsewhere)
+ shr r4d, 4 ; r4d = lcuWidth / 16 = loop count
+ inc r1q ; r1 = &bufft[1]: results are stored at bufft[x + 1]
+ ; per iteration: signDown = sign(rec[x] - rec[x + stride + 1]) for 16 bytes at once
+ .loop
+ movu m1, [r0] ; m1 = rec[x]
+ movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
+ pxor m3, m1, m7 ; m3 = rec[x] biased by pb_128 so the signed compare below orders unsigned pixels
+ pxor m4, m2, m7 ; m4 = rec[x + stride + 1] biased the same way
+ pcmpgtb m2, m3, m4 ; m2 = 0xFF where rec[x] > rec[x + stride + 1]
+ pcmpgtb m4, m3 ; m4 = 0xFF (-1) where rec[x] < rec[x + stride + 1]
+ pand m2, [pb_1] ; mask with pb_1 (presumably bytes of 1): +1 where greater
+ por m2, m4 ; m2 = signDown (+1, 0 or -1 per byte)
+ movu m3, [r2] ; m3 = buff1
+ ; edgeType = signDown + buff1[x] + 2
+ paddb m3, m2
+ paddb m3, m6 ; m3 = edgeType
+ ; table lookup: each result byte is offsetEo[edgeType] (16 bytes are loaded from offsetEo)
+ movu m4, [r3] ; m4 = offsetEo
+ pshufb m4, m3 ; m4 = offsetEo[edgeType]
+ ; bufft[x + 1] = -signDown
+ psubb m3, m0, m2 ; m3 = 0 - signDown
+ movu [r1], m3
+ ; widen pixels (zero-extend) and offsets (sign-extend) to 16-bit words
+ pmovzxbw m2, m1 ; m2 = low 8 pixels as words
+ punpckhbw m1, m0 ; m1 = high 8 pixels as words
+ pmovsxbw m3, m4 ; m3 = low 8 offsets, sign-extended
+ punpckhbw m4, m4 ; each high offset byte duplicated into both halves of a word
+ psraw m4, 8 ; m4 = high 8 offsets, sign-extended
+ ; add the offsets; packuswb saturates each sum to 0..255 while narrowing back to bytes
+ paddw m2, m3
+ paddw m1, m4
+ packuswb m2, m1
+ movu [r0], m2 ; rec[x] = clipped result
+ ; advance rec, bufft and buff1 by 16 bytes
+ add r0, r6
+ add r1, r6
+ add r2, r6
+ dec r4d
+ jnz .loop
+ RET
+
;=====================================================================================
; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
;=====================================================================================
diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Mon Jan 05 18:57:20 2015 +0530
+++ b/source/common/x86/loopfilter.h Tue Jan 06 16:17:36 2015 +0530
@@ -26,6 +26,7 @@
#define X265_LOOPFILTER_H
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r feebd0ecda69 -r 382dc33423b4 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Mon Jan 05 18:57:20 2015 +0530
+++ b/source/encoder/sao.cpp Tue Jan 06 16:17:36 2015 +0530
@@ -385,23 +385,54 @@
upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
}
- for (y = startY; y < endY; y++)
+ if (ctuWidth & 15)
{
- upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
- for (x = startX; x < endX; x++)
- {
- int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
- int edgeType = signDown + upBuff1[x] + 2;
- upBufft[x + 1] = -signDown;
- rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
- }
+ for (y = startY; y < endY; y++)
+ {
+ upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
+ for (x = startX; x < endX; x++)
+ {
+ int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
+ int edgeType = signDown + upBuff1[x] + 2;
+ upBufft[x + 1] = -signDown;
+ rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
+ }
- std::swap(upBuff1, upBufft);
+ std::swap(upBuff1, upBufft);
- rec += stride;
+ rec += stride;
+ }
}
+ else
+ {
+ for (y = startY; y < endY; y++)
+ {
+ int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
+ pixel firstPxl = rec[0]; // copy first Pxl
+ pixel lastPxl = rec[ctuWidth - 1];
+ int8_t one = upBufft[1];
+ int8_t two = upBufft[endX + 1];
- break;
+ primitives.saoCuOrgE2(rec, upBufft, upBuff1, m_offsetEo, ctuWidth, stride);
+ if (!lpelx)
+ {
+ rec[0] = firstPxl;
+ upBufft[1] = one;
+ }
+
+ if (rpelx == picWidth)
+ {
+ rec[ctuWidth - 1] = lastPxl;
+ upBufft[endX + 1] = two;
+ }
+
+ upBufft[startX] = iSignDown2;
+
+ std::swap(upBuff1, upBufft);
+ rec += stride;
+ }
+ }
+ break;
}
case SAO_EO_3: // dir: 45
{
diff -r feebd0ecda69 -r 382dc33423b4 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Mon Jan 05 18:57:20 2015 +0530
+++ b/source/test/pixelharness.cpp Tue Jan 06 16:17:36 2015 +0530
@@ -65,7 +65,9 @@
sbuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
- psbuf1[i] = (rand() % 65) - 32; // range is between -32 to 32
+ psbuf1[i] = psbuf4[i] = (rand() % 65) - 32; // range is between -32 to 32
+ psbuf2[i] = (rand() % 3) - 1;
+ psbuf3[i] = (rand() % 129) - 128;
sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
}
}
@@ -917,6 +919,37 @@
return true;
}
+// Compares the saoCuOrgE2 primitive `opt` against `ref`: both are run on identical inputs and must
+// leave identical pixels in their destination buffers and identical sign bytes in their bufft
+// outputs. Returns false at the first mismatch, true if all ITERS iterations agree.
+bool PixelHarness::check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt)
+{
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    int j = 0;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        // widths are multiples of 16 in the range 16..64
+        int width = 16 * (rand() % 4 + 1);
+        int stride = width + 1;
+
+        // psbuf1 and psbuf4 are initialised with the same values and act as the two bufft outputs
+        ref(ref_dest, psbuf1 + j, psbuf2 + j, psbuf3 + j, width, stride);
+        checked(opt, opt_dest, psbuf4 + j, psbuf2 + j, psbuf3 + j, width, stride);
+
+        // the primitive stores bufft[x + 1] for x in [0, width), so indices 0..width must all match;
+        // comparing only `width` bytes would leave the last stored sign unchecked
+        if (memcmp(psbuf1 + j, psbuf4 + j, (width + 1) * sizeof(int8_t)))
+            return false;
+
+        if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
+            return false;
+
+        reportfail();
+        j += INCR;
+    }
+
+    return true;
+}
+
bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
{
ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
@@ -1463,6 +1496,15 @@
}
}
+ if (opt.saoCuOrgE2)
+ {
+ if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
+ {
+ printf("SAO_EO_2 failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuOrgB0)
{
if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
@@ -1801,6 +1843,12 @@
REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
}
+ if (opt.saoCuOrgE2)
+ {
+ HEADER0("SAO_EO_2");
+ REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
+ }
+
if (opt.saoCuOrgB0)
{
HEADER0("SAO_BO_0");
diff -r feebd0ecda69 -r 382dc33423b4 source/test/pixelharness.h
--- a/source/test/pixelharness.h Mon Jan 05 18:57:20 2015 +0530
+++ b/source/test/pixelharness.h Tue Jan 06 16:17:36 2015 +0530
@@ -47,6 +47,9 @@
pixel pbuf4[BUFFSIZE];
int ibuf1[BUFFSIZE];
int8_t psbuf1[BUFFSIZE];
+ int8_t psbuf2[BUFFSIZE];
+ int8_t psbuf3[BUFFSIZE];
+ int8_t psbuf4[BUFFSIZE];
int16_t sbuf1[BUFFSIZE];
int16_t sbuf2[BUFFSIZE];
@@ -90,6 +93,7 @@
bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
bool check_addAvg(addAvg_t, addAvg_t);
bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
+ bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
More information about the x265-devel
mailing list