[x265] [PATCH] sao: add C and sse4 code of saoCuOrgE1 to process 2 rows
Divya Manivannan
divya at multicorewareinc.com
Wed Apr 8 06:26:41 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428406902 -19800
# Tue Apr 07 17:11:42 2015 +0530
# Node ID 7044924d68147152533fe1502df2c75a3512befb
# Parent 3e416dec8024b8339b18568cf65e48eb3448bed1
sao: add C and sse4 code of saoCuOrgE1 to process 2 rows
diff -r 3e416dec8024 -r 7044924d6814 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/loopfilter.cpp Tue Apr 07 17:11:42 2015 +0530
@@ -77,6 +77,25 @@
}
}
+void processSaoCUE1_2Rows(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width) // adds offsetEo entries to two consecutive rows of rec; each pixel is compared with the pixel one stride further on
+{
+ int x, y;
+ int8_t signDown; // signOf(current pixel - pixel one stride further on)
+ int edgeType; // index used to look up offsetEo
+
+ for (y = 0; y < 2; y++) // exactly two rows are processed per call
+ {
+ for (x = 0; x < width; x++)
+ {
+ signDown = signOf(rec[x] - rec[x + stride]);
+ edgeType = signDown + upBuff1[x] + 2; // upBuff1[x] is what the previous row stored for this column (caller-supplied on the first row); presumably yields 0..4 if signOf returns -1/0/+1 -- confirm
+ upBuff1[x] = -signDown; // negated sign is carried over to the next row for column x
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]); // x265_clip presumably clamps to the valid pixel range -- confirm
+ }
+ rec += stride; // step to the next row; upBuff1 now holds this row's negated signs
+ }
+}
+
void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
{
int x;
@@ -124,6 +143,7 @@
{
p.saoCuOrgE0 = processSaoCUE0;
p.saoCuOrgE1 = processSaoCUE1;
+ p.saoCuOrgE1_2Rows = processSaoCUE1_2Rows;
p.saoCuOrgE2 = processSaoCUE2;
p.saoCuOrgE3 = processSaoCUE3;
p.saoCuOrgB0 = processSaoCUB0;
diff -r 3e416dec8024 -r 7044924d6814 source/common/primitives.h
--- a/source/common/primitives.h Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/primitives.h Tue Apr 07 17:11:42 2015 +0530
@@ -274,7 +274,7 @@
sign_t sign;
saoCuOrgE0_t saoCuOrgE0;
- saoCuOrgE1_t saoCuOrgE1;
+ saoCuOrgE1_t saoCuOrgE1, saoCuOrgE1_2Rows;
saoCuOrgE2_t saoCuOrgE2;
saoCuOrgE3_t saoCuOrgE3;
saoCuOrgB0_t saoCuOrgB0;
diff -r 3e416dec8024 -r 7044924d6814 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue Apr 07 17:11:42 2015 +0530
@@ -1374,6 +1374,7 @@
p.sign = x265_calSign_sse4;
p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
+ p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
diff -r 3e416dec8024 -r 7044924d6814 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.asm Tue Apr 07 17:11:42 2015 +0530
@@ -191,46 +191,121 @@
mov r3d, r3m
mov r4d, r4m
pxor m0, m0 ; m0 = 0
- movu m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+ mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
mova m7, [pb_128]
shr r4d, 4
- .loop
- movu m1, [r0] ; m1 = pRec[x]
- movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
+.loop
+ movu m1, [r0] ; m1 = pRec[x]
+ movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
- pxor m3, m1, m7
- pxor m4, m2, m7
- pcmpgtb m2, m3, m4
- pcmpgtb m4, m3
- pand m2, [pb_1]
- por m2, m4
+ pxor m3, m1, m7
+ pxor m4, m2, m7
+ pcmpgtb m2, m3, m4
+ pcmpgtb m4, m3
+ pand m2, [pb_1]
+ por m2, m4
- movu m3, [r1] ; m3 = m_iUpBuff1
+ movu m3, [r1] ; m3 = m_iUpBuff1
- paddb m3, m2
- paddb m3, m6
+ paddb m3, m2
+ paddb m3, m6
- movu m4, [r2] ; m4 = m_iOffsetEo
- pshufb m5, m4, m3
+ movu m4, [r2] ; m4 = m_iOffsetEo
+ pshufb m5, m4, m3
- psubb m3, m0, m2
- movu [r1], m3
+ psubb m3, m0, m2
+ movu [r1], m3
- pmovzxbw m2, m1
- punpckhbw m1, m0
- pmovsxbw m3, m5
- punpckhbw m5, m5
- psraw m5, 8
+ pmovzxbw m2, m1
+ punpckhbw m1, m0
+ pmovsxbw m3, m5
+ punpckhbw m5, m5
+ psraw m5, 8
- paddw m2, m3
- paddw m1, m5
- packuswb m2, m1
- movu [r0], m2
+ paddw m2, m3
+ paddw m1, m5
+ packuswb m2, m1
+ movu [r0], m2
- add r0, 16
- add r1, 16
- dec r4d
- jnz .loop
+ add r0, 16
+ add r1, 16
+ dec r4d
+ jnz .loop
+ RET
+
+;========================================================================================================
+; void saoCuOrgE1_2Rows(pixel *pRec, int8_t *m_iUpBuff1, int8_t *m_iOffsetEo, intptr_t iStride, int iLcuWidth)
+;========================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE1_2Rows, 3, 5, 8, pRec, m_iUpBuff1, m_iOffsetEo, iStride, iLcuWidth ; x86inc: 3 args auto-loaded (r0-r2), 5 GPRs and 8 XMM registers used
+ mov r3d, r3m ; r3 = iStride (only the low 32 bits are loaded)
+ mov r4d, r4m ; r4 = iLcuWidth
+ pxor m0, m0 ; m0 = 0
+ mova m7, [pb_128] ; m7 = bias XORed into pixels before pcmpgtb (pb_128 is presumably sixteen 0x80 bytes, making the signed compare order unsigned pixels -- confirm)
+ shr r4d, 4 ; r4 = iLcuWidth / 16 = loop count; NOTE(review): a width below 16 gives 0 and the dec/jnz below would wrap -- confirm callers never pass that
+.loop ; one iteration filters 16 pixels of row 0 and row 1; NOTE(review): label has no trailing colon, which NASM/yasm may warn about
+ movu m1, [r0] ; m1 = row 0: pRec[x]
+ movu m2, [r0 + r3] ; m2 = row 1: pRec[x + iStride]
+
+ pxor m3, m1, m7 ; m3 = biased row 0
+ pxor m4, m2, m7 ; m4 = biased row 1 (kept for the second comparison too)
+ pcmpgtb m6, m3, m4 ; m6 = 0xFF where row 0 > row 1
+ pcmpgtb m5, m4, m3 ; m5 = 0xFF where row 1 > row 0
+ pand m6, [pb_1] ; turn the 0xFF lanes into +1 (pb_1 is presumably sixteen 0x01 bytes -- confirm)
+ por m6, m5 ; m6 = sign(row 0 - row 1): +1, 0 or -1 (0xFF) per byte
+
+ movu m5, [r0 + r3 * 2] ; m5 = row 2: pRec[x + 2 * iStride]
+ pxor m3, m5, m7 ; m3 = biased row 2
+ pcmpgtb m5, m4, m3 ; m5 = 0xFF where row 1 > row 2
+ pcmpgtb m3, m4 ; m3 = 0xFF where row 2 > row 1
+ pand m5, [pb_1] ; 0xFF lanes -> +1
+ por m5, m3 ; m5 = sign(row 1 - row 2), computed before row 1 is overwritten
+
+ movu m3, [r1] ; m3 = m_iUpBuff1
+ paddb m3, m6 ; + sign(row 0 - row 1)
+ paddb m3, [pb_2] ; m3 = edge type index for row 0 (sign + m_iUpBuff1 + 2)
+
+ movu m4, [r2] ; m4 = m_iOffsetEo
+ pshufb m4, m3 ; m4 = m_iOffsetEo[edge type] for each of the 16 bytes
+
+ psubb m3, m0, m6 ; m3 = -sign(row 0 - row 1)
+ movu [r1], m3 ; m_iUpBuff1 = negated sign, read back for row 1 below
+
+ pmovzxbw m6, m1 ; m6 = low 8 pixels of row 0, zero-extended to words
+ punpckhbw m1, m0 ; m1 = high 8 pixels of row 0, zero-extended to words
+ pmovsxbw m3, m4 ; m3 = low 8 offsets, sign-extended to words
+ punpckhbw m4, m4 ; high 8 offsets duplicated into both bytes of each word ...
+ psraw m4, 8 ; ... so the arithmetic shift leaves them sign-extended
+
+ paddw m6, m3 ; pixel + offset, low half
+ paddw m1, m4 ; pixel + offset, high half
+ packuswb m6, m1 ; pack to bytes with unsigned saturation (0..255)
+ movu [r0], m6 ; store filtered row 0
+
+ movu m3, [r1] ; m3 = m_iUpBuff1 (the value stored just above)
+ paddb m3, m5 ; + sign(row 1 - row 2)
+ paddb m3, [pb_2] ; m3 = edge type index for row 1
+
+ movu m4, [r2] ; m4 = m_iOffsetEo
+ pshufb m4, m3 ; m4 = m_iOffsetEo[edge type] for row 1
+ psubb m3, m0, m5 ; m3 = -sign(row 1 - row 2)
+ movu [r1], m3 ; m_iUpBuff1 = negated sign left for whichever row is processed next
+
+ pmovzxbw m5, m2 ; m5 = low 8 pixels of row 1 (original values from m2), zero-extended
+ punpckhbw m2, m0 ; m2 = high 8 pixels of row 1, zero-extended
+ pmovsxbw m3, m4 ; m3 = low 8 offsets, sign-extended
+ punpckhbw m4, m4 ; high 8 offsets duplicated into both bytes of each word ...
+ psraw m4, 8 ; ... then sign-extended by the arithmetic shift
+
+ paddw m5, m3 ; pixel + offset, low half
+ paddw m2, m4 ; pixel + offset, high half
+ packuswb m5, m2 ; pack to bytes with unsigned saturation (0..255)
+ movu [r0 + r3], m5 ; store filtered row 1
+
+ add r0, 16 ; advance pRec by 16 pixels
+ add r1, 16 ; advance m_iUpBuff1 by 16 entries
+ dec r4d ; one fewer 16-pixel column to go
+ jnz .loop
RET
;======================================================================================================================================================
diff -r 3e416dec8024 -r 7044924d6814 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Tue Apr 07 16:00:39 2015 -0500
+++ b/source/common/x86/loopfilter.h Tue Apr 07 17:11:42 2015 +0530
@@ -28,6 +28,7 @@
void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
void x265_saoCuOrgE0_avx2(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
+void x265_saoCuOrgE1_2Rows_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
diff -r 3e416dec8024 -r 7044924d6814 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Tue Apr 07 16:00:39 2015 -0500
+++ b/source/encoder/sao.cpp Tue Apr 07 17:11:42 2015 +0530
@@ -367,11 +367,14 @@
{
primitives.sign(upBuff1, rec, tmpU, ctuWidth);
- for (y = startY; y < endY; y++)
+ int diff = (endY - startY) % 2;
+ for (y = startY; y < endY - diff; y += 2)
{
+ primitives.saoCuOrgE1_2Rows(rec, upBuff1, m_offsetEo, stride, ctuWidth);
+ rec += 2 * stride;
+ }
+ if (diff & 1)
primitives.saoCuOrgE1(rec, upBuff1, m_offsetEo, stride, ctuWidth);
- rec += stride;
- }
}
break;
diff -r 3e416dec8024 -r 7044924d6814 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Apr 07 16:00:39 2015 -0500
+++ b/source/test/pixelharness.cpp Tue Apr 07 17:11:42 2015 +0530
@@ -1687,6 +1687,15 @@
}
}
+ if (opt.saoCuOrgE1_2Rows)
+ {
+ if (!check_saoCuOrgE1_t(ref.saoCuOrgE1_2Rows, opt.saoCuOrgE1_2Rows))
+ {
+ printf("SAO_EO_1_2Rows failed\n");
+ return false;
+ }
+ }
+
if (opt.saoCuOrgE2)
{
if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
@@ -2065,6 +2074,12 @@
REPORT_SPEEDUP(opt.saoCuOrgE1, ref.saoCuOrgE1, pbuf1, psbuf2, psbuf1, 64, 64);
}
+ if (opt.saoCuOrgE1_2Rows)
+ {
+ HEADER0("SAO_EO_1_2Rows");
+ REPORT_SPEEDUP(opt.saoCuOrgE1_2Rows, ref.saoCuOrgE1_2Rows, pbuf1, psbuf2, psbuf1, 64, 64);
+ }
+
if (opt.saoCuOrgE2)
{
HEADER0("SAO_EO_2");
More information about the x265-devel
mailing list