[x265] [PATCH] sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows
Divya Manivannan
divya at multicorewareinc.com
Mon Apr 6 10:38:18 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1428308807 -19800
# Mon Apr 06 13:56:47 2015 +0530
# Node ID a3ec7f6ba97beeb1208f9dbde25b7f5817d94ded
# Parent ebe5e57c4b45b45338035a1009b64585f21d66d5
sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows
diff -r ebe5e57c4b45 -r a3ec7f6ba97b source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Sat Apr 04 15:11:39 2015 -0500
+++ b/source/common/loopfilter.cpp Mon Apr 06 13:56:47 2015 +0530
@@ -42,18 +42,23 @@
dst[x] = signOf(src1[x] - src2[x]);
}
-void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
+void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
{
- int x;
- int8_t signRight;
+ int x, y;
+ int8_t signRight, signLeft0;
int8_t edgeType;
- for (x = 0; x < width; x++)
+ for (y = 0; y < 2; y++)
{
- signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
- edgeType = signRight + signLeft + 2;
- signLeft = -signRight;
- rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+ signLeft0 = signLeft[y];
+ for (x = 0; x < width; x++)
+ {
+ signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
+ edgeType = signRight + signLeft0 + 2;
+ signLeft0 = -signRight;
+ rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
+ }
+ rec += stride;
}
}
diff -r ebe5e57c4b45 -r a3ec7f6ba97b source/common/primitives.h
--- a/source/common/primitives.h Sat Apr 04 15:11:39 2015 -0500
+++ b/source/common/primitives.h Mon Apr 06 13:56:47 2015 +0530
@@ -168,7 +168,7 @@
typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
-typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t* signLeft, intptr_t stride);
typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
diff -r ebe5e57c4b45 -r a3ec7f6ba97b source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Sat Apr 04 15:11:39 2015 -0500
+++ b/source/common/x86/loopfilter.asm Mon Apr 06 13:56:47 2015 +0530
@@ -40,20 +40,25 @@
;============================================================================================================
-; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
+; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
;============================================================================================================
INIT_XMM sse4
-cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
+cglobal saoCuOrgE0, 5, 5, 8, rec, offsetEo, lcuWidth, signLeft, stride
- neg r3 ; r3 = -signLeft
- movzx r3d, r3b
- movd m0, r3d
- mova m4, [pb_128] ; m4 = [80]
- pxor m5, m5 ; m5 = 0
- movu m6, [r1] ; m6 = offsetEo
+ mov r4d, r4m
+ mova m4, [pb_128] ; m4 = [80]
+ pxor m5, m5 ; m5 = 0
+ movu m6, [r1] ; m6 = offsetEo
+
+ movzx r1d, byte [r3]
+ inc r3
+ neg r1b
+ movd m0, r1d
+ lea r1, [r0 + r4]
+ mov r4d, r2d
.loop:
- movu m7, [r0] ; m1 = rec[x]
+ movu m7, [r0] ; m7 = rec[x]
movu m2, [r0 + 1] ; m2 = rec[x+1]
pxor m1, m7, m4
@@ -70,7 +75,7 @@
pxor m0, m0
palignr m0, m2, 15
paddb m2, m3
- paddb m2, [pb_2] ; m1 = uiEdgeType
+ paddb m2, [pb_2] ; m2 = uiEdgeType
pshufb m3, m6, m2
pmovzxbw m2, m7 ; rec
punpckhbw m7, m5
@@ -85,6 +90,43 @@
add r0q, 16
sub r2d, 16
jnz .loop
+
+ movzx r3d, byte [r3]
+ neg r3b
+ movd m0, r3d
+.loopH:
+ movu m7, [r1] ; m7 = rec[x]
+ movu m2, [r1 + 1] ; m2 = rec[x+1]
+
+ pxor m1, m7, m4
+ pxor m3, m2, m4
+ pcmpgtb m2, m1, m3
+ pcmpgtb m3, m1
+ pand m2, [pb_1]
+ por m2, m3
+
+ pslldq m3, m2, 1
+ por m3, m0
+
+ psignb m3, m4 ; m3 = signLeft
+ pxor m0, m0
+ palignr m0, m2, 15
+ paddb m2, m3
+ paddb m2, [pb_2] ; m2 = uiEdgeType
+ pshufb m3, m6, m2
+ pmovzxbw m2, m7 ; rec
+ punpckhbw m7, m5
+ pmovsxbw m1, m3 ; offsetEo
+ punpckhbw m3, m3
+ psraw m3, 8
+ paddw m2, m1
+ paddw m7, m3
+ packuswb m2, m7
+ movu [r1], m2
+
+ add r1q, 16
+ sub r4d, 16
+ jnz .loopH
RET
;==================================================================================================
diff -r ebe5e57c4b45 -r a3ec7f6ba97b source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Sat Apr 04 15:11:39 2015 -0500
+++ b/source/common/x86/loopfilter.h Mon Apr 06 13:56:47 2015 +0530
@@ -25,7 +25,7 @@
#ifndef X265_LOOPFILTER_H
#define X265_LOOPFILTER_H
-void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
diff -r ebe5e57c4b45 -r a3ec7f6ba97b source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Sat Apr 04 15:11:39 2015 -0500
+++ b/source/encoder/sao.cpp Mon Apr 06 13:56:47 2015 +0530
@@ -258,7 +258,7 @@
pixel* tmpL;
pixel* tmpU;
- int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
+ int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft1[2];
int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
@@ -279,7 +279,7 @@
{
case SAO_EO_0: // dir: -
{
- pixel firstPxl = 0, lastPxl = 0;
+ pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
startX = !lpelx;
endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
if (ctuWidth & 15)
@@ -301,25 +301,38 @@
}
else
{
- for (y = 0; y < ctuHeight; y++)
+ for (y = 0; y < ctuHeight; y += 2)
{
- int signLeft = signOf(rec[startX] - tmpL[y]);
+ signLeft1[0] = signOf(rec[startX] - tmpL[y]);
+ signLeft1[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
if (!lpelx)
+ {
firstPxl = rec[0];
+ row1FirstPxl = rec[stride];
+ }
if (rpelx == picWidth)
+ {
lastPxl = rec[ctuWidth - 1];
+ row1LastPxl = rec[stride + ctuWidth - 1];
+ }
- primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, (int8_t)signLeft);
+ primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft1, stride);
if (!lpelx)
+ {
rec[0] = firstPxl;
+ rec[stride] = row1FirstPxl;
+ }
if (rpelx == picWidth)
+ {
rec[ctuWidth - 1] = lastPxl;
+ rec[stride + ctuWidth - 1] = row1LastPxl;
+ }
- rec += stride;
+ rec += 2 * stride;
}
}
break;
diff -r ebe5e57c4b45 -r a3ec7f6ba97b source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Sat Apr 04 15:11:39 2015 -0500
+++ b/source/test/pixelharness.cpp Mon Apr 06 13:56:47 2015 +0530
@@ -908,12 +908,10 @@
for (int i = 0; i < ITERS; i++)
{
int width = 16 * (rand() % 4 + 1);
- int8_t sign = rand() % 3;
- if (sign == 2)
- sign = -1;
+ int stride = width + 1;
- ref(ref_dest, psbuf1 + j, width, sign);
- checked(opt, opt_dest, psbuf1 + j, width, sign);
+ ref(ref_dest, psbuf1 + j, width, psbuf2 + j, stride);
+ checked(opt, opt_dest, psbuf1 + j, width, psbuf5 + j, stride);
if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
return false;
@@ -2058,7 +2056,7 @@
if (opt.saoCuOrgE0)
{
HEADER0("SAO_EO_0");
- REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
+ REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, psbuf2, 64);
}
if (opt.saoCuOrgE1)
More information about the x265-devel mailing list