[x265] [PATCH] sao: modify saoCuOrgE3_2Rows C code and add sse4 code
Divya Manivannan
divya at multicorewareinc.com
Mon Apr 20 15:32:01 CEST 2015
# HG changeset patch
# User Divya Manivannan <divya at multicorewareinc.com>
# Date 1429536293 -19800
# Mon Apr 20 18:54:53 2015 +0530
# Node ID b0aff8e0b995bd0e507825be0796e18694b60f1f
# Parent 5c3443546cccea47316d59dbc4f892e1b6f8b1b5
sao: modify saoCuOrgE3_2Rows C code and add sse4 code
SAO_EO_3_2Rows 9.52x 1042.79 9930.47
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/loopfilter.cpp Mon Apr 20 18:54:53 2015 +0530
@@ -122,25 +122,21 @@
}
}
-void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* signDown)
+void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff) // upBuff[y] is the value stored into upBuff1[endX - 1] after row y
{
- int8_t signDown1;
+ int8_t signDown; // sign of (rec[x] - rec[x + stride]) for the pixel being processed
int8_t edgeType;
for (int y = 0; y < 2; y++)
{
- edgeType = signDown[y] + upBuff1[startX] + 2;
- upBuff1[startX - 1] = -signDown[y];
- rec[startX] = x265_clip(rec[startX] + offsetEo[edgeType]);
-
for (int x = startX + 1; x < endX; x++)
{
- signDown1 = signOf(rec[x] - rec[x + stride]);
- edgeType = signDown1 + upBuff1[x] + 2;
- upBuff1[x - 1] = -signDown1;
+ signDown = signOf(rec[x] - rec[x + stride]); // compare with the pixel one 'stride' further on
+ edgeType = signDown + upBuff1[x] + 2; // index used to look up offsetEo
+ upBuff1[x - 1] = -signDown; // negated sign is stored one slot to the left
rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
}
- upBuff1[endX - 1] = signOf(rec[endX - 1 + stride + 1] - rec[endX]);
+ upBuff1[endX - 1] = upBuff[y]; // caller-supplied value replaces the sign that used to be computed here
rec += stride + 1;
}
}
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/primitives.h
--- a/source/common/primitives.h Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/primitives.h Mon Apr 20 18:54:53 2015 +0530
@@ -172,7 +172,7 @@
typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
-typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* signDown);
+typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Mon Apr 20 18:54:53 2015 +0530
@@ -1507,6 +1507,7 @@
p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
+ p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
LUMA_ADDAVG(sse4);
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/x86/loopfilter.asm Mon Apr 20 18:54:53 2015 +0530
@@ -582,6 +582,135 @@
movhps [r1 + r5 - 1], xm7
RET
+;=============================================================================================================================
+;void saoCuOrgE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
+;=============================================================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgE3_2Rows, 3, 7, 8 ; x86inc: 3 args auto-loaded (r0 = rec, r1 = upBuff1, r2 = m_offsetEo), 7 GPRs, 8 XMM regs
+ mov r3d, r3m ; r3 = stride; NOTE(review): only the low 32 bits are loaded, so stride is assumed non-negative - confirm with callers
+ mov r4d, r4m ; r4 = startX
+ movu m5, [r2] ; m5 = 16 bytes loaded from m_offsetEo, used as the pshufb lookup table
+ mov r2d, r5m ; r2 = endX (r2 is reused once the table is held in m5)
+ mov r6, r6m ; r6 = upBuff
+
+ movh m7, [r0 + r2] ; m7 low half = 8 bytes at rec[endX..], saved so they can be restored after the loop
+ movhps m7, [r1 + r2 - 1] ; m7 high half = 8 bytes at upBuff1[endX - 1..], saved likewise
+
+ inc r4d ; r4 = startX + 1, the first x that is processed
+ add r0, r4 ; r0 = &rec[startX + 1]
+ add r1, r4 ; r1 = &upBuff1[startX + 1]
+
+ sub r2d, r4d ; r2 = endX - (startX + 1) = number of pixels to process in this row
+ pxor m0, m0 ; m0 = 0
+ mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
+
+.loop: ; first row, 16 pixels per iteration
+ movu m1, [r0] ; m1 = pRec[x]
+ movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
+
+ psubusb m3, m2, m1 ; m3 = saturating (m2 - m1): 0 where pRec[x + iStride] <= pRec[x]
+ psubusb m4, m1, m2 ; m4 = saturating (m1 - m2): 0 where pRec[x] <= pRec[x + iStride]
+ pcmpeqb m3, m0 ; m3 = 0xFF where pRec[x] >= pRec[x + iStride]
+ pcmpeqb m4, m0 ; m4 = 0xFF (-1) where pRec[x] <= pRec[x + iStride]
+ pcmpeqb m2, m1 ; m2 = 0xFF where the two pixels are equal
+
+ pabsb m3, m3 ; m3 = +1 where pRec[x] >= pRec[x + iStride]
+ por m4, m3 ; m4 = +1 where greater, -1 where less (and -1 in equal lanes, cleared next)
+ pandn m2, m4 ; m2 = iSignDown (equal lanes masked to 0)
+
+ movu m3, [r1] ; m3 = m_iUpBuff1
+
+ paddb m3, m2 ; m3 = upBuff1[x] + iSignDown
+ paddb m3, m6 ; m3 = uiEdgeType
+
+ pshufb m4, m5, m3 ; m4 = m_offsetEo[uiEdgeType] for each byte
+
+ psubb m3, m0, m2 ; m3 = -iSignDown
+ movu [r1 - 1], m3 ; upBuff1[x - 1] = -iSignDown (16 bytes at once)
+
+ pmovzxbw m2, m1 ; m2 = low 8 pixels zero-extended to words
+ punpckhbw m1, m0 ; m1 = high 8 pixels zero-extended to words
+ pmovsxbw m3, m4 ; m3 = low 8 offsets sign-extended to words
+ punpckhbw m4, m4 ; high 8 offsets placed in the upper byte of each word
+ psraw m4, 8 ; m4 = high 8 offsets sign-extended to words
+
+ paddw m2, m3 ; pixel + offset (low half)
+ paddw m1, m4 ; pixel + offset (high half)
+ packuswb m2, m1 ; pack back to bytes with unsigned saturation (clip to 0..255)
+ movu [r0], m2 ; always stores 16 pixels; bytes from rec[endX] on are restored from m7 below
+
+ add r0, 16
+ add r1, 16
+ sub r2, 16 ; 16 fewer pixels remaining
+ jg .loop ; continue while pixels remain
+
+ add r0, r2 ; r2 <= 0 here, so r0 = &rec[endX]
+ add r1, r2 ; r1 = &upBuff1[endX]
+ movh [r0], m7 ; restore the 8 bytes saved from rec[endX..]
+ movhps [r1 - 1], m7 ; restore the 8 bytes saved from upBuff1[endX - 1..]
+
+ mov r5d, r5m ; r5 = endX
+ mov r2b, byte[r6] ; r2b = upBuff[0]
+ mov byte[r1 - 1], r2b ; upBuff1[endX - 1] = upBuff[0]
+
+ sub r0, r5 ; r0 = rec
+ lea r0, [r0 + r3 + 1] ; r0 = rec + stride + 1, the base of the second row
+
+ movh m7, [r0 + r5] ; save 8 bytes at index endX of the second row
+ movhps m7, [r1 - 1] ; save 8 bytes at upBuff1[endX - 1..] (which now starts with upBuff[0])
+
+ sub r1, r5 ; r1 = upBuff1
+ add r0, r4 ; r0 = &secondRow[startX + 1]
+ add r1, r4 ; r1 = &upBuff1[startX + 1]
+ sub r5d, r4d ; r5 = endX - (startX + 1) = number of pixels to process in this row
+
+.loop1: ; second row, same per-iteration work as .loop with r5 as the counter
+ movu m1, [r0] ; m1 = pRec[x]
+ movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
+
+ psubusb m3, m2, m1 ; m3 = saturating (m2 - m1): 0 where pRec[x + iStride] <= pRec[x]
+ psubusb m4, m1, m2 ; m4 = saturating (m1 - m2): 0 where pRec[x] <= pRec[x + iStride]
+ pcmpeqb m3, m0 ; m3 = 0xFF where pRec[x] >= pRec[x + iStride]
+ pcmpeqb m4, m0 ; m4 = 0xFF (-1) where pRec[x] <= pRec[x + iStride]
+ pcmpeqb m2, m1 ; m2 = 0xFF where the two pixels are equal
+
+ pabsb m3, m3 ; m3 = +1 where pRec[x] >= pRec[x + iStride]
+ por m4, m3 ; m4 = +1 where greater, -1 where less (and -1 in equal lanes, cleared next)
+ pandn m2, m4 ; m2 = iSignDown (equal lanes masked to 0)
+
+ movu m3, [r1] ; m3 = m_iUpBuff1
+
+ paddb m3, m2 ; m3 = upBuff1[x] + iSignDown
+ paddb m3, m6 ; m3 = uiEdgeType
+
+ pshufb m4, m5, m3 ; m4 = m_offsetEo[uiEdgeType] for each byte
+
+ psubb m3, m0, m2 ; m3 = -iSignDown
+ movu [r1 - 1], m3 ; upBuff1[x - 1] = -iSignDown (16 bytes at once)
+
+ pmovzxbw m2, m1 ; m2 = low 8 pixels zero-extended to words
+ punpckhbw m1, m0 ; m1 = high 8 pixels zero-extended to words
+ pmovsxbw m3, m4 ; m3 = low 8 offsets sign-extended to words
+ punpckhbw m4, m4 ; high 8 offsets placed in the upper byte of each word
+ psraw m4, 8 ; m4 = high 8 offsets sign-extended to words
+
+ paddw m2, m3 ; pixel + offset (low half)
+ paddw m1, m4 ; pixel + offset (high half)
+ packuswb m2, m1 ; pack back to bytes with unsigned saturation (clip to 0..255)
+ movu [r0], m2 ; always stores 16 pixels; bytes from index endX on are restored from m7 below
+
+ add r0, 16
+ add r1, 16
+ sub r5, 16 ; 16 fewer pixels remaining
+ jg .loop1 ; continue while pixels remain
+
+ movh [r0 + r5], m7 ; r5 <= 0 here: restore the 8 bytes saved at index endX of the second row
+ movhps [r1 + r5 - 1], m7 ; restore the 8 bytes saved from upBuff1[endX - 1..]
+
+ mov r2b, byte[r6 + 1] ; r2b = upBuff[1]
+ mov byte[r1 + r5 - 1], r2b ; upBuff1[endX - 1] = upBuff[1]
+ RET
+
;=====================================================================================
; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
;=====================================================================================
diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h Sat Apr 18 10:02:19 2015 -0700
+++ b/source/common/x86/loopfilter.h Mon Apr 20 18:54:53 2015 +0530
@@ -34,6 +34,7 @@
void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
+void x265_saoCuOrgE3_2Rows_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
diff -r 5c3443546ccc -r b0aff8e0b995 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp Sat Apr 18 10:02:19 2015 -0700
+++ b/source/encoder/sao.cpp Mon Apr 20 18:54:53 2015 +0530
@@ -517,13 +517,26 @@
upBuff1[ctuWidth - 1] = lastSign;
int diff = endY - startY;
- for (y = 0; y < diff / 2; y++)
+ for (y = 0; y < (diff >> 1); y++)
{
- int8_t signDown[2];
- signDown[0] = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
- signDown[1] = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
+ int8_t signDown, signDown0, upBuff[2];
+ int edgeType1;
- primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, signDown);
+ signDown = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
+ edgeType1 = signDown + upBuff1[startX] + 2;
+ rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType1]];
+
+ signDown = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
+ signDown0 = signOf(rec[startX + 1] - rec[startX + stride]);
+ edgeType1 = signDown - signDown0 + 2;
+ upBuff1[startX - 1] = -signDown;
+
+ upBuff[0] = signOf(rec[endX - 1 + stride] - rec[endX]);
+ upBuff[1] = signOf(rec[endX - 1 + 2 * stride] - rec[endX + stride]);
+
+ primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, upBuff);
+
+ rec[startX + stride] = m_clipTable[rec[startX + stride] + m_offsetEo[edgeType1]];
rec += 2 * stride;
}
More information about the x265-devel mailing list