[x265] [PATCH] sao: modify saoCuOrgE3_2Rows C code and add sse4 code
Steve Borho
steve at borho.org
Mon Apr 20 17:22:34 CEST 2015
On 04/20, Divya Manivannan wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1429536293 -19800
> # Mon Apr 20 18:54:53 2015 +0530
> # Node ID b0aff8e0b995bd0e507825be0796e18694b60f1f
> # Parent 5c3443546cccea47316d59dbc4f892e1b6f8b1b5
> sao: modify saoCuOrgE3_2Rows C code and add sse4 code
>
> SAO_EO_3_2Rows 9.52x 1042.79 9930.47
queued
> diff -r 5c3443546ccc -r b0aff8e0b995 source/common/loopfilter.cpp
> --- a/source/common/loopfilter.cpp Sat Apr 18 10:02:19 2015 -0700
> +++ b/source/common/loopfilter.cpp Mon Apr 20 18:54:53 2015 +0530
> @@ -122,25 +122,21 @@
> }
> }
>
> -void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* signDown)
> +void processSaoCUE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
> {
> - int8_t signDown1;
> + int8_t signDown;
> int8_t edgeType;
>
> for (int y = 0; y < 2; y++)
> {
> - edgeType = signDown[y] + upBuff1[startX] + 2;
> - upBuff1[startX - 1] = -signDown[y];
> - rec[startX] = x265_clip(rec[startX] + offsetEo[edgeType]);
> -
> for (int x = startX + 1; x < endX; x++)
> {
> - signDown1 = signOf(rec[x] - rec[x + stride]);
> - edgeType = signDown1 + upBuff1[x] + 2;
> - upBuff1[x - 1] = -signDown1;
> + signDown = signOf(rec[x] - rec[x + stride]);
> + edgeType = signDown + upBuff1[x] + 2;
> + upBuff1[x - 1] = -signDown;
> rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
> }
> - upBuff1[endX - 1] = signOf(rec[endX - 1 + stride + 1] - rec[endX]);
> + upBuff1[endX - 1] = upBuff[y];
> rec += stride + 1;
> }
> }
> diff -r 5c3443546ccc -r b0aff8e0b995 source/common/primitives.h
> --- a/source/common/primitives.h Sat Apr 18 10:02:19 2015 -0700
> +++ b/source/common/primitives.h Mon Apr 20 18:54:53 2015 +0530
> @@ -172,7 +172,7 @@
> typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
> typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
> -typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* signDown);
> +typedef void (*saoCuOrgE3_2Rows_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
> typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
> typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Sat Apr 18 10:02:19 2015 -0700
> +++ b/source/common/x86/asm-primitives.cpp Mon Apr 20 18:54:53 2015 +0530
> @@ -1507,6 +1507,7 @@
> p.saoCuOrgE1_2Rows = x265_saoCuOrgE1_2Rows_sse4;
> p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
> p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
> + p.saoCuOrgE3_2Rows = x265_saoCuOrgE3_2Rows_sse4;
> p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
>
> LUMA_ADDAVG(sse4);
> diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm Sat Apr 18 10:02:19 2015 -0700
> +++ b/source/common/x86/loopfilter.asm Mon Apr 20 18:54:53 2015 +0530
> @@ -582,6 +582,135 @@
> movhps [r1 + r5 - 1], xm7
> RET
>
> +;=============================================================================================================================
> +;void saoCuOrgE3_2Rows(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff)
> +;=============================================================================================================================
> +INIT_XMM sse4
> +cglobal saoCuOrgE3_2Rows, 3, 7, 8
> + mov r3d, r3m
> + mov r4d, r4m
> + movu m5, [r2]
> + mov r2d, r5m
> + mov r6, r6m
> +
> + movh m7, [r0 + r2]
> + movhps m7, [r1 + r2 - 1]
> +
> + inc r4d
> + add r0, r4
> + add r1, r4
> +
> + sub r2d, r4d
> + pxor m0, m0 ; m0 = 0
> + mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
> +
> +.loop:
> + movu m1, [r0] ; m1 = pRec[x]
> + movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
> +
> + psubusb m3, m2, m1
> + psubusb m4, m1, m2
> + pcmpeqb m3, m0
> + pcmpeqb m4, m0
> + pcmpeqb m2, m1
> +
> + pabsb m3, m3
> + por m4, m3
> + pandn m2, m4 ; m2 = iSignDown
> +
> + movu m3, [r1] ; m3 = m_iUpBuff1
> +
> + paddb m3, m2
> + paddb m3, m6 ; m3 = uiEdgeType
> +
> + pshufb m4, m5, m3
> +
> + psubb m3, m0, m2
> + movu [r1 - 1], m3
> +
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + pmovsxbw m3, m4
> + punpckhbw m4, m4
> + psraw m4, 8
> +
> + paddw m2, m3
> + paddw m1, m4
> + packuswb m2, m1
> + movu [r0], m2
> +
> + add r0, 16
> + add r1, 16
> + sub r2, 16
> + jg .loop
> +
> + add r0, r2
> + add r1, r2
> + movh [r0], m7
> + movhps [r1 - 1], m7
> +
> + mov r5d, r5m
> + mov r2b, byte[r6]
> + mov byte[r1 - 1], r2b
> +
> + sub r0, r5
> + lea r0, [r0 + r3 + 1]
> +
> + movh m7, [r0 + r5]
> + movhps m7, [r1 - 1]
> +
> + sub r1, r5
> + add r0, r4
> + add r1, r4
> + sub r5d, r4d
> +
> +.loop1:
> + movu m1, [r0] ; m1 = pRec[x]
> + movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
> +
> + psubusb m3, m2, m1
> + psubusb m4, m1, m2
> + pcmpeqb m3, m0
> + pcmpeqb m4, m0
> + pcmpeqb m2, m1
> +
> + pabsb m3, m3
> + por m4, m3
> + pandn m2, m4 ; m2 = iSignDown
> +
> + movu m3, [r1] ; m3 = m_iUpBuff1
> +
> + paddb m3, m2
> + paddb m3, m6 ; m3 = uiEdgeType
> +
> + pshufb m4, m5, m3
> +
> + psubb m3, m0, m2
> + movu [r1 - 1], m3
> +
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + pmovsxbw m3, m4
> + punpckhbw m4, m4
> + psraw m4, 8
> +
> + paddw m2, m3
> + paddw m1, m4
> + packuswb m2, m1
> + movu [r0], m2
> +
> + add r0, 16
> + add r1, 16
> + sub r5, 16
> + jg .loop1
> +
> + movh [r0 + r5], m7
> + movhps [r1 + r5 - 1], m7
> +
> + mov r2b, byte[r6 + 1]
> + mov byte[r1 + r5 - 1], r2b
> + RET
> +
> ;=====================================================================================
> ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
> ;=====================================================================================
> diff -r 5c3443546ccc -r b0aff8e0b995 source/common/x86/loopfilter.h
> --- a/source/common/x86/loopfilter.h Sat Apr 18 10:02:19 2015 -0700
> +++ b/source/common/x86/loopfilter.h Mon Apr 20 18:54:53 2015 +0530
> @@ -34,6 +34,7 @@
> void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
> void x265_saoCuOrgE3_avx2(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
> +void x265_saoCuOrgE3_2Rows_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX, int8_t* upBuff);
> void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> void x265_saoCuOrgB0_avx2(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
> diff -r 5c3443546ccc -r b0aff8e0b995 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp Sat Apr 18 10:02:19 2015 -0700
> +++ b/source/encoder/sao.cpp Mon Apr 20 18:54:53 2015 +0530
> @@ -517,13 +517,26 @@
> upBuff1[ctuWidth - 1] = lastSign;
>
> int diff = endY - startY;
> - for (y = 0; y < diff / 2; y++)
> + for (y = 0; y < (diff >> 1); y++)
> {
> - int8_t signDown[2];
> - signDown[0] = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
> - signDown[1] = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
> + int8_t signDown, signDown0, upBuff[2];
> + int edgeType1;
>
> - primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, signDown);
> + signDown = signOf(rec[startX] - tmpL[y * 2 + 1 + startY]);
> + edgeType1 = signDown + upBuff1[startX] + 2;
> + rec[startX] = m_clipTable[rec[startX] + m_offsetEo[edgeType1]];
> +
> + signDown = signOf(rec[startX + stride] - tmpL[y * 2 + 2 + startY]);
> + signDown0 = signOf(rec[startX + 1] - rec[startX + stride]);
> + edgeType1 = signDown - signDown0 + 2;
> + upBuff1[startX - 1] = -signDown;
> +
> + upBuff[0] = signOf(rec[endX - 1 + stride] - rec[endX]);
> + upBuff[1] = signOf(rec[endX - 1 + 2 * stride] - rec[endX + stride]);
> +
> + primitives.saoCuOrgE3_2Rows(rec, upBuff1, m_offsetEo, stride - 1, startX, endX, upBuff);
> +
> + rec[startX + stride] = m_clipTable[rec[startX + stride] + m_offsetEo[edgeType1]];
>
> rec += 2 * stride;
> }
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list