[x265] [PATCH] sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows
Steve Borho
steve at borho.org
Wed Apr 1 22:14:15 CEST 2015
On Wed, Apr 1, 2015 at 8:40 AM, Divya Manivannan
<divya at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Divya Manivannan <divya at multicorewareinc.com>
> # Date 1427895336 -19800
> # Wed Apr 01 19:05:36 2015 +0530
> # Node ID f718abdc8004d0c859266b292730b7b5b3d0d4df
> # Parent ac85c775620f1dcb0df056874633cbf916098bd2
> sao: modify C and SSE4 code for saoCuOrgE0 to process 2 rows
/home/sborho/repos/x265/source/encoder/sao.cpp: In member function 'void x265::SAO::processSaoCu(int, int, int)':
/home/sborho/repos/x265/source/encoder/sao.cpp:289:21: warning: declaration of 'signLeft' shadows a previous local [-Wshadow]
int signLeft = signOf(rec[startX] - tmpL[y]);
^
/home/sborho/repos/x265/source/encoder/sao.cpp:261:64: warning: shadowed declaration is here [-Wshadow]
int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft[2];
> diff -r ac85c775620f -r f718abdc8004 source/common/loopfilter.cpp
> --- a/source/common/loopfilter.cpp Tue Mar 31 20:04:28 2015 -0500
> +++ b/source/common/loopfilter.cpp Wed Apr 01 19:05:36 2015 +0530
> @@ -42,18 +42,23 @@
> dst[x] = signOf(src1[x] - src2[x]);
> }
>
> -void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t signLeft)
> +void processSaoCUE0(pixel * rec, int8_t * offsetEo, int width, int8_t* signLeft, intptr_t stride)
> {
> - int x;
> - int8_t signRight;
> + int x, y;
> + int8_t signRight, signLeft0;
> int8_t edgeType;
>
> - for (x = 0; x < width; x++)
> + for (y = 0; y < 2; y++)
> {
> - signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
> - edgeType = signRight + signLeft + 2;
> - signLeft = -signRight;
> - rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
> + signLeft0 = signLeft[y];
> + for (x = 0; x < width; x++)
> + {
> + signRight = ((rec[x] - rec[x + 1]) < 0) ? -1 : ((rec[x] - rec[x + 1]) > 0) ? 1 : 0;
> + edgeType = signRight + signLeft0 + 2;
> + signLeft0 = -signRight;
> + rec[x] = x265_clip(rec[x] + offsetEo[edgeType]);
> + }
> + rec += stride;
> }
> }
>
> diff -r ac85c775620f -r f718abdc8004 source/common/primitives.h
> --- a/source/common/primitives.h Tue Mar 31 20:04:28 2015 -0500
> +++ b/source/common/primitives.h Wed Apr 01 19:05:36 2015 +0530
> @@ -169,7 +169,7 @@
> typedef void (*pixelavg_pp_t)(pixel* dst, intptr_t dstride, const pixel* src0, intptr_t sstride0, const pixel* src1, intptr_t sstride1, int weight);
> typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
> -typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
> +typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t* signLeft, intptr_t stride);
> typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
> typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
> diff -r ac85c775620f -r f718abdc8004 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm Tue Mar 31 20:04:28 2015 -0500
> +++ b/source/common/x86/loopfilter.asm Wed Apr 01 19:05:36 2015 +0530
> @@ -39,20 +39,25 @@
>
>
> ;============================================================================================================
> -; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
> +; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t* signLeft, intptr_t stride)
> ;============================================================================================================
> INIT_XMM sse4
> -cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
> +cglobal saoCuOrgE0, 5, 6, 8, rec, offsetEo, lcuWidth, signLeft, stride
>
> - neg r3 ; r3 = -signLeft
> - movzx r3d, r3b
> - movd m0, r3d
> - mova m4, [pb_128] ; m4 = [80]
> - pxor m5, m5 ; m5 = 0
> - movu m6, [r1] ; m6 = offsetEo
> + mov r4d, r4m
> + mova m4, [pb_128] ; m4 = [80]
> + pxor m5, m5 ; m5 = 0
> + movu m6, [r1] ; m6 = offsetEo
> +
> + movzx r5d, byte [r3]
> + inc r3
> + neg r5b
> + movd m0, r5d
> + lea r5, [r0 + r4]
> + mov r4d, r2d
>
> .loop:
> - movu m7, [r0] ; m1 = rec[x]
> + movu m7, [r0] ; m7 = rec[x]
> movu m2, [r0 + 1] ; m2 = rec[x+1]
>
> pxor m1, m7, m4
> @@ -69,7 +74,7 @@
> pxor m0, m0
> palignr m0, m2, 15
> paddb m2, m3
> - paddb m2, [pb_2] ; m1 = uiEdgeType
> + paddb m2, [pb_2] ; m2 = uiEdgeType
> pshufb m3, m6, m2
> pmovzxbw m2, m7 ; rec
> punpckhbw m7, m5
> @@ -84,6 +89,43 @@
> add r0q, 16
> sub r2d, 16
> jnz .loop
> +
> + movzx r3d, byte [r3]
> + neg r3b
> + movd m0, r3d
> +.loopH:
> + movu m7, [r5] ; m7 = rec[x]
> + movu m2, [r5 + 1] ; m2 = rec[x+1]
> +
> + pxor m1, m7, m4
> + pxor m3, m2, m4
> + pcmpgtb m2, m1, m3
> + pcmpgtb m3, m1
> + pand m2, [pb_1]
> + por m2, m3
> +
> + pslldq m3, m2, 1
> + por m3, m0
> +
> + psignb m3, m4 ; m3 = signLeft
> + pxor m0, m0
> + palignr m0, m2, 15
> + paddb m2, m3
> + paddb m2, [pb_2] ; m2 = uiEdgeType
> + pshufb m3, m6, m2
> + pmovzxbw m2, m7 ; rec
> + punpckhbw m7, m5
> + pmovsxbw m1, m3 ; offsetEo
> + punpckhbw m3, m3
> + psraw m3, 8
> + paddw m2, m1
> + paddw m7, m3
> + packuswb m2, m7
> + movu [r5], m2
> +
> + add r5q, 16
> + sub r4d, 16
> + jnz .loopH
> RET
>
> ;==================================================================================================
> diff -r ac85c775620f -r f718abdc8004 source/common/x86/loopfilter.h
> --- a/source/common/x86/loopfilter.h Tue Mar 31 20:04:28 2015 -0500
> +++ b/source/common/x86/loopfilter.h Wed Apr 01 19:05:36 2015 +0530
> @@ -25,7 +25,7 @@
> #ifndef X265_LOOPFILTER_H
> #define X265_LOOPFILTER_H
>
> -void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
> +void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t* signLeft, intptr_t stride);
> void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
> void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
> diff -r ac85c775620f -r f718abdc8004 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp Tue Mar 31 20:04:28 2015 -0500
> +++ b/source/encoder/sao.cpp Wed Apr 01 19:05:36 2015 +0530
> @@ -258,7 +258,7 @@
> pixel* tmpL;
> pixel* tmpU;
>
> - int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1;
> + int8_t _upBuff1[MAX_CU_SIZE + 2], *upBuff1 = _upBuff1 + 1, signLeft[2];
> int8_t _upBufft[MAX_CU_SIZE + 2], *upBufft = _upBufft + 1;
>
> memset(_upBuff1 + MAX_CU_SIZE, 0, 2 * sizeof(int8_t)); /* avoid valgrind uninit warnings */
> @@ -279,7 +279,7 @@
> {
> case SAO_EO_0: // dir: -
> {
> - pixel firstPxl = 0, lastPxl = 0;
> + pixel firstPxl = 0, lastPxl = 0, row1FirstPxl = 0, row1LastPxl = 0;
> startX = !lpelx;
> endX = (rpelx == picWidth) ? ctuWidth - 1 : ctuWidth;
> if (ctuWidth & 15)
> @@ -301,25 +301,38 @@
> }
> else
> {
> - for (y = 0; y < ctuHeight; y++)
> + for (y = 0; y < ctuHeight; y += 2)
> {
> - int signLeft = signOf(rec[startX] - tmpL[y]);
> -
> - if (!lpelx)
> - firstPxl = rec[0];
> -
> - if (rpelx == picWidth)
> - lastPxl = rec[ctuWidth - 1];
> -
> - primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, (int8_t)signLeft);
> -
> - if (!lpelx)
> - rec[0] = firstPxl;
> -
> - if (rpelx == picWidth)
> - rec[ctuWidth - 1] = lastPxl;
> -
> - rec += stride;
> + signLeft[0] = signOf(rec[startX] - tmpL[y]);
> + signLeft[1] = signOf(rec[stride + startX] - tmpL[y + 1]);
> +
> + if (!lpelx)
> + {
> + firstPxl = rec[0];
> + row1FirstPxl = rec[stride];
> + }
> +
> + if (rpelx == picWidth)
> + {
> + lastPxl = rec[ctuWidth - 1];
> + row1LastPxl = rec[stride + ctuWidth - 1];
> + }
> +
> + primitives.saoCuOrgE0(rec, m_offsetEo, ctuWidth, signLeft, stride);
> +
> + if (!lpelx)
> + {
> + rec[0] = firstPxl;
> + rec[stride] = row1FirstPxl;
> + }
> +
> + if (rpelx == picWidth)
> + {
> + rec[ctuWidth - 1] = lastPxl;
> + rec[stride + ctuWidth - 1] = row1LastPxl;
> + }
> +
> + rec += 2 * stride;
> }
> }
> break;
> diff -r ac85c775620f -r f718abdc8004 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Tue Mar 31 20:04:28 2015 -0500
> +++ b/source/test/pixelharness.cpp Wed Apr 01 19:05:36 2015 +0530
> @@ -908,12 +908,10 @@
> for (int i = 0; i < ITERS; i++)
> {
> int width = 16 * (rand() % 4 + 1);
> - int8_t sign = rand() % 3;
> - if (sign == 2)
> - sign = -1;
> -
> - ref(ref_dest, psbuf1 + j, width, sign);
> - checked(opt, opt_dest, psbuf1 + j, width, sign);
> + int stride = width + 1;
> +
> + ref(ref_dest, psbuf1 + j, width, psbuf2 + j, stride);
> + checked(opt, opt_dest, psbuf1 + j, width, psbuf5 + j, stride);
>
> if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> return false;
> @@ -2058,7 +2056,7 @@
> if (opt.saoCuOrgE0)
> {
> HEADER0("SAO_EO_0");
> - REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
> + REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, psbuf2, 64);
> }
>
> if (opt.saoCuOrgE1)
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list