[x265] [PATCH 1 of 2] asm: saoCuOrgE3 asm code
Steve Borho
steve at borho.org
Wed Jan 7 12:41:30 CET 2015
On 01/07, nabajit at multicorewareinc.com wrote:
> # HG changeset patch
> # User Nabajit Deka
> # Date 1420620491 -19800
> # Wed Jan 07 14:18:11 2015 +0530
> # Node ID 9ec89f245be8ca4468362cb095172dbc92bd5140
> # Parent 6cc757f662ed982a2f64122eba8e557d8ef0abba
> asm: saoCuOrgE3 asm code
>
> diff -r 6cc757f662ed -r 9ec89f245be8 source/common/loopfilter.cpp
> --- a/source/common/loopfilter.cpp Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/common/loopfilter.cpp Wed Jan 07 14:18:11 2015 +0530
> @@ -87,6 +87,22 @@
> }
> }
>
> +void processSaoCUE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
Queued these three patches, with the white-space nits cleaned up and the m_
prefix removed from the function argument.
> + {
> + int8_t signDown;
> + int8_t edgeType;
> +
> + for (int x = startX + 1; x < endX; x++)
> + {
> + signDown = signOf(rec[x] - rec[x + stride]);
> + edgeType = signDown + upBuff1[x] + 2;
> + upBuff1[x - 1] = -signDown;
> +
> + short v = rec[x] + m_offsetEo[edgeType];
> + rec[x] = (pixel)(v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v);
This looks like it should be `rec[x] = x265_clip(v);` -- please change it here
and review the other functions in this file for similar changes.
> + }
> + }
> +
> void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
> {
> #define SAO_BO_BITS 5
> @@ -113,6 +129,7 @@
> p.saoCuOrgE0 = processSaoCUE0;
> p.saoCuOrgE1 = processSaoCUE1;
> p.saoCuOrgE2 = processSaoCUE2;
> + p.saoCuOrgE3 = processSaoCUE3;
> p.saoCuOrgB0 = processSaoCUB0;
> p.sign = calSign;
> }
> diff -r 6cc757f662ed -r 9ec89f245be8 source/common/primitives.h
> --- a/source/common/primitives.h Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/common/primitives.h Wed Jan 07 14:18:11 2015 +0530
> @@ -193,6 +193,7 @@
> typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
> typedef void (*saoCuOrgE1_t)(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
> typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> +typedef void (*saoCuOrgE3_t)(pixel* rec, int8_t* upBuff1, int8_t* m_offsetEo, intptr_t stride, int startX, int endX);
> typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
> typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> @@ -248,6 +249,7 @@
> saoCuOrgE0_t saoCuOrgE0;
> saoCuOrgE1_t saoCuOrgE1;
> saoCuOrgE2_t saoCuOrgE2;
> + saoCuOrgE3_t saoCuOrgE3;
> saoCuOrgB0_t saoCuOrgB0;
>
> downscale_t frameInitLowres;
> diff -r 6cc757f662ed -r 9ec89f245be8 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp Wed Jan 07 14:18:11 2015 +0530
> @@ -1652,6 +1652,7 @@
> p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
> p.saoCuOrgE1 = x265_saoCuOrgE1_sse4;
> p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
> + p.saoCuOrgE3 = x265_saoCuOrgE3_sse4;
> p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
>
> LUMA_ADDAVG(_sse4);
> diff -r 6cc757f662ed -r 9ec89f245be8 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/common/x86/loopfilter.asm Wed Jan 07 14:18:11 2015 +0530
> @@ -188,6 +188,88 @@
> jnz .loop
> RET
>
> +;=======================================================================================================
> +;void saoCuOrgE3(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX)
> +;=======================================================================================================
> +INIT_XMM sse4
> +cglobal saoCuOrgE3, 3, 7, 8
> + mov r3d, r3m
> + mov r4d, r4m
> + mov r5d, r5m
> +
> + mov r6d, r5d
> + sub r6d, r4d
> +
> + inc r4d
> + add r0, r4
> + add r1, r4
> + movh m7, [r0 + r6 - 1]
> + mov r6, [r1 + r6 - 2]
> + pxor m0, m0 ; m0 = 0
> + movu m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
> +
> +.loop:
> + movu m1, [r0] ; m1 = pRec[x]
> + movu m2, [r0 + r3] ; m2 = pRec[x + iStride]
> +
> + psubusb m3, m2, m1
> + psubusb m4, m1, m2
> + pcmpeqb m3, m0
> + pcmpeqb m4, m0
> + pcmpeqb m2, m1
> +
> + pabsb m3, m3
> + por m4, m3
> + pandn m2, m4 ; m2 = iSignDown
> +
> + movu m3, [r1] ; m3 = m_iUpBuff1
> +
> + paddb m3, m2
> + paddb m3, m6 ; m3 = uiEdgeType
> +
> + movu m4, [r2] ; m4 = m_iOffsetEo
> + pshufb m5, m4, m3
> +
> + psubb m3, m0, m2
> + movu [r1 - 1], m3
> +
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + pmovsxbw m3, m5
> + punpckhbw m5, m5
> + psraw m5, 8
> +
> + paddw m2, m3
> + paddw m1, m5
> + packuswb m2, m1
> + movu [r0], m2
> +
> + sub r5d, 16
> + jle .end
> +
> + lea r0, [r0 + 16]
> + lea r1, [r1 + 16]
> +
> + jnz .loop
> +
> +.end:
> + js .skip
> + sub r0, r4
> + sub r1, r4
> + movh [r0 + 16], m7
> + mov [r1 + 15], r6
> + jmp .quit
> +
> +.skip:
> + sub r0, r4
> + sub r1, r4
> + movh [r0 + 15], m7
> + mov [r1 + 14], r6
> +
> +.quit:
> +
> + RET
> +
> ;=====================================================================================
> ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
> ;=====================================================================================
> diff -r 6cc757f662ed -r 9ec89f245be8 source/common/x86/loopfilter.h
> --- a/source/common/x86/loopfilter.h Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/common/x86/loopfilter.h Wed Jan 07 14:18:11 2015 +0530
> @@ -28,6 +28,7 @@
> void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
> void x265_saoCuOrgE1_sse4(pixel* rec, int8_t* upBuff1, int8_t* offsetEo, intptr_t stride, int width);
> void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> +void x265_saoCuOrgE3_sse4(pixel *rec, int8_t *upBuff1, int8_t *m_offsetEo, intptr_t stride, int startX, int endX);
> void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
>
> diff -r 6cc757f662ed -r 9ec89f245be8 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/test/pixelharness.cpp Wed Jan 07 14:18:11 2015 +0530
> @@ -978,6 +978,35 @@
> return true;
> }
>
> +bool PixelHarness::check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt)
> +{
> + ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> + ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> +
> + memset(ref_dest, 0xCD, sizeof(ref_dest));
> + memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> + int j = 0;
> +
> + for (int i = 0; i < ITERS; i++)
> + {
> + int stride = 16 * (rand() % 4 + 1);
> + int start = rand() % 2;
> + int end = (16 * (rand() % 4 + 1)) - rand() % 2;
> +
> + ref(ref_dest, psbuf2 + j, psbuf1 + j, stride, start, end);
> + checked(opt, opt_dest, psbuf5 + j, psbuf1 + j, stride, start, end);
> +
> + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)) || memcmp(psbuf2, psbuf5, BUFFSIZE))
> + return false;
> +
> + reportfail();
> + j += INCR;
> + }
> +
> + return true;
> +}
> +
> bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
> {
> ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -1542,6 +1571,15 @@
> }
> }
>
> + if (opt.saoCuOrgE3)
> + {
> + if (!check_saoCuOrgE3_t(ref.saoCuOrgE3, opt.saoCuOrgE3))
> + {
> + printf("SAO_EO_3 failed\n");
> + return false;
> + }
> + }
> +
> if (opt.saoCuOrgB0)
> {
> if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
> @@ -1892,6 +1930,12 @@
> REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
> }
>
> + if (opt.saoCuOrgE3)
> + {
> + HEADER0("SAO_EO_3");
> + REPORT_SPEEDUP(opt.saoCuOrgE3, ref.saoCuOrgE3, pbuf1, psbuf2, psbuf1, 64, 0, 64);
> + }
> +
> if (opt.saoCuOrgB0)
> {
> HEADER0("SAO_BO_0");
> diff -r 6cc757f662ed -r 9ec89f245be8 source/test/pixelharness.h
> --- a/source/test/pixelharness.h Wed Jan 07 13:44:23 2015 +0530
> +++ b/source/test/pixelharness.h Wed Jan 07 14:18:11 2015 +0530
> @@ -96,6 +96,7 @@
> bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
> bool check_saoCuOrgE1_t(saoCuOrgE1_t ref, saoCuOrgE1_t opt);
> bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
> + bool check_saoCuOrgE3_t(saoCuOrgE3_t ref, saoCuOrgE3_t opt);
> bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
> bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
> bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list