[x265] [PATCH] saoCuOrgE2: asm code
Steve Borho
steve at borho.org
Tue Jan 6 12:56:07 CET 2015
On 01/06, praveen at multicorewareinc.com wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1420541256 -19800
> # Node ID 382dc33423b4d18ff7babbe8f97cbba58f77876b
> # Parent feebd0ecda691aeaf9265c7cb20897169df6866a
> aoCuOrgE2: asm code
queued
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/loopfilter.cpp
> --- a/source/common/loopfilter.cpp Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/loopfilter.cpp Tue Jan 06 16:17:36 2015 +0530
> @@ -57,6 +57,19 @@
> }
> }
>
> +void processSaoCUE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int width, intptr_t stride)
> +{
> + int x = 0;
> + for (x = 0; x < width; x++)
> + {
> + int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
> + int edgeType = signDown + buff1[x] + 2;
> + bufft[x + 1] = -signDown;
> + short v = rec[x] + offsetEo[edgeType];
> + rec[x] = (pixel)(v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v);
> + }
> +}
> +
> void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
> {
> #define SAO_BO_BITS 5
> @@ -81,6 +94,7 @@
> void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
> {
> p.saoCuOrgE0 = processSaoCUE0;
> + p.saoCuOrgE2 = processSaoCUE2;
> p.saoCuOrgB0 = processSaoCUB0;
> p.sign = calSign;
> }
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/primitives.h
> --- a/source/common/primitives.h Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/primitives.h Tue Jan 06 16:17:36 2015 +0530
> @@ -191,6 +191,7 @@
> typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
> typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
> +typedef void (*saoCuOrgE2_t)(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
> typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
> @@ -244,6 +245,7 @@
>
> sign_t sign;
> saoCuOrgE0_t saoCuOrgE0;
> + saoCuOrgE2_t saoCuOrgE2;
> saoCuOrgB0_t saoCuOrgB0;
>
> downscale_t frameInitLowres;
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/x86/asm-primitives.cpp Tue Jan 06 16:17:36 2015 +0530
> @@ -1650,6 +1650,7 @@
> {
> p.sign = x265_calSign_sse4;
> p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
> + p.saoCuOrgE2 = x265_saoCuOrgE2_sse4;
> p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
>
> LUMA_ADDAVG(_sse4);
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/loopfilter.asm
> --- a/source/common/x86/loopfilter.asm Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/x86/loopfilter.asm Tue Jan 06 16:17:36 2015 +0530
> @@ -35,6 +35,7 @@
> cextern pb_1
> cextern pb_128
> cextern pb_2
> +cextern pw_2
>
>
> ;============================================================================================================
> @@ -85,6 +86,58 @@
> jnz .loop
> RET
>
> +;======================================================================================================================================================
> +; void saoCuOrgE2(pixel * rec, int8_t * bufft, int8_t * buff1, int8_t * offsetEo, int lcuWidth, intptr_t stride)
> +;======================================================================================================================================================
> +INIT_XMM sse4
> +cglobal saoCuOrgE2, 5, 7, 8, rec, bufft, buff1, offsetEo, lcuWidth
> +
> + mov r6, 16
> + mov r5d, r5m
> + pxor m0, m0 ; m0 = 0
> + mova m6, [pb_2] ; m6 = [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]
> + mova m7, [pb_128]
> + shr r4d, 4
> + inc r1q
> +
> + .loop
> + movu m1, [r0] ; m1 = rec[x]
> + movu m2, [r0 + r5 + 1] ; m2 = rec[x + stride + 1]
> + pxor m3, m1, m7
> + pxor m4, m2, m7
> + pcmpgtb m2, m3, m4
> + pcmpgtb m4, m3
> + pand m2, [pb_1]
> + por m2, m4
> + movu m3, [r2] ; m3 = buff1
> +
> + paddb m3, m2
> + paddb m3, m6 ; m3 = edgeType
> +
> + movu m4, [r3] ; m4 = offsetEo
> + pshufb m4, m3
> +
> + psubb m3, m0, m2
> + movu [r1], m3
> +
> + pmovzxbw m2, m1
> + punpckhbw m1, m0
> + pmovsxbw m3, m4
> + punpckhbw m4, m4
> + psraw m4, 8
> +
> + paddw m2, m3
> + paddw m1, m4
> + packuswb m2, m1
> + movu [r0], m2
> +
> + add r0, r6
> + add r1, r6
> + add r2, r6
> + dec r4d
> + jnz .loop
> + RET
> +
> ;=====================================================================================
> ; void saoCuOrgB0(pixel* rec, const pixel* offset, int lcuWidth, int lcuHeight, int stride)
> ;=====================================================================================
> diff -r feebd0ecda69 -r 382dc33423b4 source/common/x86/loopfilter.h
> --- a/source/common/x86/loopfilter.h Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/common/x86/loopfilter.h Tue Jan 06 16:17:36 2015 +0530
> @@ -26,6 +26,7 @@
> #define X265_LOOPFILTER_H
>
> void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
> +void x265_saoCuOrgE2_sse4(pixel* rec, int8_t* pBufft, int8_t* pBuff1, int8_t* offsetEo, int lcuWidth, intptr_t stride);
> void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
> void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
>
> diff -r feebd0ecda69 -r 382dc33423b4 source/encoder/sao.cpp
> --- a/source/encoder/sao.cpp Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/encoder/sao.cpp Tue Jan 06 16:17:36 2015 +0530
> @@ -385,23 +385,54 @@
> upBuff1[x] = signOf(rec[x] - tmpU[x - 1]);
> }
>
> - for (y = startY; y < endY; y++)
> + if (ctuWidth & 15)
> {
> - upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
> - for (x = startX; x < endX; x++)
> - {
> - int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
> - int edgeType = signDown + upBuff1[x] + 2;
> - upBufft[x + 1] = -signDown;
> - rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
> - }
> + for (y = startY; y < endY; y++)
> + {
> + upBufft[startX] = signOf(rec[stride + startX] - tmpL[y]);
> + for (x = startX; x < endX; x++)
> + {
> + int8_t signDown = signOf(rec[x] - rec[x + stride + 1]);
> + int edgeType = signDown + upBuff1[x] + 2;
> + upBufft[x + 1] = -signDown;
> + rec[x] = m_clipTable[rec[x] + m_offsetEo[edgeType]];
> + }
>
> - std::swap(upBuff1, upBufft);
> + std::swap(upBuff1, upBufft);
>
> - rec += stride;
> + rec += stride;
> + }
> }
> + else
> + {
> + for (y = startY; y < endY; y++)
> + {
> + int8_t iSignDown2 = signOf(rec[stride + startX] - tmpL[y]);
> + pixel firstPxl = rec[0]; // copy first Pxl
> + pixel lastPxl = rec[ctuWidth - 1];
> + int8_t one = upBufft[1];
> + int8_t two = upBufft[endX + 1];
>
> - break;
> + primitives.saoCuOrgE2(rec, upBufft, upBuff1, m_offsetEo, ctuWidth, stride);
> + if (!lpelx)
> + {
> + rec[0] = firstPxl;
> + upBufft[1] = one;
> + }
> +
> + if (rpelx == picWidth)
> + {
> + rec[ctuWidth - 1] = lastPxl;
> + upBufft[endX + 1] = two;
> + }
> +
> + upBufft[startX] = iSignDown2;
> +
> + std::swap(upBuff1, upBufft);
> + rec += stride;
> + }
> + }
> + break;
> }
> case SAO_EO_3: // dir: 45
> {
> diff -r feebd0ecda69 -r 382dc33423b4 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/test/pixelharness.cpp Tue Jan 06 16:17:36 2015 +0530
> @@ -65,7 +65,9 @@
> sbuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
> sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
> ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
> - psbuf1[i] = (rand() % 65) - 32; // range is between -32 to 32
> + psbuf1[i] = psbuf4[i] = (rand() % 65) - 32; // range is between -32 to 32
> + psbuf2[i] = (rand() % 3) - 1;
> + psbuf3[i] = (rand() % 129) - 128;
> sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
> }
> }
> @@ -917,6 +919,37 @@
> return true;
> }
>
> +bool PixelHarness::check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt)
> +{
> + ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> + ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
> +
> + memset(ref_dest, 0xCD, sizeof(ref_dest));
> + memset(opt_dest, 0xCD, sizeof(opt_dest));
> +
> + int j = 0;
> +
> + for (int i = 0; i < ITERS; i++)
> + {
> + int width = 16 * (rand() % 4 + 1);
> + int stride = width + 1;
> +
> + ref(ref_dest, psbuf1 + j, psbuf2 + j, psbuf3 + j, width, stride);
> + checked(opt, opt_dest, psbuf4 + j, psbuf2 + j, psbuf3 + j, width, stride);
> +
> + if (memcmp(psbuf1 + j, psbuf4 + j, width * sizeof(int8_t)))
> + return false;
> +
> + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> + return false;
> +
> + reportfail();
> + j += INCR;
> + }
> +
> + return true;
> +}
> +
> bool PixelHarness::check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt)
> {
> ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> @@ -1463,6 +1496,15 @@
> }
> }
>
> + if (opt.saoCuOrgE2)
> + {
> + if (!check_saoCuOrgE2_t(ref.saoCuOrgE2, opt.saoCuOrgE2))
> + {
> + printf("SAO_EO_2 failed\n");
> + return false;
> + }
> + }
> +
> if (opt.saoCuOrgB0)
> {
> if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
> @@ -1801,6 +1843,12 @@
> REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
> }
>
> + if (opt.saoCuOrgE2)
> + {
> + HEADER0("SAO_EO_2");
> + REPORT_SPEEDUP(opt.saoCuOrgE2, ref.saoCuOrgE2, pbuf1, psbuf1, psbuf2, psbuf3, 64, 64);
> + }
> +
> if (opt.saoCuOrgB0)
> {
> HEADER0("SAO_BO_0");
> diff -r feebd0ecda69 -r 382dc33423b4 source/test/pixelharness.h
> --- a/source/test/pixelharness.h Mon Jan 05 18:57:20 2015 +0530
> +++ b/source/test/pixelharness.h Tue Jan 06 16:17:36 2015 +0530
> @@ -47,6 +47,9 @@
> pixel pbuf4[BUFFSIZE];
> int ibuf1[BUFFSIZE];
> int8_t psbuf1[BUFFSIZE];
> + int8_t psbuf2[BUFFSIZE];
> + int8_t psbuf3[BUFFSIZE];
> + int8_t psbuf4[BUFFSIZE];
>
> int16_t sbuf1[BUFFSIZE];
> int16_t sbuf2[BUFFSIZE];
> @@ -90,6 +93,7 @@
> bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
> bool check_addAvg(addAvg_t, addAvg_t);
> bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
> + bool check_saoCuOrgE2_t(saoCuOrgE2_t ref, saoCuOrgE2_t opt);
> bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
> bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
> bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list