[x265] [PATCH] asm: split SAO_EO_0 into separate primitive func, added assembly code and testbench support
Steve Borho
steve at borho.org
Fri Feb 28 21:49:30 CET 2014
On Fri, Feb 28, 2014 at 12:48 AM, <praveen at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Praveen Tiwari
> # Date 1393570037 -19800
> # Node ID 520cab36d86c791ca75d2ee15a0c0f507615ebf4
> # Parent d3e3baaf80b490f330d2171e454ad5b7856acaa7
> asm: split SAO_EO_0 into separate primitive func, added assembly code and testbench support
> added loopfilter.cpp, loopfilter.h, loopfilter.asm files for C and assembly code
This is loads better, but the test bench changes have loads of
whitespace issues. I'll have to run uncrustify on it prior to pushing
this patch.
>
> diff -r d3e3baaf80b4 -r 520cab36d86c source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp
> --- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp Fri Feb 28 12:17:17 2014 +0530
> @@ -44,7 +44,6 @@
> namespace x265 {
> //! \ingroup TLibCommon
> //! \{
> -
> SAOParam::~SAOParam()
> {
> for (int i = 0; i < 3; i++)
> @@ -535,8 +534,6 @@
> uint32_t tpely = tmpCu->getCUPelY();
> uint32_t rpelx;
> uint32_t bpely;
> - int signLeft;
> - int signRight;
> int signDown;
> int signDown1;
> int signDown2;
> @@ -614,23 +611,57 @@
> {
> case SAO_EO_0: // dir: -
> {
> - startX = (lpelx == 0) ? 1 : 0;
> - endX = (rpelx == picWidthTmp) ? lcuWidth - 1 : lcuWidth;
> - for (y = 0; y < lcuHeight; y++)
> - {
> - signLeft = xSign(rec[startX] - tmpL[y]);
> - for (x = startX; x < endX; x++)
> - {
> - signRight = xSign(rec[x] - rec[x + 1]);
> - edgeType = signRight + signLeft + 2;
> - signLeft = -signRight;
> + pixel firstPxl = 0, lastPxl = 0;
> + startX = (lpelx == 0) ? 1 : 0;
> + endX = (rpelx == picWidthTmp) ? lcuWidth-1 : lcuWidth;
> + if (lcuWidth % 16)
> + {
> + int8_t signRight;
> + int8_t edgeType;
> + for (y = 0; y < lcuHeight; y++)
> + {
> + int8_t signLeft = xSign(rec[startX] - tmpL[y]);
> + for (x = startX; x < endX; x++)
> + {
> + signRight = xSign(rec[x] - rec[x+1]);
> + edgeType = signRight + signLeft + 2;
> + signLeft = -signRight;
>
> - rec[x] = clipTbl[rec[x] + m_offsetEo[edgeType]];
> - }
> + rec[x] = Clip3(0, (1 << X265_DEPTH) - 1, rec[x] + m_offsetEo[edgeType]);
> + }
> + rec += stride;
> + }
> + }
> + else
> + {
> + for (y = 0; y < lcuHeight; y++)
> + {
> + int8_t signLeft = xSign(rec[startX] - tmpL[y]);
>
> - rec += stride;
> - }
> + if (lpelx == 0)
> + {
> + firstPxl = rec[0];
> + }
>
> + if (rpelx == picWidthTmp)
> + {
> + lastPxl = rec[lcuWidth - 1];
> + }
> +
> + primitives.saoCuOrgE0(rec, m_offsetEo, lcuWidth, signLeft);
> +
> + if (lpelx == 0)
> + {
> + rec[0] = firstPxl;
> + }
> +
> + if (rpelx == picWidthTmp)
> + {
> + rec[lcuWidth - 1] = lastPxl;
> + }
> + rec += stride;
> + }
> + }
> break;
> }
> case SAO_EO_1: // dir: |
> diff -r d3e3baaf80b4 -r 520cab36d86c source/Lib/TLibCommon/TComSampleAdaptiveOffset.h
> --- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h Fri Feb 28 12:17:17 2014 +0530
> @@ -143,11 +143,9 @@
> static const int m_numCulPartsLevel[5];
> static const uint32_t m_eoTable[9];
> static const int m_numClass[MAX_NUM_SAO_TYPE];
> -
> int32_t *m_offsetBo;
> int32_t *m_chromaOffsetBo;
> - int m_offsetEo[LUMA_GROUP_NUM];
> -
> + int8_t m_offsetEo[LUMA_GROUP_NUM];
> int m_picWidth;
> int m_picHeight;
> uint32_t m_maxSplitLevel;
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/CMakeLists.txt
> --- a/source/common/CMakeLists.txt Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/common/CMakeLists.txt Fri Feb 28 12:17:17 2014 +0530
> @@ -41,7 +41,6 @@
> ../Lib/TLibCommon/TComYuv.cpp)
> source_group(TLibCommon FILES ${LIBCOMMON_SRC})
> source_group(TLibCommonH FILES ${LIBCOMMON_HDR})
> -
> if(GCC)
> set_source_files_properties(${LIBCOMMON_SRC} PROPERTIES COMPILE_FLAGS
> "-Wno-sign-compare")
> @@ -102,14 +101,14 @@
>
> if(ENABLE_ASSEMBLY)
> set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
> - set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
> + set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
> set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
> mc-a2.asm ipfilter8.asm pixel-util8.asm blockcopy8.asm
> pixeladd8.asm dct8.asm)
> if(HIGH_BIT_DEPTH)
> set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm)
> else()
> - set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm)
> + set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm loopfilter.asm)
> endif()
>
> if(NOT X64)
> @@ -150,4 +149,5 @@
> common.cpp common.h
> param.cpp param.h
> lowres.cpp lowres.h
> - piclist.cpp piclist.h)
> + piclist.cpp piclist.h
> + loopfilter.cpp)
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/loopfilter.cpp
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/loopfilter.cpp Fri Feb 28 12:17:17 2014 +0530
> @@ -0,0 +1,52 @@
> +/*****************************************************************************
> +* Copyright (C) 2013 x265 project
> +*
> +* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> +* Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> +* This program is free software; you can redistribute it and/or modify
> +* it under the terms of the GNU General Public License as published by
> +* the Free Software Foundation; either version 2 of the License, or
> +* (at your option) any later version.
> +*
> +* This program is distributed in the hope that it will be useful,
> +* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +* GNU General Public License for more details.
> +*
> +* You should have received a copy of the GNU General Public License
> +* along with this program; if not, write to the Free Software
> +* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> +*
> +* This program is also available under a commercial proprietary license.
> +* For more information, contact us at licensing at multicorewareinc.com.
> +*****************************************************************************/
> +
> +#include "TLibCommon/TypeDef.h"
> +#include "primitives.h"
> +
> +#define PIXEL_MIN 0
> +#define PIXEL_MAX ((1 << X265_DEPTH) - 1)
> +
> +void processSaoCUE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
> +{
> + int x;
> + int8_t signRight;
> + int8_t edgeType;
> +
> + for (x = 0; x < lcuWidth; x++)
> + {
> + signRight = ((rec[x] - rec[x+1]) < 0) ? -1 : ((rec[x] - rec[x+1]) > 0) ? 1 : 0;
> + edgeType = signRight + signLeft + 2;
> + signLeft = -signRight;
> +
> + short v = rec[x] + offsetEo[edgeType];
> + rec[x] = (pixel)(v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v);
> + }
> +}
> +
> +namespace x265 {
> +void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
> +{
> + p.saoCuOrgE0 = processSaoCUE0;
> +}
> +}
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/primitives.cpp
> --- a/source/common/primitives.cpp Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/common/primitives.cpp Fri Feb 28 12:17:17 2014 +0530
> @@ -63,16 +63,16 @@
> void Setup_C_DCTPrimitives(EncoderPrimitives &p);
> void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
> void Setup_C_IPredPrimitives(EncoderPrimitives &p);
> -
> +void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
> void Setup_C_Primitives(EncoderPrimitives &p)
> {
> Setup_C_PixelPrimitives(p); // pixel.cpp
> Setup_C_DCTPrimitives(p); // dct.cpp
> Setup_C_IPFilterPrimitives(p); // ipfilter.cpp
> Setup_C_IPredPrimitives(p); // intrapred.cpp
> + Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
> }
> }
> -
> using namespace x265;
>
> /* cpuid == 0 - auto-detect CPU type, else
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/primitives.h
> --- a/source/common/primitives.h Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/common/primitives.h Fri Feb 28 12:17:17 2014 +0530
> @@ -185,8 +185,9 @@
>
> typedef void (*pixel_sub_ps_t)(int16_t *dst, intptr_t dstride, pixel *src0, pixel *src1, intptr_t sstride0, intptr_t sstride1);
> typedef void (*pixel_add_ps_t)(pixel *a, intptr_t dstride, pixel *b0, int16_t *b1, intptr_t sstride0, intptr_t sstride1);
> +typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
>
> -typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
> +typedef void (*saoCuOrgE0_t)(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft);
>
> /* Define a structure containing function pointers to optimized encoder
> * primitives. Each pointer can reference either an assembly routine,
> @@ -254,6 +255,8 @@
> downscale_t frame_init_lowres_core;
> plane_copy_deinterleave_t plane_copy_deinterleave_c;
> extendCURowBorder_t extendRowBorder;
> + // sao primitives
> + saoCuOrgE0_t saoCuOrgE0;
>
> struct
> {
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/common/x86/asm-primitives.cpp Fri Feb 28 12:17:17 2014 +0530
> @@ -32,6 +32,7 @@
> #include "pixel-util.h"
> #include "mc.h"
> #include "ipfilter8.h"
> +#include "loopfilter.h"
> #include "blockcopy8.h"
> #include "intrapred.h"
> #include "dct8.h"
> @@ -1149,6 +1150,8 @@
> }
> if (cpuMask & X265_CPU_SSE4)
> {
> + p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
> +
> LUMA_ADDAVG(_sse4);
> CHROMA_ADDAVG(_sse4);
> p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/x86/loopfilter.asm
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/loopfilter.asm Fri Feb 28 12:17:17 2014 +0530
> @@ -0,0 +1,85 @@
> +;*****************************************************************************
> +;* Copyright (C) 2013 x265 project
> +;*
> +;* Authors: Min Chen <chenm001 at 163.com>
> +;* Praveen Kumar Tiwari <praveen at multicorewareinc.com>
> +;* Nabajit Deka <nabajit at multicorewareinc.com>
> +;* Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> +;* Murugan Vairavel <murugan at multicorewareinc.com>
> +;* Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
> +;* This program is free software; you can redistribute it and/or modify
> +;* it under the terms of the GNU General Public License as published by
> +;* the Free Software Foundation; either version 2 of the License, or
> +;* (at your option) any later version.
> +;*
> +;* This program is distributed in the hope that it will be useful,
> +;* but WITHOUT ANY WARRANTY; without even the implied warranty of
> +;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> +;* GNU General Public License for more details.
> +;*
> +;* You should have received a copy of the GNU General Public License
> +;* along with this program; if not, write to the Free Software
> +;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> +;*
> +;* This program is also available under a commercial proprietary license.
> +;* For more information, contact us at licensing at multicorewareinc.com.
> +;*****************************************************************************/
> +
> +%include "x86inc.asm"
> +
> +SECTION_RODATA 32
> +
> +pw_2: times 16 db 2
> +
> +SECTION .text
> +
> +;============================================================================================================
> +; void saoCuOrgE0(pixel * rec, int8_t * offsetEo, int lcuWidth, int8_t signLeft)
> +;============================================================================================================
> +INIT_XMM sse4
> +cglobal saoCuOrgE0, 4, 4, 8, rec, offsetEo, lcuWidth, signLeft
> +
> + neg r3 ; r3 = -iSignLeft
> + movd m0, r3d
> + pslldq m0, 15 ; m0 = [iSignLeft x .. x]
> + pcmpeqb m4, m4 ; m4 = [pb -1]
> + pxor m5, m5 ; m5 = 0
> + movu m6, [r1] ; m6 = m_iOffsetEo
> +
> +.loop:
> + movu m7, [r0] ; m1 = pRec[x]
> + mova m1, m7
> + movu m2, [r0+1] ; m2 = pRec[x+1]
> +
> + psubusb m3, m2, m7
> + psubusb m1, m2
> + pcmpeqb m3, m5
> + pcmpeqb m1, m5
> + pcmpeqb m2, m7
> +
> + pabsb m3, m3 ; m1 = (pRec[x] - pRec[x+1]) > 0) ? 1 : 0
> + por m1, m3 ; m1 = iSignRight
> + pandn m2, m1
> +
> + palignr m3, m2, m0, 15 ; m3 = -iSignLeft
> + psignb m3, m4 ; m3 = iSignLeft
> + mova m0, m4
> + pslldq m0, 15
> + pand m0, m2 ; [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1]
> + paddb m2, m3
> + paddb m2, [pw_2] ; m1 = uiEdgeType
> + pshufb m3, m6, m2
> + pmovzxbw m2, m7 ; rec
> + punpckhbw m7, m5
> + pmovsxbw m1, m3 ; iOffsetEo
> + punpckhbw m3, m3
> + psraw m3, 8
> + paddw m2, m1
> + paddw m7, m3
> + packuswb m2, m7
> + movu [r0], m2
> +
> + add r0q, 16
> + sub r2d, 16
> + jnz .loop
> + RET
> diff -r d3e3baaf80b4 -r 520cab36d86c source/common/x86/loopfilter.h
> --- /dev/null Thu Jan 01 00:00:00 1970 +0000
> +++ b/source/common/x86/loopfilter.h Fri Feb 28 12:17:17 2014 +0530
> @@ -0,0 +1,29 @@
> +/*****************************************************************************
> + * Copyright (C) 2013 x265 project
> + *
> + * Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License as published by
> + * the Free Software Foundation; either version 2 of the License, or
> + * (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
> + *
> + * This program is also available under a commercial proprietary license.
> + * For more information, contact us at licensing at multicorewareinc.com.
> + *****************************************************************************/
> +
> +#ifndef X265_LOOPFILTER_H
> +#define X265_LOOPFILTER_H
> +
> +void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
> +
> +#endif // ifndef X265_LOOPFILTER_H
> diff -r d3e3baaf80b4 -r 520cab36d86c source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/test/pixelharness.cpp Fri Feb 28 12:17:17 2014 +0530
> @@ -52,6 +52,8 @@
>
> ibuf1 = (int*)X265_MALLOC(int, bufsize);
>
> + psbuf1 = X265_MALLOC(int8_t, bufsize);
> +
> sbuf1 = (int16_t*)X265_MALLOC(int16_t, bufsize);
> sbuf2 = (int16_t*)X265_MALLOC(int16_t, bufsize);
> sbuf3 = (int16_t*)X265_MALLOC(int16_t, bufsize);
> @@ -62,8 +64,7 @@
> short_test_buff1 = (int16_t**)X265_MALLOC(int16_t*, TEST_CASES);
> int_test_buff = (int**)X265_MALLOC(int*, TEST_CASES);
>
> - if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
> - !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1)
> + if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 || !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !psbuf1)
> {
> fprintf(stderr, "malloc failed, unable to initiate tests!\n");
> exit(1);
> @@ -112,7 +113,7 @@
> sbuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
> sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
> ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
> -
> + psbuf1[i] = (rand() %65) - 32; // range is between -32 to 32
> sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
> }
> }
> @@ -868,7 +869,39 @@
>
> return true;
> }
> +bool PixelHarness::check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt)
> +{
> + ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
> + ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
>
> + int j = 0;
> +
> + for(int i = 0; i < sizeof(ref_dest); i++)
> + {
> + opt_dest[i] = ref_dest[i] = rand() & PIXEL_MAX;
> + }
> +
> + int width = 16 * (rand() % 4 + 1);
> +
> + int8_t sign = rand () % 3;
> + if (sign == 2)
> + {
> + sign = -1;
> + }
> +
> + for (int i = 0; i < ITERS; i++)
> + {
> + ref(ref_dest, psbuf1 + j, width, sign);
> + opt(opt_dest, psbuf1 + j, width, sign);
> +
> + if (memcmp(ref_dest, opt_dest, 64 * 64 * sizeof(pixel)))
> + return false;
> +
> + j += INCR;
> + }
> +
> + return true;
> +}
> bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
> {
> if (opt.satd[part])
> @@ -1251,10 +1284,17 @@
> return false;
> }
> }
> + if (opt.saoCuOrgE0)
> + {
> + if (!check_saoCuOrgE0_t(ref.saoCuOrgE0, opt.saoCuOrgE0))
> + {
> + printf("SAO_EO_0 failed\n");
> + return false;
> + }
> + }
>
> return true;
> }
> -
> void PixelHarness::measurePartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
> {
> ALIGN_VAR_16(int, cres[16]);
> @@ -1531,4 +1571,10 @@
> HEADER0("ssim_end_4");
> REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4);
> }
> +
> + if (opt.saoCuOrgE0)
> + {
> + printf("SAO_EO_0");
> + REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
> + }
> }
> diff -r d3e3baaf80b4 -r 520cab36d86c source/test/pixelharness.h
> --- a/source/test/pixelharness.h Thu Feb 27 16:25:51 2014 +0530
> +++ b/source/test/pixelharness.h Fri Feb 28 12:17:17 2014 +0530
> @@ -34,6 +34,7 @@
> pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4, **pixel_test_buff;
>
> int *ibuf1, **int_test_buff;
> + int8_t *psbuf1;
>
> int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1;
>
> @@ -65,7 +66,7 @@
> bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
> bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
> bool check_addAvg(addAvg_t, addAvg_t);
> -
> + bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
> public:
>
> PixelHarness();
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list