[x265] [PATCH] asm: split SAO_EO_0 into separate primitive func, added assembly code and testbench support
    dnyaneshwar at multicorewareinc.com 
    dnyaneshwar at multicorewareinc.com
       
    Thu Feb 27 14:00:18 CET 2014
    
    
  
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1393505529 -19800
#      Thu Feb 27 18:22:09 2014 +0530
# Node ID 33cab8f8f6c25cb5a16b2aee8d26a65f91bc156e
# Parent  c9a0802b64aca46509b55d134810cd1b87cd929b
asm: split SAO_EO_0 into separate primitive func, added assembly code and testbench support
added loopfilter.cpp, loopfilter.h, loopfilter.asm files for C and assembly code
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.cpp	Thu Feb 27 18:22:09 2014 +0530
@@ -45,6 +45,8 @@
 //! \ingroup TLibCommon
 //! \{
 
+int g_bitDepthY = 8;
+
 SAOParam::~SAOParam()
 {
     for (int i = 0; i < 3; i++)
@@ -535,8 +537,6 @@
     uint32_t tpely     = tmpCu->getCUPelY();
     uint32_t rpelx;
     uint32_t bpely;
-    int  signLeft;
-    int  signRight;
     int  signDown;
     int  signDown1;
     int  signDown2;
@@ -614,23 +614,60 @@
     {
     case SAO_EO_0: // dir: -
     {
-        startX = (lpelx == 0) ? 1 : 0;
-        endX   = (rpelx == picWidthTmp) ? lcuWidth - 1 : lcuWidth;
-        for (y = 0; y < lcuHeight; y++)
-        {
-            signLeft = xSign(rec[startX] - tmpL[y]);
-            for (x = startX; x < endX; x++)
-            {
-                signRight =  xSign(rec[x] - rec[x + 1]);
-                edgeType =  signRight + signLeft + 2;
-                signLeft  = -signRight;
+      pixel firstPxl = 0, lastPxl = 0;
 
-                rec[x] = clipTbl[rec[x] + m_offsetEo[edgeType]];
-            }
+      startX = (lpelx == 0) ? 1 : 0;
+      endX   = (rpelx == picWidthTmp) ? lcuWidth-1 : lcuWidth;
 
-            rec += stride;
-        }
+      if (lcuWidth % 16)
+      {
+          int8_t iSignRight;
+          int8_t uiEdgeType;
 
+          for (y = 0; y < lcuHeight; y++)
+          {
+              int8_t iSignLeft = xSign(rec[startX] - tmpL[y]);
+              for (x = startX; x < endX; x++)
+              {
+                  iSignRight = xSign(rec[x] - rec[x+1]);
+                  uiEdgeType = iSignRight + iSignLeft + 2;
+                  iSignLeft  = -iSignRight;
+
+                  rec[x] =  Clip3(0, (1 << g_bitDepthY) - 1, rec[x] + m_offsetEo[uiEdgeType]);
+              }
+              rec += stride;
+          }
+      }
+      else
+      {
+          for (y = 0; y < lcuHeight; y++)
+          {
+              int8_t iSignLeft = xSign(rec[startX] - tmpL[y]);
+
+              if (lpelx == 0)
+              {
+                  firstPxl = rec[0];
+              }
+
+              if (rpelx == picWidthTmp)
+              {
+                  lastPxl = rec[lcuWidth - 1];
+              }
+
+              primitives.processSaoCuOrg_8bit_SAO_EO_0(rec, m_offsetEo, lcuWidth, iSignLeft);
+
+              if (lpelx == 0)
+              {
+                  rec[0] = firstPxl;
+              }
+
+              if (rpelx == picWidthTmp)
+              {
+                  rec[lcuWidth - 1] = lastPxl;
+              }
+              rec += stride;
+          }
+      }
         break;
     }
     case SAO_EO_1: // dir: |
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/Lib/TLibCommon/TComSampleAdaptiveOffset.h
--- a/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/Lib/TLibCommon/TComSampleAdaptiveOffset.h	Thu Feb 27 18:22:09 2014 +0530
@@ -146,7 +146,7 @@
 
     int32_t *m_offsetBo;
     int32_t *m_chromaOffsetBo;
-    int m_offsetEo[LUMA_GROUP_NUM];
+    int8_t m_offsetEo[LUMA_GROUP_NUM];
 
     int  m_picWidth;
     int  m_picHeight;
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/Lib/TLibCommon/loopfilter.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/Lib/TLibCommon/loopfilter.cpp	Thu Feb 27 18:22:09 2014 +0530
@@ -0,0 +1,51 @@
+/*****************************************************************************
+* Copyright (C) 2013 x265 project
+*
+* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+*          Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+*
+* This program is also available under a commercial proprietary license.
+* For more information, contact us at licensing at multicorewareinc.com.
+*****************************************************************************/
+
+#include "primitives.h"
+
+#define PIXEL_MIN 0
+#define PIXEL_MAX ((1 << 8) - 1)
+
+void SAO_EO_0_C(pixel * pRec, int8_t * m_iOffsetEo, int iLcuWidth, int8_t iSignLeft)
+{ // C version of the SAO_EO_0 row filter: adds an edge-class offset to pRec[0 .. iLcuWidth-1] in place
+    int x;
+    int8_t iSignRight; // sign (-1, 0, 1) of pRec[x] - pRec[x+1]
+    int8_t uiEdgeType; // index into m_iOffsetEo; lies in 0..4 when iSignLeft is -1, 0 or 1
+
+    for (x = 0; x < iLcuWidth; x++)
+    {
+        iSignRight = ((pRec[x] - pRec[x+1]) < 0) ? -1 : ((pRec[x] - pRec[x+1]) > 0) ? 1 : 0; // reads pRec[x+1], so pRec[iLcuWidth] must be readable
+        uiEdgeType = iSignRight + iSignLeft + 2;
+        iSignLeft  = -iSignRight; // left sign of the next pixel is the negated right sign of this one
+
+        short v = pRec[x] + m_iOffsetEo[uiEdgeType];
+        pRec[x] = (v < 0 ? 0 : (v > (PIXEL_MAX)) ? (PIXEL_MAX) : v); // clamp to [0, PIXEL_MAX] before storing
+    }
+}
+
+namespace x265 {
+void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p) // fills in the loop-filter entries of p with the C functions from this file
+{
+    p.processSaoCuOrg_8bit_SAO_EO_0 = SAO_EO_0_C; // C implementation of the SAO_EO_0 row filter
+}
+}
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/common/CMakeLists.txt	Thu Feb 27 18:22:09 2014 +0530
@@ -38,7 +38,8 @@
     ../Lib/TLibCommon/TComSlice.cpp
     ../Lib/TLibCommon/TComTrQuant.cpp
     ../Lib/TLibCommon/TComWeightPrediction.cpp
-    ../Lib/TLibCommon/TComYuv.cpp)
+    ../Lib/TLibCommon/TComYuv.cpp
+    ../Lib/TLibCommon/loopfilter.cpp)
 source_group(TLibCommon FILES ${LIBCOMMON_SRC})
 source_group(TLibCommonH FILES ${LIBCOMMON_HDR})
 
@@ -102,14 +103,14 @@
 
 if(ENABLE_ASSEMBLY)
     set_source_files_properties(primitives.cpp PROPERTIES COMPILE_FLAGS -DENABLE_ASSEMBLY=1)
-    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h)
+    set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h dct8.h loopfilter.h)
     set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm ssd-a.asm mc-a.asm
                mc-a2.asm pixel-util8.asm blockcopy8.asm
                pixeladd8.asm dct8.asm)
     if(HIGH_BIT_DEPTH)
         set(A_SRCS ${A_SRCS} sad16-a.asm intrapred16.asm ipfilter16.asm)
     else()
-        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm)
+        set(A_SRCS ${A_SRCS} sad-a.asm intrapred8.asm ipfilter8.asm loopfilter.asm)
     endif()
 
     if(NOT X64)
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/primitives.cpp
--- a/source/common/primitives.cpp	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/common/primitives.cpp	Thu Feb 27 18:22:09 2014 +0530
@@ -63,6 +63,7 @@
 void Setup_C_DCTPrimitives(EncoderPrimitives &p);
 void Setup_C_IPFilterPrimitives(EncoderPrimitives &p);
 void Setup_C_IPredPrimitives(EncoderPrimitives &p);
+void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p);
 
 void Setup_C_Primitives(EncoderPrimitives &p)
 {
@@ -70,6 +71,7 @@
     Setup_C_DCTPrimitives(p);        // dct.cpp
     Setup_C_IPFilterPrimitives(p);   // ipfilter.cpp
     Setup_C_IPredPrimitives(p);      // intrapred.cpp
+    Setup_C_LoopFilterPrimitives(p); // loopfilter.cpp
 }
 }
 
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/primitives.h
--- a/source/common/primitives.h	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/common/primitives.h	Thu Feb 27 18:22:09 2014 +0530
@@ -188,6 +188,8 @@
 
 typedef void (*addAvg_t)(int16_t* src0, int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
+typedef void (*processSaoCuOrg_8bit_t)(pixel * pRec, int8_t * m_iOffsetEo, int iLcuWidth, int8_t iSignLeft);
+
 /* Define a structure containing function pointers to optimized encoder
  * primitives.  Each pointer can reference either an assembly routine,
  * a vectorized primitive, or a C function. */
@@ -255,6 +257,9 @@
     plane_copy_deinterleave_t plane_copy_deinterleave_c;
     extendCURowBorder_t extendRowBorder;
 
+    // sao primitives
+    processSaoCuOrg_8bit_t      processSaoCuOrg_8bit_SAO_EO_0;
+
     struct
     {
         filter_pp_t     filter_vpp[NUM_LUMA_PARTITIONS];
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp	Thu Feb 27 18:22:09 2014 +0530
@@ -32,6 +32,7 @@
 #include "pixel-util.h"
 #include "mc.h"
 #include "ipfilter8.h"
+#include "loopfilter.h"
 #include "blockcopy8.h"
 #include "intrapred.h"
 #include "dct8.h"
@@ -1258,6 +1259,8 @@
     }
     if (cpuMask & X265_CPU_SSE4)
     {
+        p.processSaoCuOrg_8bit_SAO_EO_0 = x265_SAO_EO_0_sse4;
+
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
         p.cvt16to32_shl = x265_cvt16to32_shl_sse4;
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/x86/loopfilter.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/loopfilter.asm	Thu Feb 27 18:22:09 2014 +0530
@@ -0,0 +1,81 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Praveen Kumar Tiwari <praveen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+
+SECTION_RODATA 32
+
+pw_2:    times 16 db  2
+
+SECTION .text
+
+;============================================================================================================
+; void SAO_EO_0(pixel * pRec, int8_t * m_iOffsetEo, int iLcuWidth, int8_t iSignLeft)
+;============================================================================================================
+INIT_XMM sse4
+cglobal SAO_EO_0, 4, 4, 8, pRec, m_iOffsetEo, iLcuWidth, iSignLeft
+
+    neg         r3                 ; r3 = -iSignLeft
+    movd        m0,    r3d         ; m0 byte 0 = -iSignLeft
+    pslldq      m0,    15          ; m0 = [pb 0 .. 0, -iSignLeft] (carry kept in the top byte)
+    pcmpeqb     m4,    m4          ; m4 = [pb -1]
+    pxor        m5,    m5          ; m5 = 0
+    movu        m6,    [r1]        ; m6 = 16 bytes loaded from m_iOffsetEo
+
+.loop:
+    movu        m7,    [r0]                    ; m7 = pRec[x .. x+15]
+    mova        m1,    m7                      ; m1 = copy of pRec[x]
+    movu        m2,    [r0+1]                  ; m2 = pRec[x+1]
+
+    psubusb     m3,    m2,    m7               ; m3 = saturating (pRec[x+1] - pRec[x])
+    psubusb     m1,    m2                      ; m1 = saturating (pRec[x] - pRec[x+1])
+    pcmpeqb     m3,    m5                      ; m3 = (pRec[x] >= pRec[x+1]) ? -1 : 0
+    pcmpeqb     m1,    m5                      ; m1 = (pRec[x] <= pRec[x+1]) ? -1 : 0
+    pcmpeqb     m2,    m7                      ; m2 = (pRec[x] == pRec[x+1]) ? -1 : 0
+
+    pabsb       m3,    m3                      ; m3 = (pRec[x] >= pRec[x+1]) ? 1 : 0
+    por         m1,    m3                      ; m1 = iSignRight, except equal lanes hold -1
+    pandn       m2, m1                         ; m2 = iSignRight (equal lanes cleared to 0)
+
+    palignr     m3,    m2,        m0,    15    ; m3 = -iSignLeft (carry byte, then iSignRight of lanes 0..14)
+    psignb      m3,    m4                      ; m3 = iSignLeft
+    mova        m0, m4
+    pslldq      m0, 15                         ; m0 = [pb 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1]
+    pand        m0,    m2                      ; m0 = iSignRight of lane 15 only: the carry for the next 16 pixels
+    paddb       m2,    m3                      ; m2 = iSignRight + iSignLeft
+    paddb       m2,    [pw_2]                  ; m2 = uiEdgeType (pw_2 is sixteen bytes of 2 despite its pw_ name)
+    pshufb      m3,    m6,        m2           ; m3 = m_iOffsetEo[uiEdgeType]
+    pmovzxbw    m2,    m7                      ; m2 = pRec[x .. x+7] zero-extended to words
+    punpckhbw   m7,    m5                      ; m7 = pRec[x+8 .. x+15] zero-extended to words
+    pmovsxbw    m1,    m3                      ; m1 = low 8 offsets sign-extended to words
+    punpckhbw   m3,    m3
+    psraw       m3,    8                       ; m3 = high 8 offsets sign-extended to words
+    paddw       m2,    m1
+    paddw       m7,    m3
+    packuswb    m2,    m7                      ; saturate sums to [0, 255] and repack to bytes
+    movu        [r0],  m2
+
+    add         r0q,   16
+    sub         r2d,   16                      ; 16 pixels are processed per pass
+    jnz        .loop                           ; leaves the loop only when the remaining width is exactly 0
+    RET
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/common/x86/loopfilter.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/loopfilter.h	Thu Feb 27 18:22:09 2014 +0530
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * Copyright (C) 2013 x265 project
+ *
+ * Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at multicorewareinc.com.
+ *****************************************************************************/
+
+#ifndef X265_LOOPFILTER_H
+#define X265_LOOPFILTER_H
+
+void x265_SAO_EO_0_sse4(pixel * pRec, int8_t * m_iOffsetEo, int iLcuWidth, int8_t iSignLeft); // SSE4 SAO_EO_0 row filter; parameters match processSaoCuOrg_8bit_t
+
+#endif // ifndef X265_LOOPFILTER_H
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/test/pixelharness.cpp	Thu Feb 27 18:22:09 2014 +0530
@@ -50,6 +50,8 @@
     pbuf3 = X265_MALLOC(pixel, bufsize);
     pbuf4 = X265_MALLOC(pixel, bufsize);
 
+    psbuf1 = (int8_t*)X265_MALLOC(int8_t, bufsize);
+
     ibuf1 = X265_MALLOC(int, bufsize);
 
     sbuf1 = X265_MALLOC(int16_t, bufsize);
@@ -63,7 +65,7 @@
     short_test_buff2 = X265_MALLOC(int16_t*, TEST_CASES);
     int_test_buff    = X265_MALLOC(int*, TEST_CASES);
     if (!pbuf1 || !pbuf2 || !pbuf3 || !pbuf4 || !sbuf1 || !sbuf2 || !sbuf3 || !ibuf1 ||
-        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2)
+        !pixel_test_buff || !short_test_buff || !int_test_buff || !short_test_buff1 || !short_test_buff2 || !psbuf1)
     {
         fprintf(stderr, "malloc failed, unable to initiate tests!\n");
         exit(1);
@@ -114,6 +116,7 @@
         sbuf2[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1; //max(SHORT_MIN, min(rand(), SMAX));
         ibuf1[i] = (rand() % (2 * SMAX + 1)) - SMAX - 1;
 
+        psbuf1[i] = (rand() %65) - 32;
         sbuf3[i] = rand() % PIXEL_MAX; // for blockcopy only
     }
 }
@@ -869,6 +872,39 @@
     return true;
 }
 
+bool PixelHarness::check_SAO_EO_0_C_8bit_t(processSaoCuOrg_8bit_t ref, processSaoCuOrg_8bit_t opt)
+{
+    // Runs ref and opt on identical random pixels; returns false as soon as their outputs differ.
+    const int numPixels = 64 * 64;
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+    int j = 0;
+
+    // Bound on the element count, not sizeof(): a byte count overruns the arrays if pixel is wider than one byte.
+    for (int i = 0; i < numPixels; i++)
+    {
+        opt_dest[i] = ref_dest[i] = rand() & PIXEL_MAX;
+    }
+
+    // Width is 16, 32, 48 or 64; both primitives also read pRec[width], which is still inside the buffers.
+    int width = 16 * (rand() % 4 + 1);
+    // The left-neighbour sign is drawn from {-1, 0, 1}.
+    int8_t sign = rand() % 3;
+    if (sign == 2)
+        sign = -1;
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        ref(ref_dest, psbuf1 + j, width, sign);
+        opt(opt_dest, psbuf1 + j, width, sign);
+
+        if (memcmp(ref_dest, opt_dest, numPixels * sizeof(pixel)))
+            return false;
+        j += INCR;
+    }
+
+    return true;
+}
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.satd[part])
@@ -1252,6 +1288,15 @@
         }
     }
 
+    if (opt.processSaoCuOrg_8bit_SAO_EO_0)
+    {
+      if (!check_SAO_EO_0_C_8bit_t(ref.processSaoCuOrg_8bit_SAO_EO_0, opt.processSaoCuOrg_8bit_SAO_EO_0))
+      {
+        printf("SAO_EO_0 failed\n");
+        return false;
+      }
+    }
+
     return true;
 }
 
@@ -1531,4 +1576,10 @@
         HEADER0("ssim_end_4");
         REPORT_SPEEDUP(opt.ssim_end_4, ref.ssim_end_4, (int(*)[4])pbuf2, (int(*)[4])pbuf1, 4);
     }
+
+    if (opt.processSaoCuOrg_8bit_SAO_EO_0)
+    {
+        printf("SAO_EO_0");
+        REPORT_SPEEDUP(opt.processSaoCuOrg_8bit_SAO_EO_0, ref.processSaoCuOrg_8bit_SAO_EO_0, pbuf1, psbuf1, 64, 1);
+    }
 }
diff -r c9a0802b64ac -r 33cab8f8f6c2 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Wed Feb 26 22:16:28 2014 -0600
+++ b/source/test/pixelharness.h	Thu Feb 27 18:22:09 2014 +0530
@@ -32,6 +32,7 @@
 protected:
 
     pixel *pbuf1, *pbuf2, *pbuf3, *pbuf4, **pixel_test_buff;
+    int8_t *psbuf1;
     int *ibuf1, **int_test_buff;
     int16_t *sbuf1, *sbuf2, *sbuf3, **short_test_buff, **short_test_buff1, **short_test_buff2;
     bool check_pixelcmp(pixelcmp_t ref, pixelcmp_t opt);
@@ -62,6 +63,7 @@
     bool check_ssim_4x4x2_core(ssim_4x4x2_core_t ref, ssim_4x4x2_core_t opt);
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
     bool check_addAvg(addAvg_t, addAvg_t);
+    bool check_SAO_EO_0_C_8bit_t(processSaoCuOrg_8bit_t ref, processSaoCuOrg_8bit_t opt);
 
 public:
 
    
    
More information about the x265-devel
mailing list