[x265] [PATCH] saoCuOrgB0: asm code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon Jan 5 09:12:43 CET 2015


# HG changeset patch
# User Praveen Tiwari
# Date 1420445547 -19800
# Node ID cbdd7a2c3bf6d00a6b5379fedd0fb4de778e0a30
# Parent  ede9aa255489fe410aced728880765a7b975b10e
saoCuOrgB0: asm code

diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/loopfilter.cpp	Mon Jan 05 13:42:27 2015 +0530
@@ -57,10 +57,31 @@
     }
 }
 
+void processSaoCUB0(pixel* rec, const int8_t* offset, int ctuWidth, int ctuHeight, intptr_t stride)
+{
+    // SAO band offset, C reference: the top SAO_BO_BITS bits of a sample pick its entry in offset[].
+    #define SAO_BO_BITS 5
+    const int bandShift = X265_DEPTH - SAO_BO_BITS;
+    const int maxPixel = (1 << X265_DEPTH) - 1;
+
+    for (int row = 0; row < ctuHeight; row++, rec += stride)
+    {
+        for (int col = 0; col < ctuWidth; col++)
+        {
+            const int sample = rec[col];
+            const int sum = sample + offset[sample >> bandShift];
+
+            // Clamp to [0, maxPixel] before writing the sample back in place.
+            rec[col] = (pixel)(sum < 0 ? 0 : (sum > maxPixel ? maxPixel : sum));
+        }
+    }
+}
+
 namespace x265 {
 void Setup_C_LoopFilterPrimitives(EncoderPrimitives &p)
 {
     p.saoCuOrgE0 = processSaoCUE0;
+    p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
 }
 }
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/primitives.h
--- a/source/common/primitives.h	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/primitives.h	Mon Jan 05 13:42:27 2015 +0530
@@ -191,6 +191,7 @@
 typedef void (*addAvg_t)(const int16_t* src0, const int16_t* src1, pixel* dst, intptr_t src0Stride, intptr_t src1Stride, intptr_t dstStride);
 
 typedef void (*saoCuOrgE0_t)(pixel* rec, int8_t* offsetEo, int width, int8_t signLeft);
+typedef void (*saoCuOrgB0_t)(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 typedef void (*sign_t)(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 typedef void (*planecopy_cp_t) (const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
 typedef void (*planecopy_sp_t) (const uint16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift, uint16_t mask);
@@ -243,6 +244,7 @@
 
     sign_t                sign;
     saoCuOrgE0_t          saoCuOrgE0;
+    saoCuOrgB0_t          saoCuOrgB0;
 
     downscale_t           frameInitLowres;
     cutree_propagate_cost propagateCost;
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Jan 05 13:42:27 2015 +0530
@@ -1648,6 +1648,7 @@
     {
         p.sign = x265_calSign_sse4;
         p.saoCuOrgE0 = x265_saoCuOrgE0_sse4;
+        p.saoCuOrgB0 = x265_saoCuOrgB0_sse4;
 
         LUMA_ADDAVG(_sse4);
         CHROMA_ADDAVG(_sse4);
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/x86/loopfilter.asm
--- a/source/common/x86/loopfilter.asm	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/x86/loopfilter.asm	Mon Jan 05 13:42:27 2015 +0530
@@ -28,7 +28,8 @@
 %include "x86inc.asm"
 
 SECTION_RODATA 32
-
+pb_31:      times 16 db 31
+pb_15:      times 16 db 15
 
 SECTION .text
 cextern pb_1
@@ -84,6 +85,56 @@
     jnz        .loop
     RET
 
+;=====================================================================================
+; void saoCuOrgB0(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride)
+;=====================================================================================
+INIT_XMM sse4
+cglobal saoCuOrgB0, 4, 7, 8       ; prologue loads r0 = rec, r1 = offset table, r2 = width, r3 = height
+
+    mov         r3d, r3m          ; r3d = height (row counter)
+    mov         r4d, r4m          ; r4d = stride; NOTE(review): only 32 bits are loaded although the C prototype declares intptr_t - confirm stride is never negative
+
+    shr         r2d, 4            ; r2d = width / 16 (one .loopW pass handles 16 pixels)
+    movu        m3, [r1 + 0]      ; offset[0-15]
+    movu        m4, [r1 + 16]     ; offset[16-31]
+    pxor        m7, m7            ; m7 =[0]
+.loopH                            ; once per row
+    mov         r5d, r2d          ; r5d = 16-pixel groups left in this row
+    xor         r6,  r6           ; r6 = byte offset into the current row
+
+.loopW                            ; once per 16 pixels
+    movu        m2, [r0 + r6]     ; m2 = [rec]
+    psrlw       m1, m2, 3         ; rec >> 3 on word lanes; bits that leak in from the neighbouring byte are masked off next
+    pand        m1, [pb_31]       ; m1 = [index]
+    pcmpgtb     m0, m1, [pb_15]   ; m0 = [mask], set where index > 15
+
+    pshufb      m6, m3, m1        ; candidate taken from offset[0-15]
+    pshufb      m5, m4, m1        ; candidate taken from offset[16-31]
+
+    pblendvb    m6, m6, m5, m0    ; m6 = mask ? m5 : m6.  NOTE: don't use 3 parameters style, x264 macro has a bug!
+
+    pmovzxbw    m1, m2            ; rec, pixels 0-7 zero-extended to words
+    punpckhbw   m2, m7            ; rec, pixels 8-15 zero-extended to words
+
+    pmovsxbw    m0, m6            ; selected offsets for pixels 0-7, sign-extended to words
+    punpckhbw   m6, m6            ; duplicate the upper 8 offset bytes into both halves of each word ...
+    psraw       m6, 8             ; ... then shift arithmetically: sign-extended offsets for pixels 8-15
+
+    paddw       m1, m0            ; rec + offset, pixels 0-7
+    paddw       m2, m6            ; rec + offset, pixels 8-15
+    packuswb    m1, m2            ; back to bytes with unsigned saturation (clamps to 0..255)
+
+    movu        [r0 + r6], m1     ; store the 16 filtered pixels in place
+    add         r6d, 16
+    dec         r5d
+    jnz         .loopW
+
+    lea         r0, [r0 + r4]     ; rec += stride
+
+    dec         r3d
+    jnz         .loopH
+    RET
+
 ;============================================================================================================
 ; void calSign(int8_t *dst, const Pixel *src1, const Pixel *src2, const int endX)
 ;============================================================================================================
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/common/x86/loopfilter.h
--- a/source/common/x86/loopfilter.h	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/common/x86/loopfilter.h	Mon Jan 05 13:42:27 2015 +0530
@@ -2,6 +2,7 @@
  * Copyright (C) 2013 x265 project
  *
  * Authors: Dnyaneshwar Gorade <dnyaneshwar at multicorewareinc.com>
+ *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -25,6 +26,7 @@
 #define X265_LOOPFILTER_H
 
 void x265_saoCuOrgE0_sse4(pixel * rec, int8_t * offsetEo, int endX, int8_t signLeft);
+void x265_saoCuOrgB0_sse4(pixel* rec, const int8_t* offsetBo, int ctuWidth, int ctuHeight, intptr_t stride);
 void x265_calSign_sse4(int8_t *dst, const pixel *src1, const pixel *src2, const int endX);
 
 #endif // ifndef X265_LOOPFILTER_H
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/encoder/sao.cpp
--- a/source/encoder/sao.cpp	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/encoder/sao.cpp	Mon Jan 05 13:42:27 2015 +0530
@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve at borho.org>
  *          Min Chen <chenm003 at 163.com>
+ *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -73,7 +74,6 @@
     m_param = NULL;
     m_clipTable = NULL;
     m_clipTableBase = NULL;
-    m_offsetBo = NULL;
     m_tmpU1[0] = NULL;
     m_tmpU1[1] = NULL;
     m_tmpU1[2] = NULL;
@@ -107,7 +107,6 @@
     int numCtu = m_numCuInWidth * m_numCuInHeight;
 
     CHECKED_MALLOC(m_clipTableBase,  pixel, maxY + 2 * rangeExt);
-    CHECKED_MALLOC(m_offsetBo,       pixel, maxY + 2 * rangeExt);
 
     CHECKED_MALLOC(m_tmpL1, pixel, g_maxCUSize + 1);
     CHECKED_MALLOC(m_tmpL2, pixel, g_maxCUSize + 1);
@@ -145,7 +144,6 @@
 void SAO::destroy()
 {
     X265_FREE(m_clipTableBase);
-    X265_FREE(m_offsetBo);
 
     X265_FREE(m_tmpL1);
     X265_FREE(m_tmpL2);
@@ -443,16 +441,31 @@
     }
     case SAO_BO:
     {
-        const pixel* offsetBo = m_offsetBo;
+        const int8_t* offsetBo = m_offsetBo;
 
-        for (y = 0; y < ctuHeight; y++)
+        if (ctuWidth & 15)
         {
-            for (x = 0; x < ctuWidth; x++)
-                rec[x] = offsetBo[rec[x]];
-
-            rec += stride;
+            #define SAO_BO_BITS 5
+            const int boShift = X265_DEPTH - SAO_BO_BITS;
+            int x, y;
+            for (y = 0; y < ctuHeight; y++)
+            {
+                for (x = 0; x < ctuWidth; x++)
+                {
+                     int val = rec[x] + offsetBo[rec[x] >> boShift];
+                     if (val < 0)
+                         val = 0;
+                     else if (val > ((1 << X265_DEPTH) - 1))
+                         val = ((1 << X265_DEPTH) - 1);
+                     rec[x] = (pixel)val;
+                }
+                rec += stride;
+            }
         }
-
+        else
+        {
+            primitives.saoCuOrgB0(rec, offsetBo, ctuWidth, ctuHeight, stride);
+        }
         break;
     }
     default: break;
@@ -495,8 +508,6 @@
 
     memcpy(m_tmpU2[plane], rec, sizeof(pixel) * picWidth);
 
-    const int boShift = X265_DEPTH - SAO_BO_BITS;
-
     for (int idxX = 0; idxX < m_numCuInWidth; idxX++)
     {
         addr = idxY * m_numCuInWidth + idxX;
@@ -510,15 +521,10 @@
             {
                 if (typeIdx == SAO_BO)
                 {
-                    pixel* offsetBo = m_offsetBo;
-                    int offset[SAO_NUM_BO_CLASSES];
-                    memset(offset, 0, sizeof(offset));
+                    memset(m_offsetBo, 0, sizeof(m_offsetBo));
 
                     for (int i = 0; i < SAO_NUM_OFFSET; i++)
-                        offset[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = ctuParam[addr].offset[i] << SAO_BIT_INC;
-
-                    for (int i = 0; i < (1 << X265_DEPTH); i++)
-                        offsetBo[i] = m_clipTable[i + offset[i >> boShift]];
+                        m_offsetBo[((ctuParam[addr].bandPos + i) & (SAO_NUM_BO_CLASSES - 1))] = (int8_t)(ctuParam[addr].offset[i] << SAO_BIT_INC);
                 }
                 else // if (typeIdx == SAO_EO_0 || typeIdx == SAO_EO_1 || typeIdx == SAO_EO_2 || typeIdx == SAO_EO_3)
                 {
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/encoder/sao.h
--- a/source/encoder/sao.h	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/encoder/sao.h	Mon Jan 05 13:42:27 2015 +0530
@@ -3,6 +3,7 @@
  *
  * Authors: Steve Borho <steve at borho.org>
  *          Min Chen <chenm003 at 163.com>
+ *          Praveen Kumar Tiwari <praveen at multicorewareinc.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -77,7 +78,7 @@
     PerPlane*   m_offsetOrgPreDblk;
 
     double      m_depthSaoRate[2][4];
-    pixel*      m_offsetBo;
+    int8_t      m_offsetBo[SAO_NUM_BO_CLASSES];
     int8_t      m_offsetEo[NUM_EDGETYPE];
 
     int         m_numCuInWidth;
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/test/pixelharness.cpp	Mon Jan 05 13:42:27 2015 +0530
@@ -997,6 +997,36 @@
     return true;
 }
 
+bool PixelHarness::check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt)
+{
+    // Runs ref and opt on identically initialised 64x64 buffers; any byte mismatch fails the check.
+    ALIGN_VAR_16(pixel, ref_dest[64 * 64]);
+    ALIGN_VAR_16(pixel, opt_dest[64 * 64]);
+
+    memset(ref_dest, 0xCD, sizeof(ref_dest));
+    memset(opt_dest, 0xCD, sizeof(opt_dest));
+
+    // offsetIdx advances through psbuf1 so each iteration reads its offsets from a new position.
+    for (int iter = 0, offsetIdx = 0; iter < ITERS; iter++, offsetIdx += INCR)
+    {
+        // Width is a multiple of 16 in 16..64; rand() is consumed as width, height, stride.
+        const int width = 16 * (rand() % 4 + 1);
+        const int height = rand() % 64 + 1;
+        const int stride = rand() % 65;
+
+        ref(ref_dest, psbuf1 + offsetIdx, width, height, stride);
+        checked(opt, opt_dest, psbuf1 + offsetIdx, width, height, stride);
+
+        if (memcmp(ref_dest, opt_dest, sizeof(ref_dest)) != 0)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
+
 bool PixelHarness::testPartition(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.satd[part])
@@ -1431,6 +1461,15 @@
         }
     }
 
+    if (opt.saoCuOrgB0)
+    {
+        if (!check_saoCuOrgB0_t(ref.saoCuOrgB0, opt.saoCuOrgB0))
+        {
+            printf("SAO_BO_0 failed\n");
+            return false;
+        }
+    }
+
     if (opt.planecopy_sp)
     {
         if (!check_planecopy_sp(ref.planecopy_sp, opt.planecopy_sp))
@@ -1760,6 +1799,12 @@
         REPORT_SPEEDUP(opt.saoCuOrgE0, ref.saoCuOrgE0, pbuf1, psbuf1, 64, 1);
     }
 
+    if (opt.saoCuOrgB0)
+    {
+        HEADER0("SAO_BO_0");
+        REPORT_SPEEDUP(opt.saoCuOrgB0, ref.saoCuOrgB0, pbuf1, psbuf1, 64, 64, 64);
+    }
+
     if (opt.planecopy_sp)
     {
         HEADER0("planecopy_sp");
diff -r ede9aa255489 -r cbdd7a2c3bf6 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Mon Jan 05 10:57:01 2015 +0530
+++ b/source/test/pixelharness.h	Mon Jan 05 13:42:27 2015 +0530
@@ -90,6 +90,7 @@
     bool check_ssim_end(ssim_end4_t ref, ssim_end4_t opt);
     bool check_addAvg(addAvg_t, addAvg_t);
     bool check_saoCuOrgE0_t(saoCuOrgE0_t ref, saoCuOrgE0_t opt);
+    bool check_saoCuOrgB0_t(saoCuOrgB0_t ref, saoCuOrgB0_t opt);
     bool check_planecopy_sp(planecopy_sp_t ref, planecopy_sp_t opt);
     bool check_planecopy_cp(planecopy_cp_t ref, planecopy_cp_t opt);
     bool check_psyCost_pp(pixelcmp_t ref, pixelcmp_t opt);


More information about the x265-devel mailing list