[x265] [PATCH] asm: assembly code for IntraPred_DC[4x4]

Min Chen chenm003 at 163.com
Wed Nov 20 05:47:42 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384922749 -28800
# Node ID 400ab5fa31730fe395e981e45e54a051a6651fbf
# Parent  17e5d27ae03452ef9d6c0a8adf26e6c6a93d6751
asm: assembly code for IntraPred_DC[4x4]

diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibCommon/TComPrediction.cpp
--- a/source/Lib/TLibCommon/TComPrediction.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/Lib/TLibCommon/TComPrediction.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -130,7 +130,7 @@
     assert(g_convertToBit[size] >= 0);   //   4x  4
     assert(g_convertToBit[size] <= 5);   // 128x128
 
-    char log2BlkSize = g_convertToBit[size] + 2;
+    int log2BlkSize = g_convertToBit[size] + 2;
 
     Pel *src = m_predBuf;
     assert(log2BlkSize >= 2 && log2BlkSize < 7);
@@ -164,7 +164,7 @@
     }
     else if (dirMode == DC_IDX)
     {
-        primitives.intra_pred_dc((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, size, bFilter);
+        primitives.intra_pred_dc[log2BlkSize - 2]((pixel*)refAbv + 1, (pixel*)refLft + 1, (pixel*)dst, stride, bFilter);
     }
     else
     {
@@ -175,6 +175,8 @@
 // Angular chroma
 void TComPrediction::predIntraChromaAng(Pel* src, uint32_t dirMode, Pel* dst, uint32_t stride, int width)
 {
+    int log2BlkSize = g_convertToBit[width];
+
     // Create the prediction
     Pel refAbv[3 * MAX_CU_SIZE];
     Pel refLft[3 * MAX_CU_SIZE];
@@ -193,7 +195,7 @@
     }
     else if (dirMode == DC_IDX)
     {
-        primitives.intra_pred_dc(refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, width, false);
+        primitives.intra_pred_dc[log2BlkSize](refAbv + width - 1 + 1, refLft + width - 1 + 1, dst, stride, false);
     }
     else
     {
diff -r 17e5d27ae034 -r 400ab5fa3173 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -1622,7 +1622,7 @@
             pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
 
             // DC
-            primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
+            primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
             modeCosts[DC_IDX] = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
 
             Pel *abovePlanar   = above;
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/CMakeLists.txt
--- a/source/common/CMakeLists.txt	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/CMakeLists.txt	Wed Nov 20 12:45:49 2013 +0800
@@ -113,7 +113,7 @@
 
 if(ENABLE_PRIMITIVES_ASM)
     set(C_SRCS asm-primitives.cpp pixel.h mc.h ipfilter8.h blockcopy8.h)
-    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm)
+    set(A_SRCS pixel-a.asm const-a.asm cpu-a.asm sad-a.asm mc-a.asm mc-a2.asm ipfilter8.asm pixel-util.asm blockcopy8.asm intrapred.asm)
     if (NOT X64)
         set(A_SRCS ${A_SRCS} pixel-32.asm)
     endif()
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/intrapred.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -80,7 +80,8 @@
     }
 }
 
-void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter)
+template<int width>
+void PredIntraDC(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter)
 {
     int k, l;
     int blkSize = width;
@@ -300,7 +301,10 @@
 
 void Setup_C_IPredPrimitives(EncoderPrimitives& p)
 {
-    p.intra_pred_dc = PredIntraDC;
+    p.intra_pred_dc[BLOCK_4x4] = PredIntraDC<4>;
+    p.intra_pred_dc[BLOCK_8x8] = PredIntraDC<8>;
+    p.intra_pred_dc[BLOCK_16x16] = PredIntraDC<16>;
+    p.intra_pred_dc[BLOCK_32x32] = PredIntraDC<32>;
     p.intra_pred_planar = PredIntraPlanar;
     p.intra_pred_ang = PredIntraAngBufRef;
     p.intra_pred_allangs[0] = PredIntraAngs_C<4>;
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/primitives.h
--- a/source/common/primitives.h	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/primitives.h	Wed Nov 20 12:45:49 2013 +0800
@@ -177,7 +177,7 @@
 typedef void (*pixelavg_pp_t)(pixel *dst, intptr_t dstride, pixel *src0, intptr_t sstride0, pixel *src1, intptr_t sstride1, int weight);
 typedef void (*blockfill_s_t)(int16_t *dst, intptr_t dstride, int16_t val);
 
-typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int bFilter);
+typedef void (*intra_dc_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int bFilter);
 typedef void (*intra_planar_t)(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width);
 typedef void (*intra_ang_t)(pixel* dst, int dstStride, int width, int dirMode, bool bFilter, pixel *refLeft, pixel *refAbove);
 typedef void (*intra_allangs_t)(pixel *dst, pixel *above0, pixel *left0, pixel *above1, pixel *left1, bool bLuma);
@@ -274,7 +274,7 @@
     filter_p2s_t    chroma_p2s;
     extendCURowBorder_t extendRowBorder;
 
-    intra_dc_t      intra_pred_dc;
+    intra_dc_t      intra_pred_dc[NUM_SQUARE_BLOCKS];
     intra_planar_t  intra_pred_planar;
     intra_ang_t     intra_pred_ang;
     intra_allangs_t intra_pred_allangs[NUM_SQUARE_BLOCKS];
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/vec/intra-sse41.cpp
--- a/source/common/vec/intra-sse41.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/vec/intra-sse41.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -102,7 +102,8 @@
     }
 }
 
-void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int width, int filter)
+template<int width>
+void intra_pred_dc(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
 {
     int sum;
     int logSize = g_convertToBit[width] + 2;
@@ -8708,7 +8709,10 @@
     initFileStaticVars();
 
     p.intra_pred_planar = intra_pred_planar;
-    p.intra_pred_dc = intra_pred_dc;
+    p.intra_pred_dc[BLOCK_4x4] = intra_pred_dc<4>;
+    p.intra_pred_dc[BLOCK_8x8] = intra_pred_dc<8>;
+    p.intra_pred_dc[BLOCK_16x16] = intra_pred_dc<16>;
+    p.intra_pred_dc[BLOCK_32x32] = intra_pred_dc<32>;
 
 #if defined(__GNUC__) || defined(__INTEL_COMPILER) || (defined(_MSC_VER) && (_MSC_VER == 1500))
     p.intra_pred_allangs[0] = predIntraAngs4;
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -634,6 +634,7 @@
         p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse4;
         p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse4;
         p.quant = x265_quant_sse4;
+        p.intra_pred_dc[BLOCK_4x4] = x265_intra_pred_dc4_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/intrapred.asm
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/source/common/x86/intrapred.asm	Wed Nov 20 12:45:49 2013 +0800
@@ -0,0 +1,95 @@
+;*****************************************************************************
+;* Copyright (C) 2013 x265 project
+;*
+;* Authors: Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
+;*
+;* This program is free software; you can redistribute it and/or modify
+;* it under the terms of the GNU General Public License as published by
+;* the Free Software Foundation; either version 2 of the License, or
+;* (at your option) any later version.
+;*
+;* This program is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;* GNU General Public License for more details.
+;*
+;* You should have received a copy of the GNU General Public License
+;* along with this program; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+;*
+;* This program is also available under a commercial proprietary license.
+;* For more information, contact us at licensing at multicorewareinc.com.
+;*****************************************************************************/
+
+%include "x86inc.asm"
+%include "x86util.asm"
+
+SECTION_RODATA 32
+
+
+
+SECTION .text
+
+;-----------------------------------------------------------------------------
+; void intra_pred_dc4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal intra_pred_dc4, 5,6,3       ; 5 args, 6 GPRs, 3 XMM regs (only m0-m2 are used)
+    pxor        m0, m0              ; m0 = 0 (psadbw operand, pshufb broadcast mask)
+    movd        m1, [r0]            ; m1 = above[0..3]
+    movd        m2, [r1]            ; m2 = left[0..3]
+    punpckldq   m1, m2              ; m1 = above[0..3] | left[0..3]
+    psadbw      m1, m0              ; m1 = sum
+
+    test        r4d, r4d            ; ZF = (filter == 0); nothing up to the jz below writes flags
+
+    mov         r4d, 4096
+    movd        m2, r4d
+    pmulhrsw    m1, m2              ; m1 = (sum + 4) / 8
+    movd        r4d, m1             ; r4d = dc_val
+    pshufb      m1, m0              ; m1 = byte [dc_val ...]
+
+    ; store DC 4x4
+    lea         r5, [r3 * 3]
+    movd        [r2], m1
+    movd        [r2 + r3], m1
+    movd        [r2 + r3 * 2], m1
+    movd        [r2 + r5], m1
+
+    ; do DC filter
+    jz          .end                ; filter == 0: the plain DC block is the final result
+    lea         r5d, [r4d * 2 + 2]  ; r5d = DC * 2 + 2
+    add         r4d, r5d            ; r4d = DC * 3 + 2
+    movd        m1, r4d
+    pshuflw     m1, m1, 0           ; m1 = pixDCx3
+
+    ; filter top
+    pmovzxbw    m2, [r0]
+    paddw       m2, m1
+    psraw       m2, 2               ; (above[x] + DC * 3 + 2) >> 2
+    packuswb    m2, m2
+    movd        [r2], m2            ; overwrite top-left pixel, we will update it later
+
+    ; filter top-left
+    movzx       r0d, byte [r0]
+    add         r5d, r0d
+    movzx       r0d, byte [r1]
+    add         r0d, r5d
+    shr         r0d, 2              ; (above[0] + left[0] + DC * 2 + 2) >> 2
+    mov         [r2], r0b
+
+    ; filter left
+    add         r2, r3
+    pmovzxbw    m2, [r1 + 1]
+    paddw       m2, m1
+    psraw       m2, 2               ; (left[y] + DC * 3 + 2) >> 2 for y = 1..3
+    packuswb    m2, m2
+    ; store column 0 of rows 1..3 with pextrb: a high-byte alias (r0h) is not
+    ; available for every r0 mapping and cannot be encoded together with r8+
+    pextrb      [r2], m2, 0
+    pextrb      [r2 + r3], m2, 1
+    pextrb      [r2 + r3 * 2], m2, 2
+
+.end:
+
+    RET
diff -r 17e5d27ae034 -r 400ab5fa3173 source/common/x86/pixel.h
--- a/source/common/x86/pixel.h	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/common/x86/pixel.h	Wed Nov 20 12:45:49 2013 +0800
@@ -365,5 +365,6 @@
 void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
 uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
+void x265_intra_pred_dc4_sse4(pixel* above, pixel* left, pixel* dst, intptr_t dstStride, int filter);
 
 #endif // ifndef X265_I386_PIXEL_H
diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/compress.cpp
--- a/source/encoder/compress.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/encoder/compress.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -145,7 +145,7 @@
     pixelcmp_t sa8d = primitives.sa8d[log2SizeMinus2];
 
     // DC
-    primitives.intra_pred_dc(above + 1, left + 1, tmp, scaleStride, scaleWidth, (scaleWidth <= 16));
+    primitives.intra_pred_dc[log2SizeMinus2](above + 1, left + 1, tmp, scaleStride, (scaleWidth <= 16));
     sad = costMultiplier * sa8d(fenc, scaleStride, tmp, scaleStride);
     bmode = mode = DC_IDX;
     bits  = m_search->xModeBitsIntra(cu, mode, partOffset, depth, initTrDepth);
diff -r 17e5d27ae034 -r 400ab5fa3173 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/encoder/slicetype.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -566,7 +566,7 @@
         int predsize = cuSize * cuSize;
 
         // generate 35 intra predictions into tmp
-        primitives.intra_pred_dc(pAbove0 + 1, pLeft0 + 1, predictions, cuSize, cuSize, (cuSize <= 16));
+        primitives.intra_pred_dc[nLog2SizeMinus2](pAbove0 + 1, pLeft0 + 1, predictions, cuSize, (cuSize <= 16));
         pixel *above = (cuSize >= 8) ? pAbove1 : pAbove0;
         pixel *left  = (cuSize >= 8) ? pLeft1 : pLeft0;
         primitives.intra_pred_planar((pixel*)above + 1, (pixel*)left + 1, predictions + predsize, cuSize, cuSize);
diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.cpp
--- a/source/test/intrapredharness.cpp	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/test/intrapredharness.cpp	Wed Nov 20 12:45:49 2013 +0800
@@ -68,17 +68,16 @@
     X265_FREE(pixel_out_33_vec);
 }
 
-bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt)
+bool IntraPredHarness::check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width)
 {
     int j = ADI_BUF_STRIDE;
 
     for (int i = 0; i <= 100; i++)
     {
-        int rand_width = 1 << ((rand() % 4) + 2);                  // Randomly generated Width
         int rand_filter = rand() & 1;
 
         pixel left[MAX_CU_SIZE * 2 + 1];
-        for (int k = 0; k < rand_width * 2 + 1; k++)
+        for (int k = 0; k < width * 2 + 1; k++)
         {
             left[k] = pixel_buff[j - 1 + k * ADI_BUF_STRIDE];
         }
@@ -87,17 +86,16 @@
         memset(pixel_out_vec, 0xCD, out_size);
         memset(pixel_out_c, 0xCD, out_size);
 #endif
+        ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_filter);
+        opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
 
-        ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_width, rand_filter);
-        opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
-
-        for (int k = 0; k < rand_width; k++)
+        for (int k = 0; k < width; k++)
         {
-            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, rand_width))
+            if (memcmp(pixel_out_vec + k * FENC_STRIDE, pixel_out_c + k * FENC_STRIDE, width))
             {
 #if _DEBUG
-                ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_width, rand_filter);
-                opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_width, rand_filter);
+                ref(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_c,   FENC_STRIDE, rand_filter);
+                opt(pixel_buff + j - ADI_BUF_STRIDE, left + 1, pixel_out_vec, FENC_STRIDE, rand_filter);
 #endif
                 return false;
             }
@@ -245,12 +243,16 @@
 
 bool IntraPredHarness::testCorrectness(const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
-    if (opt.intra_pred_dc)
+    for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
     {
-        if (!check_dc_primitive(ref.intra_pred_dc, opt.intra_pred_dc))
+        if (opt.intra_pred_dc[i])
         {
-            printf("intra_dc failed\n");
-            return false;
+            const int size = (1 << (i + 2));
+            if (!check_dc_primitive(ref.intra_pred_dc[i], opt.intra_pred_dc[i], size))
+            {
+                printf("intra_dc %dx%d failed\n", size, size);
+                return false;
+            }
         }
     }
     if (opt.intra_pred_planar)
@@ -286,14 +288,18 @@
     int width = 64;
     uint16_t srcStride = 96;
 
-    if (opt.intra_pred_dc)
+    for(int i = 0; i < NUM_SQUARE_BLOCKS; i++)
     {
-        printf("intra_dc[filter=0]");
-        REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
-                       pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 0);
-        printf("intra_dc[filter=1]");
-        REPORT_SPEEDUP(opt.intra_pred_dc, ref.intra_pred_dc,
-                       pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, width, 1);
+        if (opt.intra_pred_dc[i])
+        {
+            const int size = (1 << (i + 2));
+            printf("intra_dc_%dx%d[filter=0]", size, size);
+            REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
+                           pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 0);
+            printf("intra_dc_%dx%d[filter=1]", size, size);
+            REPORT_SPEEDUP(opt.intra_pred_dc[i], ref.intra_pred_dc[i],
+                           pixel_buff + srcStride, pixel_buff, pixel_out_vec, FENC_STRIDE, 1);
+        }
     }
     if (opt.intra_pred_planar)
     {
diff -r 17e5d27ae034 -r 400ab5fa3173 source/test/intrapredharness.h
--- a/source/test/intrapredharness.h	Wed Nov 20 12:45:28 2013 +0800
+++ b/source/test/intrapredharness.h	Wed Nov 20 12:45:49 2013 +0800
@@ -43,7 +43,7 @@
     static const int out_size = 64 * FENC_STRIDE;
     static const int out_size_33 = 33 * 64 * FENC_STRIDE;
 
-    bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt);
+    bool check_dc_primitive(intra_dc_t ref, intra_dc_t opt, int width);
     bool check_planar_primitive(intra_planar_t ref, intra_planar_t opt);
     bool check_angular_primitive(intra_ang_t ref, intra_ang_t opt);
     bool check_allangs_primitive(const intra_allangs_t ref[], const intra_allangs_t opt[]);



More information about the x265-devel mailing list