[x265] [PATCH] intra: sse4 version of strong intrasmoothing

Ximing Cheng chengximing1989 at foxmail.com
Tue Nov 28 16:57:50 CET 2017


# HG changeset patch
# User Ximing Cheng <ximingcheng at tencent.com>
# Date 1511862059 -28800
#      Tue Nov 28 17:40:59 2017 +0800
# Node ID 9cd0cf6e2fd88604d939138e539dd481ec429ab3
# Parent  b24454f3ff6de650aab6835e291837fc4e2a4466
intra: sse4 version of strong intrasmoothing

diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/intrapred.cpp	Tue Nov 28 17:40:59 2017 +0800
@@ -29,12 +29,43 @@
 namespace {
 
 template<int tuSize>
-void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
+void intraFilter(const pixel* samples, pixel* filtered, int bUseStrongFilter)
 {
     const int tuSize2 = tuSize << 1;
 
     pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
 
+    // strong intra filter
+    if (bUseStrongFilter && tuSize >= 32)
+    {
+        const pixel leftMiddle = samples[tuSize2 + tuSize];
+        const pixel topMiddle = samples[tuSize];
+        const static int threshold = 1 << (X265_DEPTH - 5);
+        const bool bilinearLeft = abs((leftLast + topLeft) - (2 * leftMiddle)) < threshold; //difference between the
+        const bool bilinearAbove = abs((topLeft + topLast) - (2 * topMiddle)) < threshold; //ends and the middle
+
+        if (bilinearLeft && bilinearAbove)
+        {
+            const int shift = 5 + 1;
+            int init = (topLeft << shift) + tuSize;
+            int deltaL, deltaR;
+
+            deltaL = leftLast - topLeft;
+            deltaR = topLast - topLeft;
+
+            filtered[0] = topLeft;
+            for (int i = 1; i < tuSize2; i++)
+            {
+                filtered[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
+                filtered[i] = (pixel)((init + deltaR * i) >> shift);           // Above Filtering
+            }
+            filtered[tuSize2] = topLast;
+            filtered[tuSize2 + tuSize2] = leftLast;
+            return;
+        }
+    }
+    /* 1:2:1 filtering of left and top reference samples */
+
     // filtering top
     for (int i = 1; i < tuSize2; i++)
         filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/predict.cpp
--- a/source/common/predict.cpp	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/predict.cpp	Tue Nov 28 17:40:59 2017 +0800
@@ -594,7 +594,6 @@
 void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
 {
     int tuSize = 1 << intraNeighbors.log2TrSize;
-    int tuSize2 = tuSize << 1;
 
     PicYuv* reconPic = cu.m_encData->m_reconPic;
     pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
@@ -605,41 +604,11 @@
     pixel* refBuf = intraNeighbourBuf[0];
     pixel* fltBuf = intraNeighbourBuf[1];
 
-    pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
-
     if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
     {
         // generate filtered intra prediction samples
-
-        if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32)
-        {
-            const int threshold = 1 << (X265_DEPTH - 5);
-
-            pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
-
-            if (abs(topLeft + topLast  - (topMiddle  << 1)) < threshold &&
-                abs(topLeft + leftLast - (leftMiddle << 1)) < threshold)
-            {
-                // "strong" bilinear interpolation
-                const int shift = 5 + 1;
-                int init = (topLeft << shift) + tuSize;
-                int deltaL, deltaR;
-
-                deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
-
-                fltBuf[0] = topLeft;
-                for (int i = 1; i < tuSize2; i++)
-                {
-                    fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
-                    fltBuf[i] = (pixel)((init + deltaR * i) >> shift);           // Above Filtering
-                }
-                fltBuf[tuSize2] = topLast;
-                fltBuf[tuSize2 + tuSize2] = leftLast;
-                return;
-            }
-        }
-
-        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
+        int bUseStrongIntraSmoothing = cu.m_slice->m_sps->bUseStrongIntraSmoothing;
+        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf, bUseStrongIntraSmoothing);
     }
 }
 
@@ -652,7 +621,7 @@
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
     if (m_csp == X265_CSP_I444)
-        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]);
+        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1], 0);
 }
 
 void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/primitives.h
--- a/source/common/primitives.h	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/primitives.h	Tue Nov 28 17:40:59 2017 +0800
@@ -133,7 +133,7 @@
 
 typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
 typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
-typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered, int bUseStrongFilter);
 
 typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/x86/const-a.asm	Tue Nov 28 17:40:59 2017 +0800
@@ -114,6 +114,10 @@
 const multiH3,              times  1 dw  25,  26,  27,  28,  29,  30,  31,  32
 const multiL,               times  1 dw   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16
 const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
+const multiH3_1,            times  1 dw  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48
+const multiH3_2,            times  1 dw  41,  42,  43,  44,  45,  46,  47,  48
+const multiH4,              times  1 dw  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64
+const multiH4_1,            times  1 dw  57,  58,  59,  60,  61,  62,  63,  64
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
 const pw_planar32_mul,      times  1 dw  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,  21,  20,  19,  18,  17,  16
 const pw_FFFFFFFFFFFFFFF0,           dw 0x00
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/x86/intrapred.h	Tue Nov 28 17:40:59 2017 +0800
@@ -67,7 +67,7 @@
 
 #define DECL_ALL(cpu) \
     FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
-    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
+    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered, int bUseStrongFilter); \
     DECL_ANGS(4, cpu); \
     DECL_ANGS(8, cpu); \
     DECL_ANGS(16, cpu); \
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Nov 28 17:40:59 2017 +0800
@@ -543,6 +543,10 @@
 cextern multiH
 cextern multiH2
 cextern multiH3
+cextern multiH3_1
+cextern multiH3_2
+cextern multiH4
+cextern multiH4_1
 cextern multi_2Row
 cextern trans8_shuf
 cextern pw_planar16_mul
@@ -22313,11 +22317,142 @@
     mov             [r1 + 64], r3b                  ; LeftLast
     RET
 
-INIT_XMM sse4
-cglobal intra_filter_32x32, 2,4,6
-    mov             r2b, byte [r0 +  64]            ; topLast
-    mov             r3b, byte [r0 + 128]            ; LeftLast
-
+; this version of the function also implements the strong intra filter
+INIT_XMM sse4
+cglobal intra_filter_32x32, 3,8,7
+    movzx           r3d, byte [r0 +  64]            ; topLast
+    movzx           r4d, byte [r0 + 128]            ; LeftLast
+
+    ; strong intra filter is disabled
+    cmp             r2m, byte 0
+    jz              .normal_filter32
+    ; decide whether to apply the strong intra filter
+    movzx           r5d, byte [r0]                  ; topLeft
+    movzx           r6d, byte [r0 + 32]             ; topMiddle
+
+    ; threshold = 8
+    mov             r2d, r3d
+    add             r2d, r5d                        ; (topLast + topLeft)
+    shl             r6d, 1                          ; 2 * topMiddle
+    mov             r7d, r2d
+    sub             r2d, r6d                        ; (topLast + topLeft) - 2 * topMiddle
+    sub             r6d, r7d                        ; 2 * topMiddle - (topLast + topLeft)
+    cmovg           r2d, r6d
+    cmp             r2d, 8
+    ; bilinearAbove is false
+    jns             .normal_filter32
+
+    movzx           r6d, byte [r0 + 96]             ; leftMiddle
+    mov             r2d, r5d
+    add             r2d, r4d
+    shl             r6d, 1
+    mov             r7d, r2d
+    sub             r2d, r6d
+    sub             r6d, r7d
+    cmovg           r2d, r6d
+    cmp             r2d, 8
+    ; bilinearLeft is false
+    jns             .normal_filter32
+
+    ; do strong intra filter shift = 6
+    mov             r2d, r5d
+    shl             r2d, 6
+    add             r2d, 32                         ; init
+    mov             r6d, r4d
+    sub             r6d, r5d                        ; deltaL
+    mov             r7d, r3d
+    sub             r7d, r5d                        ; deltaR
+
+    movd            m0, r2d
+    pshuflw         m0, m0, 0
+    movlhps         m0, m0
+    mova            m4, m0
+
+
+    movd            m1, r7d
+    pshuflw         m1, m1, 0
+    movlhps         m1, m1
+    pmullw          m2, m1, [multiL]                ; [ 1  2  3  4  5  6  7  8]
+    pmullw          m3, m1, [multiH]                ; [ 9 10 11 12 13 14 15 16]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m3
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 1], m5
+
+    pmullw          m2, m1, [multiH2]               ; [17 18 19 20 21 22 23 24]
+    pmullw          m3, m1, [multiH3]               ; [25 26 27 28 29 30 31 32]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m3
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 17], m5
+
+    pmullw          m2, m1, [multiH3_1]             ; [33 - 40]
+    pmullw          m3, m1, [multiH3_2]             ; [41 - 48]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m3
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 33], m5
+
+    pmullw          m2, m1, [multiH4]               ; [49 - 56]
+    pmullw          m1, [multiH4_1]                 ; [57 - 64]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m1
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 49], m5
+
+    movd            m1, r6d
+    pshuflw         m1, m1, 0
+    movlhps         m1, m1
+    pmullw          m2, m1, [multiL]                ; [ 1  2  3  4  5  6  7  8]
+    pmullw          m3, m1, [multiH]                ; [ 9 10 11 12 13 14 15 16]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m3
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 65], m5
+
+    pmullw          m2, m1, [multiH2]               ; [17 18 19 20 21 22 23 24]
+    pmullw          m3, m1, [multiH3]               ; [25 26 27 28 29 30 31 32]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m3
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 81], m5
+
+    pmullw          m2, m1, [multiH3_1]             ; [33 - 40]
+    pmullw          m3, m1, [multiH3_2]             ; [41 - 48]
+    paddw           m5, m0, m2
+    paddw           m6, m4, m3
+    psraw           m5, 6
+    psraw           m6, 6
+    packuswb        m5, m6
+    movu            [r1 + 97], m5
+
+    pmullw          m2, m1, [multiH4]               ; [49 - 56]
+    pmullw          m1, [multiH4_1]                 ; [57 - 64]
+    paddw           m0, m2
+    paddw           m4, m1
+    psraw           m0, 6
+    psraw           m4, 6
+    packuswb        m0, m4
+    movu            [r1 + 113], m0
+
+    mov             [r1], r5b                       ; topLeft
+    mov             [r1 +  64], r3b                 ; topLast
+    mov             [r1 + 128], r4b                 ; LeftLast
+    RET
+
+.normal_filter32
     ; filtering top
     ; 0 to 15
     pmovzxbw        m0, [r0 +  0]
@@ -22514,8 +22649,8 @@
     packuswb        m1, m5
     movu            [r1 + 112], m1
 
-    mov             [r1 +  64], r2b                 ; topLast
-    mov             [r1 + 128], r3b                 ; LeftLast
+    mov             [r1 +  64], r3b                 ; topLast
+    mov             [r1 + 128], r4b                 ; LeftLast
     RET
 
 INIT_YMM avx2
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Wed Nov 22 22:00:48 2017 +0530
+++ b/source/encoder/slicetype.cpp	Tue Nov 28 17:40:59 2017 +0800
@@ -349,7 +349,7 @@
             for (int i = 1; i <= 2 * cuSize; i++)
                 samples[cuSize2 + i] = pixCur[i * fenc.lumaStride];    /* left */
 
-            primitives.cu[sizeIdx].intra_filter(samples, filtered);
+            primitives.cu[sizeIdx].intra_filter(samples, filtered, 0);
 
             int cost, icost = me.COST_MAX;
             uint32_t ilowmode = 0;




More information about the x265-devel mailing list