[x265] [PATCH] intra: sse4 version of strong intra smoothing

Ximing Cheng chengximing1989 at foxmail.com
Mon Nov 20 20:16:40 CET 2017


# HG changeset patch
# User Ximing Cheng <ximingcheng at tencent.com>
# Date 1511205390 -28800
#      Tue Nov 21 03:16:30 2017 +0800
# Node ID 50a37352461d3218c0e62ce48a2772978fc863f3
# Parent  a7c2f80c18afa0deff2c5b18897f5b3ebf70657c
intra: sse4 version of strong intra smoothing

diff -r a7c2f80c18af -r 50a37352461d source/common/intrapred.cpp
--- a/source/common/intrapred.cpp	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/intrapred.cpp	Tue Nov 21 03:16:30 2017 +0800
@@ -29,12 +29,43 @@
 namespace {
 
 template<int tuSize>
-void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
+void intraFilter(const pixel* samples, pixel* filtered, int bUseStrongFilter)
 {
     const int tuSize2 = tuSize << 1;
 
     pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
 
+    // strong intra filter
+    if (bUseStrongFilter && tuSize >= 32)
+    {
+        const pixel leftMiddle = samples[tuSize2 + tuSize];
+        const pixel topMiddle = samples[tuSize];
+        const static int threshold = 1 << (X265_DEPTH - 5);
+        const bool bilinearLeft = abs((leftLast + topLeft) - (2 * leftMiddle)) < threshold; //difference between the
+        const bool bilinearAbove = abs((topLeft + topLast) - (2 * topMiddle)) < threshold; //ends and the middle
+
+        if (bilinearLeft && bilinearAbove)
+        {
+            const int shift = 5 + 1;
+            int init = (topLeft << shift) + tuSize;
+            int deltaL, deltaR;
+
+            deltaL = leftLast - topLeft;
+            deltaR = topLast - topLeft;
+
+            filtered[0] = topLeft;
+            for (int i = 1; i < tuSize2; i++)
+            {
+                filtered[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
+                filtered[i] = (pixel)((init + deltaR * i) >> shift);           // Above Filtering
+            }
+            filtered[tuSize2] = topLast;
+            filtered[tuSize2 + tuSize2] = leftLast;
+            return;
+        }
+    }
+
+    /* 1:2:1 filtering of left and top reference samples */
     // filtering top
     for (int i = 1; i < tuSize2; i++)
         filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
diff -r a7c2f80c18af -r 50a37352461d source/common/predict.cpp
--- a/source/common/predict.cpp	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/predict.cpp	Tue Nov 21 03:16:30 2017 +0800
@@ -594,7 +594,6 @@
 void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
 {
     int tuSize = 1 << intraNeighbors.log2TrSize;
-    int tuSize2 = tuSize << 1;
 
     PicYuv* reconPic = cu.m_encData->m_reconPic;
     pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
@@ -605,41 +604,11 @@
     pixel* refBuf = intraNeighbourBuf[0];
     pixel* fltBuf = intraNeighbourBuf[1];
 
-    pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
-
     if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
     {
         // generate filtered intra prediction samples
-
-        if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32)
-        {
-            const int threshold = 1 << (X265_DEPTH - 5);
-
-            pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
-
-            if (abs(topLeft + topLast  - (topMiddle  << 1)) < threshold &&
-                abs(topLeft + leftLast - (leftMiddle << 1)) < threshold)
-            {
-                // "strong" bilinear interpolation
-                const int shift = 5 + 1;
-                int init = (topLeft << shift) + tuSize;
-                int deltaL, deltaR;
-
-                deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
-
-                fltBuf[0] = topLeft;
-                for (int i = 1; i < tuSize2; i++)
-                {
-                    fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
-                    fltBuf[i] = (pixel)((init + deltaR * i) >> shift);           // Above Filtering
-                }
-                fltBuf[tuSize2] = topLast;
-                fltBuf[tuSize2 + tuSize2] = leftLast;
-                return;
-            }
-        }
-
-        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
+        int bUseStrongIntraSmoothing = cu.m_slice->m_sps->bUseStrongIntraSmoothing;
+        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf, bUseStrongIntraSmoothing);
     }
 }
 
@@ -652,7 +621,7 @@
     fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
 
     if (m_csp == X265_CSP_I444)
-        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]);
+        primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1], 0);
 }
 
 void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
diff -r a7c2f80c18af -r 50a37352461d source/common/primitives.h
--- a/source/common/primitives.h	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/primitives.h	Tue Nov 21 03:16:30 2017 +0800
@@ -133,7 +133,7 @@
 
 typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
 typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
-typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered, int bUseStrongFilter);
 
 typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
 typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r a7c2f80c18af -r 50a37352461d source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/x86/const-a.asm	Tue Nov 21 03:16:30 2017 +0800
@@ -112,6 +112,10 @@
 const multi_2Row,           times  1 dw   1,   2,   3,   4,   1,   2,   3,   4
 const multiH,               times  1 dw   9,  10,  11,  12,  13,  14,  15,  16
 const multiH3,              times  1 dw  25,  26,  27,  28,  29,  30,  31,  32
+const multiH3_1,            times  1 dw  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,  47,  48
+const multiH3_2,            times  1 dw  41,  42,  43,  44,  45,  46,  47,  48
+const multiH4,              times  1 dw  49,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64
+const multiH4_1,            times  1 dw  57,  58,  59,  60,  61,  62,  63,  64
 const multiL,               times  1 dw   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,  15,  16
 const multiH2,              times  1 dw  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,  29,  30,  31,  32
 const pw_planar16_mul,      times  1 dw  15,  14,  13,  12,  11,  10,   9,   8,   7,   6,   5,   4,   3,   2,   1,   0
diff -r a7c2f80c18af -r 50a37352461d source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/x86/intrapred.h	Tue Nov 21 03:16:30 2017 +0800
@@ -67,7 +67,7 @@
 
 #define DECL_ALL(cpu) \
     FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
-    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
+    FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered, int bUseStrongFilter); \
     DECL_ANGS(4, cpu); \
     DECL_ANGS(8, cpu); \
     DECL_ANGS(16, cpu); \
diff -r a7c2f80c18af -r 50a37352461d source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/x86/intrapred8.asm	Tue Nov 21 03:16:30 2017 +0800
@@ -543,6 +543,10 @@
 cextern multiH
 cextern multiH2
 cextern multiH3
+cextern multiH3_1
+cextern multiH3_2
+cextern multiH4
+cextern multiH4_1
 cextern multi_2Row
 cextern trans8_shuf
 cextern pw_planar16_mul
@@ -22313,11 +22317,144 @@
     mov             [r1 + 64], r3b                  ; LeftLast
     RET
 
-INIT_XMM sse4
-cglobal intra_filter_32x32, 2,4,6
-    mov             r2b, byte [r0 +  64]            ; topLast
-    mov             r3b, byte [r0 + 128]            ; LeftLast
-
+; this function also implements the strong intra filter
+INIT_XMM sse4
+cglobal intra_filter_32x32, 3,8,7
+    xor             r3d, r3d             ; R9
+    xor             r4d, r4d             ; R10
+    mov             r3b, byte [r0 +  64] ; topLast
+    mov             r4b, byte [r0 + 128] ; LeftLast
+
+    ; strong intra filter is disabled
+    cmp             r2m, byte 0
+    jz              .normal_filter32
+    ; decide to do strong intra filter
+    xor             r5d, r5d             ; R11
+    xor             r6d, r6d             ; RAX
+    xor             r7d, r7d             ; RDI
+    mov             r5b, byte [r0]       ; topLeft
+    mov             r6b, byte [r0 + 96]  ; leftMiddle
+    mov             r7b, byte [r0 + 32]  ; topMiddle
+
+    ; threshold = 8
+    mov             r2d, r3d             ; R8
+    add             r2d, r5d             ; (topLast + topLeft)
+    shl             r7d, 1               ; 2 * topMiddle
+    sub             r2d, r7d
+    mov             r7d, r2d             ; backup r2d
+    sar             r7d, 31
+    xor             r2d, r7d
+    sub             r2d, r7d             ; abs(r2d)
+    cmp             r2d, 8
+    ; bilinearAbove is false
+    jns             .normal_filter32
+
+    mov             r2d, r5d
+    add             r2d, r4d
+    shl             r6d, 1
+    sub             r2d, r6d
+    mov             r6d, r2d
+    sar             r6d, 31
+    xor             r2d, r6d
+    sub             r2d, r6d
+    cmp             r2d, 8
+    ; bilinearLeft is false
+    jns             .normal_filter32
+
+    ; do strong intra filter shift = 6
+    mov             r2d, r5d
+    shl             r2d, 6
+    add             r2d, 32              ; init
+    mov             r6d, r4d
+    sub             r6w, r5w             ; deltaL size is word
+    mov             r7d, r3d
+    sub             r7w, r5w             ; deltaR size is word
+    movd            xmm0, r2d
+    vpbroadcastw    xmm0, xmm0
+    mova            xmm4, xmm0
+
+    movd            xmm1, r7d
+    vpbroadcastw    xmm1, xmm1
+    pmullw          xmm2, xmm1, [multiL] ; [ 1  2  3  4  5  6  7  8]
+    pmullw          xmm3, xmm1, [multiH] ; [ 9 10 11 12 13 14 15 16]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm3
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 1], xmm5
+
+    pmullw          xmm2, xmm1, [multiH2]; [17 18 19 20 21 22 23 24]
+    pmullw          xmm3, xmm1, [multiH3]; [25 26 27 28 29 30 31 32]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm3
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 17], xmm5
+
+    pmullw          xmm2, xmm1, [multiH3_1] ; [33 - 40]
+    pmullw          xmm3, xmm1, [multiH3_2] ; [41 - 48]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm3
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 33], xmm5
+
+    pmullw          xmm2, xmm1, [multiH4]   ; [49 - 56]
+    pmullw          xmm1, [multiH4_1]       ; [57 - 64]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm1
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 49], xmm5
+
+    movd            xmm1, r6d
+    vpbroadcastw    xmm1, xmm1
+    pmullw          xmm2, xmm1, [multiL] ; [ 1  2  3  4  5  6  7  8]
+    pmullw          xmm3, xmm1, [multiH] ; [ 9 10 11 12 13 14 15 16]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm3
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 65], xmm5
+
+    pmullw          xmm2, xmm1, [multiH2]; [17 18 19 20 21 22 23 24]
+    pmullw          xmm3, xmm1, [multiH3]; [25 26 27 28 29 30 31 32]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm3
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 81], xmm5
+
+    pmullw          xmm2, xmm1, [multiH3_1] ; [33 - 40]
+    pmullw          xmm3, xmm1, [multiH3_2] ; [41 - 48]
+    paddw           xmm5, xmm0, xmm2
+    paddw           xmm6, xmm4, xmm3
+    psraw           xmm5, 6
+    psraw           xmm6, 6
+    packuswb        xmm5, xmm6
+    movu            [r1 + 97], xmm5
+
+    pmullw          xmm2, xmm1, [multiH4]   ; [49 - 56]
+    pmullw          xmm1, [multiH4_1]       ; [57 - 64]
+    paddw           xmm0, xmm2
+    paddw           xmm4, xmm1
+    psraw           xmm0, 6
+    psraw           xmm4, 6
+    packuswb        xmm0, xmm4
+    movu            [r1 + 113], xmm0
+
+    mov             [r1], r5b               ; topLeft
+    mov             [r1 +  64], r3b         ; topLast
+    mov             [r1 + 128], r4b         ; LeftLast
+    RET
+
+.normal_filter32
     ; filtering top
     ; 0 to 15
     pmovzxbw        m0, [r0 +  0]
@@ -22514,8 +22651,8 @@
     packuswb        m1, m5
     movu            [r1 + 112], m1
 
-    mov             [r1 +  64], r2b                 ; topLast
-    mov             [r1 + 128], r3b                 ; LeftLast
+    mov             [r1 +  64], r3b                 ; topLast
+    mov             [r1 + 128], r4b                 ; LeftLast
     RET
 
 INIT_YMM avx2
diff -r a7c2f80c18af -r 50a37352461d source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp	Mon Nov 20 14:31:22 2017 +0530
+++ b/source/encoder/slicetype.cpp	Tue Nov 21 03:16:30 2017 +0800
@@ -349,7 +349,7 @@
             for (int i = 1; i <= 2 * cuSize; i++)
                 samples[cuSize2 + i] = pixCur[i * fenc.lumaStride];    /* left */
 
-            primitives.cu[sizeIdx].intra_filter(samples, filtered);
+            primitives.cu[sizeIdx].intra_filter(samples, filtered, 0);
 
             int cost, icost = me.COST_MAX;
             uint32_t ilowmode = 0;




More information about the x265-devel mailing list