[x265] [PATCH] intra: sse4 version of strong intrasmoothing
Ximing Cheng
chengximing1989 at foxmail.com
Tue Nov 28 16:57:50 CET 2017
# HG changeset patch
# User Ximing Cheng <ximingcheng at tencent.com>
# Date 1511862059 -28800
# Tue Nov 28 17:40:59 2017 +0800
# Node ID 9cd0cf6e2fd88604d939138e539dd481ec429ab3
# Parent b24454f3ff6de650aab6835e291837fc4e2a4466
intra: sse4 version of strong intrasmoothing
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/intrapred.cpp Tue Nov 28 17:40:59 2017 +0800
@@ -29,12 +29,43 @@
namespace {
template<int tuSize>
-void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
+void intraFilter(const pixel* samples, pixel* filtered, int bUseStrongFilter)
{
const int tuSize2 = tuSize << 1;
pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
+ // strong intra filter
+ if (bUseStrongFilter && tuSize >= 32)
+ {
+ const pixel leftMiddle = samples[tuSize2 + tuSize];
+ const pixel topMiddle = samples[tuSize];
+ const static int threshold = 1 << (X265_DEPTH - 5);
+ const bool bilinearLeft = abs((leftLast + topLeft) - (2 * leftMiddle)) < threshold; //difference between the
+ const bool bilinearAbove = abs((topLeft + topLast) - (2 * topMiddle)) < threshold; //ends and the middle
+
+ if (bilinearLeft && bilinearAbove)
+ {
+ const int shift = 5 + 1;
+ int init = (topLeft << shift) + tuSize;
+ int deltaL, deltaR;
+
+ deltaL = leftLast - topLeft;
+ deltaR = topLast - topLeft;
+
+ filtered[0] = topLeft;
+ for (int i = 1; i < tuSize2; i++)
+ {
+ filtered[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
+ filtered[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering
+ }
+ filtered[tuSize2] = topLast;
+ filtered[tuSize2 + tuSize2] = leftLast;
+ return;
+ }
+ }
+ /* 1:2:1 filtering of left and top reference samples */
+
// filtering top
for (int i = 1; i < tuSize2; i++)
filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/predict.cpp
--- a/source/common/predict.cpp Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/predict.cpp Tue Nov 28 17:40:59 2017 +0800
@@ -594,7 +594,6 @@
void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
{
int tuSize = 1 << intraNeighbors.log2TrSize;
- int tuSize2 = tuSize << 1;
PicYuv* reconPic = cu.m_encData->m_reconPic;
pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
@@ -605,41 +604,11 @@
pixel* refBuf = intraNeighbourBuf[0];
pixel* fltBuf = intraNeighbourBuf[1];
- pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
-
if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
{
// generate filtered intra prediction samples
-
- if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32)
- {
- const int threshold = 1 << (X265_DEPTH - 5);
-
- pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
-
- if (abs(topLeft + topLast - (topMiddle << 1)) < threshold &&
- abs(topLeft + leftLast - (leftMiddle << 1)) < threshold)
- {
- // "strong" bilinear interpolation
- const int shift = 5 + 1;
- int init = (topLeft << shift) + tuSize;
- int deltaL, deltaR;
-
- deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
-
- fltBuf[0] = topLeft;
- for (int i = 1; i < tuSize2; i++)
- {
- fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
- fltBuf[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering
- }
- fltBuf[tuSize2] = topLast;
- fltBuf[tuSize2 + tuSize2] = leftLast;
- return;
- }
- }
-
- primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
+ int bUseStrongIntraSmoothing = cu.m_slice->m_sps->bUseStrongIntraSmoothing;
+ primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf, bUseStrongIntraSmoothing);
}
}
@@ -652,7 +621,7 @@
fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
if (m_csp == X265_CSP_I444)
- primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]);
+ primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1], 0);
}
void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/primitives.h
--- a/source/common/primitives.h Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/primitives.h Tue Nov 28 17:40:59 2017 +0800
@@ -133,7 +133,7 @@
typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
-typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered, int bUseStrongFilter);
typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/x86/const-a.asm Tue Nov 28 17:40:59 2017 +0800
@@ -114,6 +114,10 @@
const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
+const multiH3_1, times 1 dw 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48
+const multiH3_2, times 1 dw 41, 42, 43, 44, 45, 46, 47, 48
+const multiH4, times 1 dw 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+const multiH4_1, times 1 dw 57, 58, 59, 60, 61, 62, 63, 64
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
const pw_planar32_mul, times 1 dw 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16
const pw_FFFFFFFFFFFFFFF0, dw 0x00
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/x86/intrapred.h Tue Nov 28 17:40:59 2017 +0800
@@ -67,7 +67,7 @@
#define DECL_ALL(cpu) \
FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
- FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
+ FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered, int bUseStrongFilter); \
DECL_ANGS(4, cpu); \
DECL_ANGS(8, cpu); \
DECL_ANGS(16, cpu); \
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Wed Nov 22 22:00:48 2017 +0530
+++ b/source/common/x86/intrapred8.asm Tue Nov 28 17:40:59 2017 +0800
@@ -543,6 +543,10 @@
cextern multiH
cextern multiH2
cextern multiH3
+cextern multiH3_1
+cextern multiH3_2
+cextern multiH4
+cextern multiH4_1
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
@@ -22313,11 +22317,142 @@
mov [r1 + 64], r3b ; LeftLast
RET
-INIT_XMM sse4
-cglobal intra_filter_32x32, 2,4,6
- mov r2b, byte [r0 + 64] ; topLast
- mov r3b, byte [r0 + 128] ; LeftLast
-
+; this function also implements the strong intra filter
+INIT_XMM sse4
+cglobal intra_filter_32x32, 3,8,7
+ movzx r3d, byte [r0 + 64] ; topLast
+ movzx r4d, byte [r0 + 128] ; LeftLast
+
+ ; strong intra filter is disabled
+ cmp r2m, byte 0
+ jz .normal_filter32
+ ; decide to do strong intra filter
+ movzx r5d, byte [r0] ; topLeft
+ movzx r6d, byte [r0 + 32] ; topMiddle
+
+ ; threshold = 8
+ mov r2d, r3d
+ add r2d, r5d ; (topLast + topLeft)
+ shl r6d, 1 ; 2 * topMiddle
+ mov r7d, r2d
+ sub r2d, r6d ; (topLast + topLeft) - 2 * topMiddle
+ sub r6d, r7d ; 2 * topMiddle - (topLast + topLeft)
+ cmovg r2d, r6d
+ cmp r2d, 8
+ ; bilinearAbove is false
+ jns .normal_filter32
+
+ movzx r6d, byte [r0 + 96] ; leftMiddle
+ mov r2d, r5d
+ add r2d, r4d
+ shl r6d, 1
+ mov r7d, r2d
+ sub r2d, r6d
+ sub r6d, r7d
+ cmovg r2d, r6d
+ cmp r2d, 8
+ ; bilinearLeft is false
+ jns .normal_filter32
+
+ ; apply the strong intra filter (shift = 6)
+ mov r2d, r5d
+ shl r2d, 6
+ add r2d, 32 ; init
+ mov r6d, r4d
+ sub r6d, r5d ; deltaL
+ mov r7d, r3d
+ sub r7d, r5d ; deltaR
+
+ movd m0, r2d
+ pshuflw m0, m0, 0
+ movlhps m0, m0
+ mova m4, m0
+
+
+ movd m1, r7d
+ pshuflw m1, m1, 0
+ movlhps m1, m1
+ pmullw m2, m1, [multiL] ; [ 1 2 3 4 5 6 7 8]
+ pmullw m3, m1, [multiH] ; [ 9 10 11 12 13 14 15 16]
+ paddw m5, m0, m2
+ paddw m6, m4, m3
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 1], m5
+
+ pmullw m2, m1, [multiH2] ; [17 18 19 20 21 22 23 24]
+ pmullw m3, m1, [multiH3] ; [25 26 27 28 29 30 31 32]
+ paddw m5, m0, m2
+ paddw m6, m4, m3
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 17], m5
+
+ pmullw m2, m1, [multiH3_1] ; [33 - 40]
+ pmullw m3, m1, [multiH3_2] ; [41 - 48]
+ paddw m5, m0, m2
+ paddw m6, m4, m3
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 33], m5
+
+ pmullw m2, m1, [multiH4] ; [49 - 56]
+ pmullw m1, [multiH4_1] ; [57 - 64]
+ paddw m5, m0, m2
+ paddw m6, m4, m1
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 49], m5
+
+ movd m1, r6d
+ pshuflw m1, m1, 0
+ movlhps m1, m1
+ pmullw m2, m1, [multiL] ; [ 1 2 3 4 5 6 7 8]
+ pmullw m3, m1, [multiH] ; [ 9 10 11 12 13 14 15 16]
+ paddw m5, m0, m2
+ paddw m6, m4, m3
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 65], m5
+
+ pmullw m2, m1, [multiH2] ; [17 18 19 20 21 22 23 24]
+ pmullw m3, m1, [multiH3] ; [25 26 27 28 29 30 31 32]
+ paddw m5, m0, m2
+ paddw m6, m4, m3
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 81], m5
+
+ pmullw m2, m1, [multiH3_1] ; [33 - 40]
+ pmullw m3, m1, [multiH3_2] ; [41 - 48]
+ paddw m5, m0, m2
+ paddw m6, m4, m3
+ psraw m5, 6
+ psraw m6, 6
+ packuswb m5, m6
+ movu [r1 + 97], m5
+
+ pmullw m2, m1, [multiH4] ; [49 - 56]
+ pmullw m1, [multiH4_1] ; [57 - 64]
+ paddw m0, m2
+ paddw m4, m1
+ psraw m0, 6
+ psraw m4, 6
+ packuswb m0, m4
+ movu [r1 + 113], m0
+
+ mov [r1], r5b ; topLeft
+ mov [r1 + 64], r3b ; topLast
+ mov [r1 + 128], r4b ; LeftLast
+ RET
+
+.normal_filter32
; filtering top
; 0 to 15
pmovzxbw m0, [r0 + 0]
@@ -22514,8 +22649,8 @@
packuswb m1, m5
movu [r1 + 112], m1
- mov [r1 + 64], r2b ; topLast
- mov [r1 + 128], r3b ; LeftLast
+ mov [r1 + 64], r3b ; topLast
+ mov [r1 + 128], r4b ; LeftLast
RET
INIT_YMM avx2
diff -r b24454f3ff6d -r 9cd0cf6e2fd8 source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Wed Nov 22 22:00:48 2017 +0530
+++ b/source/encoder/slicetype.cpp Tue Nov 28 17:40:59 2017 +0800
@@ -349,7 +349,7 @@
for (int i = 1; i <= 2 * cuSize; i++)
samples[cuSize2 + i] = pixCur[i * fenc.lumaStride]; /* left */
- primitives.cu[sizeIdx].intra_filter(samples, filtered);
+ primitives.cu[sizeIdx].intra_filter(samples, filtered, 0);
int cost, icost = me.COST_MAX;
uint32_t ilowmode = 0;
More information about the x265-devel
mailing list