[x265] [PATCH] intra: sse4 version of strong intra smoothing
Ximing Cheng
chengximing1989 at foxmail.com
Mon Nov 20 20:16:40 CET 2017
# HG changeset patch
# User Ximing Cheng <ximingcheng at tencent.com>
# Date 1511205390 -28800
# Tue Nov 21 03:16:30 2017 +0800
# Node ID 50a37352461d3218c0e62ce48a2772978fc863f3
# Parent a7c2f80c18afa0deff2c5b18897f5b3ebf70657c
intra: sse4 version of strong intra smoothing
diff -r a7c2f80c18af -r 50a37352461d source/common/intrapred.cpp
--- a/source/common/intrapred.cpp Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/intrapred.cpp Tue Nov 21 03:16:30 2017 +0800
@@ -29,12 +29,43 @@
namespace {
template<int tuSize>
-void intraFilter(const pixel* samples, pixel* filtered) /* 1:2:1 filtering of left and top reference samples */
+void intraFilter(const pixel* samples, pixel* filtered, int bUseStrongFilter)
{
const int tuSize2 = tuSize << 1;
pixel topLeft = samples[0], topLast = samples[tuSize2], leftLast = samples[tuSize2 + tuSize2];
+ // strong intra filter
+ if (bUseStrongFilter && tuSize >= 32)
+ {
+ const pixel leftMiddle = samples[tuSize2 + tuSize];
+ const pixel topMiddle = samples[tuSize];
+ const static int threshold = 1 << (X265_DEPTH - 5);
+ const bool bilinearLeft = abs((leftLast + topLeft) - (2 * leftMiddle)) < threshold; //difference between the
+ const bool bilinearAbove = abs((topLeft + topLast) - (2 * topMiddle)) < threshold; //ends and the middle
+
+ if (bilinearLeft && bilinearAbove)
+ {
+ const int shift = 5 + 1;
+ int init = (topLeft << shift) + tuSize;
+ int deltaL, deltaR;
+
+ deltaL = leftLast - topLeft;
+ deltaR = topLast - topLeft;
+
+ filtered[0] = topLeft;
+ for (int i = 1; i < tuSize2; i++)
+ {
+ filtered[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
+ filtered[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering
+ }
+ filtered[tuSize2] = topLast;
+ filtered[tuSize2 + tuSize2] = leftLast;
+ return;
+ }
+ }
+
+ /* 1:2:1 filtering of left and top reference samples */
// filtering top
for (int i = 1; i < tuSize2; i++)
filtered[i] = ((samples[i] << 1) + samples[i - 1] + samples[i + 1] + 2) >> 2;
diff -r a7c2f80c18af -r 50a37352461d source/common/predict.cpp
--- a/source/common/predict.cpp Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/predict.cpp Tue Nov 21 03:16:30 2017 +0800
@@ -594,7 +594,6 @@
void Predict::initAdiPattern(const CUData& cu, const CUGeom& cuGeom, uint32_t puAbsPartIdx, const IntraNeighbors& intraNeighbors, int dirMode)
{
int tuSize = 1 << intraNeighbors.log2TrSize;
- int tuSize2 = tuSize << 1;
PicYuv* reconPic = cu.m_encData->m_reconPic;
pixel* adiOrigin = reconPic->getLumaAddr(cu.m_cuAddr, cuGeom.absPartIdx + puAbsPartIdx);
@@ -605,41 +604,11 @@
pixel* refBuf = intraNeighbourBuf[0];
pixel* fltBuf = intraNeighbourBuf[1];
- pixel topLeft = refBuf[0], topLast = refBuf[tuSize2], leftLast = refBuf[tuSize2 + tuSize2];
-
if (dirMode == ALL_IDX ? (8 | 16 | 32) & tuSize : g_intraFilterFlags[dirMode] & tuSize)
{
// generate filtered intra prediction samples
-
- if (cu.m_slice->m_sps->bUseStrongIntraSmoothing && tuSize == 32)
- {
- const int threshold = 1 << (X265_DEPTH - 5);
-
- pixel topMiddle = refBuf[32], leftMiddle = refBuf[tuSize2 + 32];
-
- if (abs(topLeft + topLast - (topMiddle << 1)) < threshold &&
- abs(topLeft + leftLast - (leftMiddle << 1)) < threshold)
- {
- // "strong" bilinear interpolation
- const int shift = 5 + 1;
- int init = (topLeft << shift) + tuSize;
- int deltaL, deltaR;
-
- deltaL = leftLast - topLeft; deltaR = topLast - topLeft;
-
- fltBuf[0] = topLeft;
- for (int i = 1; i < tuSize2; i++)
- {
- fltBuf[i + tuSize2] = (pixel)((init + deltaL * i) >> shift); // Left Filtering
- fltBuf[i] = (pixel)((init + deltaR * i) >> shift); // Above Filtering
- }
- fltBuf[tuSize2] = topLast;
- fltBuf[tuSize2 + tuSize2] = leftLast;
- return;
- }
- }
-
- primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf);
+ int bUseStrongIntraSmoothing = cu.m_slice->m_sps->bUseStrongIntraSmoothing;
+ primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(refBuf, fltBuf, bUseStrongIntraSmoothing);
}
}
@@ -652,7 +621,7 @@
fillReferenceSamples(adiOrigin, picStride, intraNeighbors, intraNeighbourBuf[0]);
if (m_csp == X265_CSP_I444)
- primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1]);
+ primitives.cu[intraNeighbors.log2TrSize - 2].intra_filter(intraNeighbourBuf[0], intraNeighbourBuf[1], 0);
}
void Predict::initIntraNeighbors(const CUData& cu, uint32_t absPartIdx, uint32_t tuDepth, bool isLuma, IntraNeighbors *intraNeighbors)
diff -r a7c2f80c18af -r 50a37352461d source/common/primitives.h
--- a/source/common/primitives.h Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/primitives.h Tue Nov 21 03:16:30 2017 +0800
@@ -133,7 +133,7 @@
typedef void (*intra_pred_t)(pixel* dst, intptr_t dstStride, const pixel *srcPix, int dirMode, int bFilter);
typedef void (*intra_allangs_t)(pixel *dst, pixel *refPix, pixel *filtPix, int bLuma);
-typedef void (*intra_filter_t)(const pixel* references, pixel* filtered);
+typedef void (*intra_filter_t)(const pixel* references, pixel* filtered, int bUseStrongFilter);
typedef void (*cpy2Dto1D_shl_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
typedef void (*cpy2Dto1D_shr_t)(int16_t* dst, const int16_t* src, intptr_t srcStride, int shift);
diff -r a7c2f80c18af -r 50a37352461d source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/x86/const-a.asm Tue Nov 21 03:16:30 2017 +0800
@@ -112,6 +112,10 @@
const multi_2Row, times 1 dw 1, 2, 3, 4, 1, 2, 3, 4
const multiH, times 1 dw 9, 10, 11, 12, 13, 14, 15, 16
const multiH3, times 1 dw 25, 26, 27, 28, 29, 30, 31, 32
+const multiH3_1, times 1 dw 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48
+const multiH3_2, times 1 dw 41, 42, 43, 44, 45, 46, 47, 48
+const multiH4, times 1 dw 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64
+const multiH4_1, times 1 dw 57, 58, 59, 60, 61, 62, 63, 64
const multiL, times 1 dw 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16
const multiH2, times 1 dw 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32
const pw_planar16_mul, times 1 dw 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
diff -r a7c2f80c18af -r 50a37352461d source/common/x86/intrapred.h
--- a/source/common/x86/intrapred.h Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/x86/intrapred.h Tue Nov 21 03:16:30 2017 +0800
@@ -67,7 +67,7 @@
#define DECL_ALL(cpu) \
FUNCDEF_TU(void, all_angs_pred, cpu, pixel *dest, pixel *refPix, pixel *filtPix, int bLuma); \
- FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered); \
+ FUNCDEF_TU(void, intra_filter, cpu, const pixel *samples, pixel *filtered, int bUseStrongFilter); \
DECL_ANGS(4, cpu); \
DECL_ANGS(8, cpu); \
DECL_ANGS(16, cpu); \
diff -r a7c2f80c18af -r 50a37352461d source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Mon Nov 20 14:31:22 2017 +0530
+++ b/source/common/x86/intrapred8.asm Tue Nov 21 03:16:30 2017 +0800
@@ -543,6 +543,10 @@
cextern multiH
cextern multiH2
cextern multiH3
+cextern multiH3_1
+cextern multiH3_2
+cextern multiH4
+cextern multiH4_1
cextern multi_2Row
cextern trans8_shuf
cextern pw_planar16_mul
@@ -22313,11 +22317,144 @@
mov [r1 + 64], r3b ; LeftLast
RET
-INIT_XMM sse4
-cglobal intra_filter_32x32, 2,4,6
- mov r2b, byte [r0 + 64] ; topLast
- mov r3b, byte [r0 + 128] ; LeftLast
-
+; this function also implements the strong intra filter
+INIT_XMM sse4
+cglobal intra_filter_32x32, 3,8,7
+ xor r3d, r3d ; R9
+ xor r4d, r4d ; R10
+ mov r3b, byte [r0 + 64] ; topLast
+ mov r4b, byte [r0 + 128] ; LeftLast
+
+ ; strong intra filter is disabled
+ cmp r2m, byte 0
+ jz .normal_filter32
+ ; decide whether to apply the strong intra filter
+ xor r5d, r5d ; R11
+ xor r6d, r6d ; RAX
+ xor r7d, r7d ; RDI
+ mov r5b, byte [r0] ; topLeft
+ mov r6b, byte [r0 + 96] ; leftMiddle
+ mov r7b, byte [r0 + 32] ; topMiddle
+
+ ; threshold = 8
+ mov r2d, r3d ; R8
+ add r2d, r5d ; (topLast + topLeft)
+ shl r7d, 1 ; 2 * topMiddle
+ sub r2d, r7d
+ mov r7d, r2d ; backup r2d
+ sar r7d, 31
+ xor r2d, r7d
+ sub r2d, r7d ; abs(r2d)
+ cmp r2d, 8
+ ; bilinearAbove is false
+ jns .normal_filter32
+
+ mov r2d, r5d
+ add r2d, r4d
+ shl r6d, 1
+ sub r2d, r6d
+ mov r6d, r2d
+ sar r6d, 31
+ xor r2d, r6d
+ sub r2d, r6d
+ cmp r2d, 8
+ ; bilinearLeft is false
+ jns .normal_filter32
+
+ ; do strong intra filter shift = 6
+ mov r2d, r5d
+ shl r2d, 6
+ add r2d, 32 ; init
+ mov r6d, r4d
+ sub r6w, r5w ; deltaL size is word
+ mov r7d, r3d
+ sub r7w, r5w ; deltaR size is word
+ movd xmm0, r2d
+ vpbroadcastw xmm0, xmm0
+ mova xmm4, xmm0
+
+ movd xmm1, r7d
+ vpbroadcastw xmm1, xmm1
+ pmullw xmm2, xmm1, [multiL] ; [ 1 2 3 4 5 6 7 8]
+ pmullw xmm3, xmm1, [multiH] ; [ 9 10 11 12 13 14 15 16]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm3
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 1], xmm5
+
+ pmullw xmm2, xmm1, [multiH2]; [17 18 19 20 21 22 23 24]
+ pmullw xmm3, xmm1, [multiH3]; [25 26 27 28 29 30 31 32]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm3
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 17], xmm5
+
+ pmullw xmm2, xmm1, [multiH3_1] ; [33 - 40]
+ pmullw xmm3, xmm1, [multiH3_2] ; [41 - 48]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm3
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 33], xmm5
+
+ pmullw xmm2, xmm1, [multiH4] ; [49 - 56]
+ pmullw xmm1, [multiH4_1] ; [57 - 64]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm1
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 49], xmm5
+
+ movd xmm1, r6d
+ vpbroadcastw xmm1, xmm1
+ pmullw xmm2, xmm1, [multiL] ; [ 1 2 3 4 5 6 7 8]
+ pmullw xmm3, xmm1, [multiH] ; [ 9 10 11 12 13 14 15 16]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm3
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 65], xmm5
+
+ pmullw xmm2, xmm1, [multiH2]; [17 18 19 20 21 22 23 24]
+ pmullw xmm3, xmm1, [multiH3]; [25 26 27 28 29 30 31 32]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm3
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 81], xmm5
+
+ pmullw xmm2, xmm1, [multiH3_1] ; [33 - 40]
+ pmullw xmm3, xmm1, [multiH3_2] ; [41 - 48]
+ paddw xmm5, xmm0, xmm2
+ paddw xmm6, xmm4, xmm3
+ psraw xmm5, 6
+ psraw xmm6, 6
+ packuswb xmm5, xmm6
+ movu [r1 + 97], xmm5
+
+ pmullw xmm2, xmm1, [multiH4] ; [49 - 56]
+ pmullw xmm1, [multiH4_1] ; [57 - 64]
+ paddw xmm0, xmm2
+ paddw xmm4, xmm1
+ psraw xmm0, 6
+ psraw xmm4, 6
+ packuswb xmm0, xmm4
+ movu [r1 + 113], xmm0
+
+ mov [r1], r5b ; topLeft
+ mov [r1 + 64], r3b ; topLast
+ mov [r1 + 128], r4b ; LeftLast
+ RET
+
+.normal_filter32
; filtering top
; 0 to 15
pmovzxbw m0, [r0 + 0]
@@ -22514,8 +22651,8 @@
packuswb m1, m5
movu [r1 + 112], m1
- mov [r1 + 64], r2b ; topLast
- mov [r1 + 128], r3b ; LeftLast
+ mov [r1 + 64], r3b ; topLast
+ mov [r1 + 128], r4b ; LeftLast
RET
INIT_YMM avx2
diff -r a7c2f80c18af -r 50a37352461d source/encoder/slicetype.cpp
--- a/source/encoder/slicetype.cpp Mon Nov 20 14:31:22 2017 +0530
+++ b/source/encoder/slicetype.cpp Tue Nov 21 03:16:30 2017 +0800
@@ -349,7 +349,7 @@
for (int i = 1; i <= 2 * cuSize; i++)
samples[cuSize2 + i] = pixCur[i * fenc.lumaStride]; /* left */
- primitives.cu[sizeIdx].intra_filter(samples, filtered);
+ primitives.cu[sizeIdx].intra_filter(samples, filtered, 0);
int cost, icost = me.COST_MAX;
uint32_t ilowmode = 0;
More information about the x265-devel
mailing list