[x265] [PATCH 1 of 2] asm: separated deblocking filter into horizontal & vertical primitives for asm
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Fri Oct 9 14:47:07 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1444121396 -19800
# Tue Oct 06 14:19:56 2015 +0530
# Node ID 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
# Parent f8ad1ff7074aab85a6cf376886014c88f46b7275
asm: separated deblocking filter into horizontal & vertical primitives for asm
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/deblock.cpp
--- a/source/common/deblock.cpp Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/deblock.cpp Tue Oct 06 14:19:56 2015 +0530
@@ -280,31 +280,6 @@
* \param maskQ indicator to enable filtering on partQ
* \param maskP1 decision weak filter/no filter for partP
* \param maskQ1 decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
- int32_t tc2 = 2 * tc;
- int32_t tcP = (tc2 & maskP);
- int32_t tcQ = (tc2 & maskQ);
- for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
- {
- int16_t m4 = (int16_t)src[0];
- int16_t m3 = (int16_t)src[-offset];
- int16_t m5 = (int16_t)src[offset];
- int16_t m2 = (int16_t)src[-offset * 2];
- int16_t m6 = (int16_t)src[offset * 2];
- int16_t m1 = (int16_t)src[-offset * 3];
- int16_t m7 = (int16_t)src[offset * 3];
- int16_t m0 = (int16_t)src[-offset * 4];
- src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
- src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
- src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
- src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
- src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
- src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
- }
-}
-
-/* Weak filter */
static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
int32_t maskP1, int32_t maskQ1)
{
@@ -446,7 +421,12 @@
useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
if (sw)
- pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+ {
+ int32_t tc2 = 2 * tc;
+ int32_t tcP = (tc2 & maskP);
+ int32_t tcQ = (tc2 & maskQ);
+ primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
+ }
else
{
int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/loopfilter.cpp Tue Oct 06 14:19:56 2015 +0530
@@ -137,6 +137,27 @@
rec += stride;
}
}
+
+void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ) // C version of the strong luma filter: rewrites src[-3*offset .. 2*offset] at UNIT_SIZE positions
+{ // tcP bounds the change of the three samples at negative offsets, tcQ bounds src[0] and the two at positive offsets
+ for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep) // src advances by srcStep after every iteration
+ {
+ int16_t m4 = (int16_t)src[0]; // m0..m7 snapshot the eight samples src[-4*offset] .. src[3*offset] before any store
+ int16_t m3 = (int16_t)src[-offset];
+ int16_t m5 = (int16_t)src[offset];
+ int16_t m2 = (int16_t)src[-offset * 2];
+ int16_t m6 = (int16_t)src[offset * 2];
+ int16_t m1 = (int16_t)src[-offset * 3];
+ int16_t m7 = (int16_t)src[offset * 3]; // read only; never written back
+ int16_t m0 = (int16_t)src[-offset * 4]; // read only; never written back
+ src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1); // weights (2,3,1,1,1)/8 with rounding; x265_clip3 presumably clamps the delta to [-tcP, tcP]
+ src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2); // rounded mean of m1..m4
+ src[-offset] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3); // weights (1,2,2,2,1)/8 over m1..m5
+ src[0] = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4); // weights (1,2,2,2,1)/8 over m2..m6; tcQ is the bound from here on
+ src[offset] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5); // rounded mean of m3..m6
+ src[offset * 2] = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6); // weights (1,1,1,3,2)/8 over m3..m7
+ }
+}
}
namespace X265_NS {
@@ -151,5 +172,9 @@
p.saoCuOrgE3[1] = processSaoCUE3;
p.saoCuOrgB0 = processSaoCUB0;
p.sign = calSign;
+
+ // The C code is the same for EDGE_VER and EDGE_HOR; only the asm code differs
+ p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+ p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
}
}
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/primitives.h
--- a/source/common/primitives.h Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/primitives.h Tue Oct 06 14:19:56 2015 +0530
@@ -196,6 +196,8 @@
typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
/* Function pointers to optimized encoder primitives. Each pointer can reference
* either an assembly routine, a SIMD intrinsic primitive, or a C function */
struct EncoderPrimitives
@@ -330,6 +332,7 @@
costCoeffRemain_t costCoeffRemain;
costC1C2Flag_t costC1C2Flag;
+ pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
/* There is one set of chroma primitives per color space. An encoder will
* have just a single color space and thus it will only ever use one entry
More information about the x265-devel
mailing list