[x265] [PATCH 1 of 2] asm: separated deblocking filter into horizontal & vertical primitives for asm

dnyaneshwar at multicorewareinc.com dnyaneshwar at multicorewareinc.com
Fri Oct 9 14:47:07 CEST 2015


# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1444121396 -19800
#      Tue Oct 06 14:19:56 2015 +0530
# Node ID 38e4b94377fa6ffe57472c49ecff6c909ed4f6dc
# Parent  f8ad1ff7074aab85a6cf376886014c88f46b7275
asm: separated deblocking filter into horizontal & vertical primitives for asm

diff -r f8ad1ff7074a -r 38e4b94377fa source/common/deblock.cpp
--- a/source/common/deblock.cpp	Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/deblock.cpp	Tue Oct 06 14:19:56 2015 +0530
@@ -280,31 +280,6 @@
  * \param maskQ   indicator to enable filtering on partQ
  * \param maskP1  decision weak filter/no filter for partP
  * \param maskQ1  decision weak filter/no filter for partQ */
-static inline void pelFilterLumaStrong(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ)
-{
-    int32_t tc2 = 2 * tc;
-    int32_t tcP = (tc2 & maskP);
-    int32_t tcQ = (tc2 & maskQ);
-    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
-    {
-        int16_t m4  = (int16_t)src[0];
-        int16_t m3  = (int16_t)src[-offset];
-        int16_t m5  = (int16_t)src[offset];
-        int16_t m2  = (int16_t)src[-offset * 2];
-        int16_t m6  = (int16_t)src[offset * 2];
-        int16_t m1  = (int16_t)src[-offset * 3];
-        int16_t m7  = (int16_t)src[offset * 3];
-        int16_t m0  = (int16_t)src[-offset * 4];
-        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
-        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
-        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
-        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
-        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
-        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
-    }
-}
-
-/* Weak filter */
 static inline void pelFilterLuma(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tc, int32_t maskP, int32_t maskQ,
                                  int32_t maskP1, int32_t maskQ1)
 {
@@ -446,7 +421,12 @@
                    useStrongFiltering(offset, beta, tc, src + unitOffset + srcStep * 3));
 
         if (sw)
-            pelFilterLumaStrong(src + unitOffset, srcStep, offset, tc, maskP, maskQ);
+        {
+            int32_t tc2 = 2 * tc;
+            int32_t tcP = (tc2 & maskP);
+            int32_t tcQ = (tc2 & maskQ);
+            primitives.pelFilterLumaStrong[dir](src + unitOffset, srcStep, offset, tcP, tcQ);
+        }
         else
         {
             int32_t sideThreshold = (beta + (beta >> 1)) >> 3;
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/loopfilter.cpp
--- a/source/common/loopfilter.cpp	Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/loopfilter.cpp	Tue Oct 06 14:19:56 2015 +0530
@@ -137,6 +137,27 @@
         rec += stride;
     }
 }
+
+void pelFilterLumaStrong_c(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ)
+{
+    for (int32_t i = 0; i < UNIT_SIZE; i++, src += srcStep)
+    {
+        int16_t m4  = (int16_t)src[0];
+        int16_t m3  = (int16_t)src[-offset];
+        int16_t m5  = (int16_t)src[offset];
+        int16_t m2  = (int16_t)src[-offset * 2];
+        int16_t m6  = (int16_t)src[offset * 2];
+        int16_t m1  = (int16_t)src[-offset * 3];
+        int16_t m7  = (int16_t)src[offset * 3];
+        int16_t m0  = (int16_t)src[-offset * 4];
+        src[-offset * 3] = (pixel)(x265_clip3(-tcP, tcP, ((2 * m0 + 3 * m1 + m2 + m3 + m4 + 4) >> 3) - m1) + m1);
+        src[-offset * 2] = (pixel)(x265_clip3(-tcP, tcP, ((m1 + m2 + m3 + m4 + 2) >> 2) - m2) + m2);
+        src[-offset]     = (pixel)(x265_clip3(-tcP, tcP, ((m1 + 2 * m2 + 2 * m3 + 2 * m4 + m5 + 4) >> 3) - m3) + m3);
+        src[0]           = (pixel)(x265_clip3(-tcQ, tcQ, ((m2 + 2 * m3 + 2 * m4 + 2 * m5 + m6 + 4) >> 3) - m4) + m4);
+        src[offset]      = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + m6 + 2) >> 2) - m5) + m5);
+        src[offset * 2]  = (pixel)(x265_clip3(-tcQ, tcQ, ((m3 + m4 + m5 + 3 * m6 + 2 * m7 + 4) >> 3) - m6) + m6);
+    }
+}
 }
 
 namespace X265_NS {
@@ -151,5 +172,9 @@
     p.saoCuOrgE3[1] = processSaoCUE3;
     p.saoCuOrgB0 = processSaoCUB0;
     p.sign = calSign;
+
+    // The C code is the same for EDGE_VER and EDGE_HOR; only the asm code differs
+    p.pelFilterLumaStrong[0] = pelFilterLumaStrong_c;
+    p.pelFilterLumaStrong[1] = pelFilterLumaStrong_c;
 }
 }
diff -r f8ad1ff7074a -r 38e4b94377fa source/common/primitives.h
--- a/source/common/primitives.h	Thu Oct 08 15:27:34 2015 -0500
+++ b/source/common/primitives.h	Tue Oct 06 14:19:56 2015 +0530
@@ -196,6 +196,8 @@
 typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero, int idx);
 typedef uint32_t (*costC1C2Flag_t)(uint16_t *absCoeff, intptr_t numC1Flag, uint8_t *baseCtxMod, intptr_t ctxOffset);
 
+typedef void (*pelFilterLumaStrong_t)(pixel* src, intptr_t srcStep, intptr_t offset, int32_t tcP, int32_t tcQ);
+
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
 struct EncoderPrimitives
@@ -330,6 +332,7 @@
     costCoeffRemain_t     costCoeffRemain;
     costC1C2Flag_t        costC1C2Flag;
 
+    pelFilterLumaStrong_t pelFilterLumaStrong[2]; // EDGE_VER = 0, EDGE_HOR = 1
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry


More information about the x265-devel mailing list