[x265] [PATCH] asm: ssse3 version of findPosFirstLast, 365c -> 75c

Min Chen chenm003 at 163.com
Mon Apr 20 13:58:35 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1429531109 -28800
# Node ID e4ba0f96286048b8037201c0543675819f32ba0a
# Parent  bc9631ab9d36475c7d625fc9272ee0ddd7f5bfd7
asm: ssse3 version of findPosFirstLast, 365c -> 75c
---
 source/common/constants.cpp          |    2 +-
 source/common/dct.cpp                |   32 ++++++++++++++++
 source/common/primitives.h           |    2 +
 source/common/quant.cpp              |   15 +------
 source/common/x86/asm-primitives.cpp |    2 +
 source/common/x86/pixel-util.h       |    1 +
 source/common/x86/pixel-util8.asm    |   38 ++++++++++++++++++
 source/test/pixelharness.cpp         |   69 ++++++++++++++++++++++++++++++++++
 source/test/pixelharness.h           |    1 +
 9 files changed, 149 insertions(+), 13 deletions(-)

diff -r bc9631ab9d36 -r e4ba0f962860 source/common/constants.cpp
--- a/source/common/constants.cpp	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/constants.cpp	Mon Apr 20 19:58:29 2015 +0800
@@ -324,7 +324,7 @@
       4,  12, 20, 28,  5, 13, 21, 29,  6, 14, 22, 30,  7, 15, 23, 31, 36, 44, 52, 60, 37, 45, 53, 61, 38, 46, 54, 62, 39, 47, 55, 63 }
 };
 
-const uint16_t g_scan4x4[NUM_SCAN_TYPE][4 * 4] =
+ALIGN_VAR_16(const uint16_t, g_scan4x4[NUM_SCAN_TYPE][4 * 4]) =
 {
     { 0,  4,  1,  8,  5,  2, 12,  9,  6,  3, 13, 10,  7, 14, 11, 15 },
     { 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/dct.cpp
--- a/source/common/dct.cpp	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/dct.cpp	Mon Apr 20 19:58:29 2015 +0800
@@ -785,6 +785,37 @@
     return scanPosLast - 1;
 }
 
+uint32_t findPosFirstLast_c(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+{
+    int n;
+
+    for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        if (dstCoeff[idxY * trSize + idxX])
+            break;
+    }
+
+    X265_CHECK(n >= 0, "non-zero coeff scan failuare!\n");
+
+    uint32_t lastNZPosInCG = (uint32_t)n;
+
+    for (n = 0;; n++)
+    {
+        const uint32_t idx = scanTbl[n];
+        const uint32_t idxY = idx / MLS_CG_SIZE;
+        const uint32_t idxX = idx % MLS_CG_SIZE;
+        if (dstCoeff[idxY * trSize + idxX])
+            break;
+    }
+
+    uint32_t firstNZPosInCG = (uint32_t)n;
+
+    return ((lastNZPosInCG << 16) | firstNZPosInCG);
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace x265 {
@@ -818,5 +849,6 @@
     p.cu[BLOCK_32x32].copy_cnt = copy_count<32>;
 
     p.findPosLast = findPosLast_c;
+    p.findPosFirstLast = findPosFirstLast_c;
 }
 }
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/primitives.h
--- a/source/common/primitives.h	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/primitives.h	Mon Apr 20 19:58:29 2015 +0800
@@ -181,6 +181,7 @@
 typedef void (*cutree_propagate_cost) (int* dst, const uint16_t* propagateIn, const int32_t* intraCosts, const uint16_t* interCosts, const int32_t* invQscales, const double* fpsFactor, int len);
 
 typedef int (*findPosLast_t)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -293,6 +294,7 @@
 
 
     findPosLast_t         findPosLast;
+    findPosFirstLast_t    findPosFirstLast;
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/quant.cpp
--- a/source/common/quant.cpp	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/quant.cpp	Mon Apr 20 19:58:29 2015 +0800
@@ -1138,19 +1138,10 @@
 
             /* measure distance between first and last non-zero coef in this
              * coding group */
-            for (n = SCAN_SET_SIZE - 1; n >= 0; --n)
-                if (dstCoeff[codeParams.scan[n + subPos]])
-                    break;
+            const uint32_t posFirstLast = primitives.findPosFirstLast(&dstCoeff[codeParams.scan[subPos]], trSize, g_scan4x4[codeParams.scanType]);
+            int firstNZPosInCG = (uint16_t)posFirstLast;
+            int lastNZPosInCG = posFirstLast >> 16;
 
-            X265_CHECK(n >= 0, "non-zero coeff scan failuare!\n");
-
-            int lastNZPosInCG = n;
-
-            for (n = 0;; n++)
-                if (dstCoeff[codeParams.scan[n + subPos]])
-                    break;
-
-            int firstNZPosInCG = n;
 
             if (lastNZPosInCG - firstNZPosInCG >= SBH_THRESHOLD)
             {
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Mon Apr 20 19:58:29 2015 +0800
@@ -1015,6 +1015,7 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].p2s = x265_filterPixelToShort_4x2_ssse3;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].p2s = x265_filterPixelToShort_8x2_ssse3;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].p2s = x265_filterPixelToShort_8x6_ssse3;
+        p.findPosFirstLast = x265_findPosFirstLast_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
@@ -1498,6 +1499,7 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = x265_filterPixelToShort_32x32_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_ssse3;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_ssse3;
+        p.findPosFirstLast = x265_findPosFirstLast_ssse3;
     }
     if (cpuMask & X265_CPU_SSE4)
     {
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/x86/pixel-util.h	Mon Apr 20 19:58:29 2015 +0800
@@ -79,6 +79,7 @@
 void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
 
 int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
     void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
diff -r bc9631ab9d36 -r e4ba0f962860 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/common/x86/pixel-util8.asm	Mon Apr 20 19:58:29 2015 +0800
@@ -5659,3 +5659,41 @@
     lea         eax, [r11d - 1]
     RET
 %endif
+
+
+;-----------------------------------------------------------------------------
+; uint32_t (last << 16 | first) findPosFirstLast(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16])
+;-----------------------------------------------------------------------------
+INIT_XMM ssse3
+cglobal findPosFirstLast, 3,3,3
+    ; convert stride from int16_t elements to bytes
+    add         r1d, r1d
+
+    ; load the scan table and pack it to bytes
+    mova        m0, [r2]
+    packuswb    m0, [r2 + mmsize]
+
+    ; load 16 coeffs (4 rows of 4) and pack them to bytes
+    movh        m1, [r0]
+    movhps      m1, [r0 + r1]
+    movh        m2, [r0 + r1 * 2]
+    lea         r1, [r1 * 3]
+    movhps      m2, [r0 + r1]
+    packsswb    m1, m2
+
+    ; get non-zero mask
+    pxor        m2, m2
+    pcmpeqb     m1, m2
+
+    ; reorder into the order given by the scan table
+    pshufb      m1, m0
+
+    ; get First and Last pos
+    xor         eax, eax
+    pmovmskb    r0d, m1
+    not         r0w
+    bsr         r1w, r0w
+    bsf          ax, r0w
+    shl         r1d, 16
+    or          eax, r1d
+    RET
diff -r bc9631ab9d36 -r e4ba0f962860 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/test/pixelharness.cpp	Mon Apr 20 19:58:29 2015 +0800
@@ -1266,6 +1266,53 @@
     return true;
 }
 
+bool PixelHarness::check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt)
+{
+    ALIGN_VAR_16(coeff_t, ref_src[32 * 32 + ITERS * 2]);
+
+    for (int i = 0; i < 32 * 32; i++)
+    {
+        ref_src[i] = rand() & SHORT_MAX;
+    }
+
+    // extra test area filled with 0x1234
+    for (int i = 0; i < ITERS * 2; i++)
+    {
+        ref_src[32 * 32 + i] = 0x1234;
+    }
+
+    for (int i = 0; i < ITERS; i++)
+    {
+        int rand_scan_type = rand() % NUM_SCAN_TYPE;
+        int rand_scan_size = (rand() % NUM_SCAN_SIZE) + 2;
+        coeff_t *rand_src = ref_src + i;
+
+        const uint16_t* const scanTbl = g_scan4x4[rand_scan_type];
+
+        int j;
+        for (j = 0; j < SCAN_SET_SIZE; j++)
+        {
+            const uint32_t idxY = j / MLS_CG_SIZE;
+            const uint32_t idxX = j % MLS_CG_SIZE;
+            if (rand_src[idxY * rand_scan_size + idxX]) break;
+        }
+
+        // set one coeff when the whole coeff group is zero
+        if (j >= SCAN_SET_SIZE)
+            rand_src[0] = 0x0BAD;
+
+        uint32_t ref_scanPos = ref(rand_src, (1 << rand_scan_size), scanTbl);
+        uint32_t opt_scanPos = (int)checked(opt, rand_src, (1 << rand_scan_size), scanTbl);
+
+        if (ref_scanPos != opt_scanPos)
+            return false;
+
+        reportfail();
+    }
+
+    return true;
+}
+
 bool PixelHarness::testPU(int part, const EncoderPrimitives& ref, const EncoderPrimitives& opt)
 {
     if (opt.pu[part].satd)
@@ -1804,6 +1851,15 @@
         }
     }
 
+    if (opt.findPosFirstLast)
+    {
+        if (!check_findPosFirstLast(ref.findPosFirstLast, opt.findPosFirstLast))
+        {
+            printf("findPosFirstLast failed!\n");
+            return false;
+        }
+    }
+
     return true;
 }
 
@@ -2180,4 +2236,17 @@
         memset(coefBuf + 32 * 31, 1, 32 * sizeof(coeff_t));
         REPORT_SPEEDUP(opt.findPosLast, ref.findPosLast, g_scanOrder[SCAN_DIAG][NUM_SCAN_SIZE - 1], coefBuf, (uint16_t*)sbuf1, (uint16_t*)sbuf2, (uint8_t*)psbuf1, 32);
     }
+
+    if (opt.findPosFirstLast)
+    {
+        HEADER0("findPosFirstLast");
+        coeff_t coefBuf[32 * MLS_CG_SIZE];
+        memset(coefBuf, 0, sizeof(coefBuf));
+        // no CG may be all zeros!
+        coefBuf[3 + 0 * 32] = 0x0BAD;
+        coefBuf[3 + 1 * 32] = 0x0BAD;
+        coefBuf[3 + 2 * 32] = 0x0BAD;
+        coefBuf[3 + 3 * 32] = 0x0BAD;
+        REPORT_SPEEDUP(opt.findPosFirstLast, ref.findPosFirstLast, coefBuf, 32, g_scan4x4[SCAN_DIAG]);
+    }
 }
diff -r bc9631ab9d36 -r e4ba0f962860 source/test/pixelharness.h
--- a/source/test/pixelharness.h	Mon Apr 20 12:41:16 2015 +0800
+++ b/source/test/pixelharness.h	Mon Apr 20 19:58:29 2015 +0800
@@ -109,6 +109,7 @@
     bool check_psyCost_ss(pixelcmp_ss_t ref, pixelcmp_ss_t opt);
     bool check_calSign(sign_t ref, sign_t opt);
     bool check_findPosLast(findPosLast_t ref, findPosLast_t opt);
+    bool check_findPosFirstLast(findPosFirstLast_t ref, findPosFirstLast_t opt);
 
 public:
 



More information about the x265-devel mailing list