[x265] [PATCH 1 of 6] asm: SSE4 of costCoeffRemain in codeCoeffNxN()

Min Chen chenm003 at 163.com
Tue Jun 9 20:01:41 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1433872858 25200
# Node ID d41bc83c21b9f1eba483ec4b5e298a33b6ee1c1e
# Parent  b252468dde7ffca57da27575388d95ce538945d2
asm: SSE4 of costCoeffRemain in codeCoeffNxN()
---
 source/common/dct.cpp                |   49 +++++++++++++++
 source/common/primitives.h           |    2 +
 source/common/x86/asm-primitives.cpp |    2 ++
 source/common/x86/pixel-util.h       |    1 +
 source/common/x86/pixel-util8.asm    |  111 ++++++++++++++++++++++++++++++++++
 source/encoder/entropy.cpp           |   40 +------------
 6 files changed, 167 insertions(+), 38 deletions(-)

diff -r b252468dde7f -r d41bc83c21b9 source/common/dct.cpp
--- a/source/common/dct.cpp	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/dct.cpp	Tue Jun 09 11:00:58 2015 -0700
@@ -30,6 +30,7 @@
 #include "common.h"
 #include "primitives.h"
 #include "contexts.h"   // costCoeffNxN_c
+#include "threading.h"  // CLZ
 
 using namespace x265;
 
@@ -873,6 +874,53 @@
     return (sum & 0xFFFFFF);
 }
 
+uint32_t costCoeffRemain_c(uint16_t *absCoeff, int numNonZero) // C reference: returns the summed code-length cost of the remaining levels of absCoeff[0..numNonZero-1]
+{
+    uint32_t goRiceParam = 0; // Rice parameter; adapted after each coded coefficient and checked to stay <= 4
+    int firstCoeff2 = 1; // stays 1 until a coefficient >= 2 has been visited
+    uint32_t baseLevelN = 0x5555AAAA; // 2 bits per coefficient, consumed low bits first: eight fields of 2, then eight fields of 1
+
+    uint32_t sum = 0;
+    int idx = 0;
+    do
+    {
+        int baseLevel = (baseLevelN & 3) | firstCoeff2; // idx 0..7: 2 + firstCoeff2; idx 8..15: 1
+        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
+        baseLevelN >>= 2;
+        int codeNumber = absCoeff[idx] - baseLevel; // negative: this coefficient has no remaining level and adds nothing to sum
+
+        if (codeNumber >= 0)
+        {
+            // length of writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam); only the length is accumulated here
+            uint32_t length = 0;
+
+            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION; // < 0: short code, total length is (value >> goRiceParam) + 1 + goRiceParam
+            if (codeNumber >= 0)
+            {
+                {
+                    unsigned long cidx;
+                    CLZ(cidx, codeNumber + 1); // presumably the index of the highest set bit (the SSE4 version uses bsr); CLZ comes from threading.h
+                    length = cidx;
+                }
+                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
+
+                codeNumber = (length + length); // long code: 2 * length is added on top of the fixed part below
+            }
+            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber); // fixed part plus the (possibly negative) adjustment computed above
+
+            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2); // +1 while goRiceParam < 4; unchanged once it is 4
+            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
+        }
+        if (absCoeff[idx] >= 2)
+            firstCoeff2 = 0; // later coefficients among the first eight use baseLevel 2 instead of 3
+        idx++;
+    }
+    while(idx < numNonZero); // do/while: the body runs once even when numNonZero <= 0, so callers presumably pass numNonZero >= 1
+
+    return sum;
+}
+
 }  // closing - anonymous file-static namespace
 
 namespace x265 {
@@ -908,5 +956,6 @@
     p.scanPosLast = scanPosLast_c;
     p.findPosFirstLast = findPosFirstLast_c;
     p.costCoeffNxN = costCoeffNxN_c;
+    p.costCoeffRemain = costCoeffRemain_c;
 }
 }
diff -r b252468dde7f -r d41bc83c21b9 source/common/primitives.h
--- a/source/common/primitives.h	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/primitives.h	Tue Jun 09 11:00:58 2015 -0700
@@ -187,6 +187,7 @@
 typedef uint32_t (*findPosFirstLast_t)(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 typedef uint32_t (*costCoeffNxN_t)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+typedef uint32_t (*costCoeffRemain_t)(uint16_t *absCoeff, int numNonZero); // summed remaining-level code length of absCoeff[0..numNonZero-1]; see costCoeffRemain_c in dct.cpp
 
 /* Function pointers to optimized encoder primitives. Each pointer can reference
  * either an assembly routine, a SIMD intrinsic primitive, or a C function */
@@ -313,6 +314,7 @@
     findPosFirstLast_t    findPosFirstLast;
 
     costCoeffNxN_t        costCoeffNxN;
+    costCoeffRemain_t     costCoeffRemain;
 
     /* There is one set of chroma primitives per color space. An encoder will
      * have just a single color space and thus it will only ever use one entry
diff -r b252468dde7f -r d41bc83c21b9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Tue Jun 09 11:00:58 2015 -0700
@@ -2113,6 +2113,8 @@
         // TODO: it is passed smoke test, but we need testbench, so temporary disable
         //p.costCoeffNxN = x265_costCoeffNxN_sse4;
 #endif
+        // TODO: this passed the smoke test, but it still needs a testbench; it is enabled here regardless
+        p.costCoeffRemain = x265_costCoeffRemain_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r b252468dde7f -r d41bc83c21b9 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/pixel-util.h	Tue Jun 09 11:00:58 2015 -0700
@@ -83,6 +83,7 @@
 uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 uint32_t x265_costCoeffNxN_sse4(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
+uint32_t x265_costCoeffRemain_sse4(uint16_t *absCoeff, int numNonZero); // SSE4 assembly version of costCoeffRemain_c (pixel-util8.asm)
 
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r b252468dde7f -r d41bc83c21b9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/common/x86/pixel-util8.asm	Tue Jun 09 11:00:58 2015 -0700
@@ -6538,3 +6538,114 @@
     and         eax, 0xFFFFFF
     RET
 %endif ; ARCH_X86_64
+
+
+;uint32_t goRiceParam = 0;
+;int firstCoeff2 = 1;
+;uint32_t baseLevelN = 0x5555AAAA; // 2-bits encode format baseLevel
+;idx = 0;
+;do
+;{
+;    int baseLevel = (baseLevelN & 3) | firstCoeff2;
+;    baseLevelN >>= 2;
+;    int codeNumber = absCoeff[idx] - baseLevel;
+;    if (codeNumber >= 0)
+;    {
+;        uint32_t length = 0;
+;        codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
+;        if (codeNumber >= 0)
+;        {
+;            {
+;                unsigned long cidx;
+;                CLZ(cidx, codeNumber + 1);
+;                length = cidx;
+;            }
+;            codeNumber = (length + length);
+;        }
+;        sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
+;        if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+;            goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
+;    }
+;    if (absCoeff[idx] >= 2)
+;        firstCoeff2 = 0;
+;    idx++;
+;}
+;while(idx < numNonZero);
+
+; uint32_t costCoeffRemain(uint16_t *absCoeff, int numNonZero)
+INIT_XMM sse4
+cglobal costCoeffRemain, 0,7,1          ; x86inc: no arguments auto-loaded, 7 GPRs, 1 XMM register
+    ; t3 is meant to land in RCX on every ABI, presumably because t3b is the variable shift count below - confirm against x86inc
+    ; RAX always in R6 and free
+  %if WIN64
+    DECLARE_REG_TMP 3,1,2,0
+    mov         t0, r0                  ; t0 = absCoeff
+  %elif ARCH_X86_64
+    ; *nix x64 didn't do anything
+    DECLARE_REG_TMP 0,1,2,3
+  %else ; X86_32
+    DECLARE_REG_TMP 6,3,2,1
+    mov         t0, r0m                 ; t0 = absCoeff, read from its argument slot
+  %endif
+
+    mova        m0, [t0]                ; aligned load of absCoeff[0..7]; assumes a 16-byte aligned buffer with 16 readable entries
+    packsswb    m0, [t0 + mmsize]       ; all 16 coefficients as signed-saturated bytes; NOTE(review): words > 0x7FFF saturate to -128
+    pcmpgtb     m0, [pb_1]              ; 0xFF in every byte whose coefficient is > 1
+    pmovmskb    r2d, m0                 ; one mask bit per coefficient
+    bsf         r2d, r2d                ; index of the first coefficient >= 2; NOTE(review): undefined per the ISA when the mask is 0
+    lea         r2d, [r2 * 2 + 1]       ; bit position just above that coefficient's 2-bit field's low bit
+    xor         r4d, r4d
+    bts         r4d, r2d
+    dec         r4d                     ; r4d = (1 << (2 * index + 1)) - 1
+    and         r4d, 0x55555555         ; low bit of every 2-bit field up to and including index: firstCoeff2 == 1 there
+    or          r4d, 0x5555AAAA         ; merge with the base pattern, so each field is (baseLevelN & 3) | firstCoeff2
+
+    xor         t3d, t3d                ; goRiceParam = 0
+    xor         r5d, r5d                ; sum = 0
+
+    ; register mapping
+    ; r4d - baseLevelN
+    ; r2  - tmp
+    ; t3  - goRiceParam
+    ; eax - tmp - absCoeff[idx]
+    ; r5  - sum
+
+.loop:
+    movzx       eax, word [t0]          ; eax = absCoeff[idx]
+    add         t0, 2                   ; advance to the next coefficient
+    mov         r2d, r4d
+    and         r2d, 3                  ; baseLevel for this coefficient (firstCoeff2 already folded in)
+    shr         r4d, 2                  ; baseLevelN >>= 2
+    sub         eax, r2d                ; codeNumber = absCoeff[idx] - baseLevel
+    jl         .next                    ; codeNumber < 0: nothing is added to sum
+
+    shr         eax, t3b                ; codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION
+
+    lea         r2d, [eax - 3 + 1]      ; CLZ(cidx, codeNumber + 1);
+    bsr         r2d, r2d                ; only meaningful when codeNumber >= 0; otherwise dropped by the cmovge below
+    add         r2d, r2d                ; codeNumber = (length + length)
+
+    sub         eax, 3                  ; subtract COEF_REMAIN_BIN_REDUCTION; sets the flags for cmovge
+    cmovge      eax, r2d                ; eax = (codeNumber >= 0) ? 2 * length : codeNumber
+
+    lea         eax, [3 + 1 + t3 + rax] ; sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber)
+    add         r5d, eax
+
+    ; if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
+    ;     goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
+    cmp         t3d, 4
+    setl        al                      ; al = (goRiceParam < 4)
+
+    mov         r2d, 3
+    shl         r2d, t3b                ; r2d = COEF_REMAIN_BIN_REDUCTION << goRiceParam
+    cmp         word [t0 - 2], r2w      ; absCoeff[idx] against the threshold (t0 was already advanced)
+    setg        r2b                     ; NOTE(review): signed compare; differs from the unsigned C test only if absCoeff[idx] > 0x7FFF
+    and         al, r2b
+    add         t3b, al                 ; goRiceParam += 1 only when it is below 4 and the threshold was exceeded
+
+.next:
+    dec   dword r1m                     ; counts the numNonZero argument down in place
+    jnz        .loop                    ; like the C do/while, the body runs before the count is tested
+
+    mov         eax, r5d                ; return sum
+    RET
diff -r b252468dde7f -r d41bc83c21b9 source/encoder/entropy.cpp
--- a/source/encoder/entropy.cpp	Tue Jun 09 10:16:44 2015 +0530
+++ b/source/encoder/entropy.cpp	Tue Jun 09 11:00:58 2015 -0700
@@ -1747,44 +1747,8 @@
 
                 if (!m_bitIf)
                 {
-                    uint32_t sum = 0;
-                    // FastRd path
-                    idx = 0;
-                    do
-                    {
-                        int baseLevel = (baseLevelN & 3) | firstCoeff2;
-                        X265_CHECK(baseLevel == ((idx < C1FLAG_NUMBER) ? (2 + firstCoeff2) : 1), "baseLevel check failurr\n");
-                        baseLevelN >>= 2;
-                        int codeNumber = absCoeff[idx] - baseLevel;
-
-                        if (codeNumber >= 0)
-                        {
-                            //writeCoefRemainExGolomb(absCoeff[idx] - baseLevel, goRiceParam);
-                            uint32_t length = 0;
-
-                            codeNumber = ((uint32_t)codeNumber >> goRiceParam) - COEF_REMAIN_BIN_REDUCTION;
-                            if (codeNumber >= 0)
-                            {
-                                {
-                                    unsigned long cidx;
-                                    CLZ(cidx, codeNumber + 1);
-                                    length = cidx;
-                                }
-                                X265_CHECK((codeNumber != 0) || (length == 0), "length check failure\n");
-
-                                codeNumber = (length + length);
-                            }
-                            sum += (COEF_REMAIN_BIN_REDUCTION + 1 + goRiceParam + codeNumber);
-
-                            if (absCoeff[idx] > (COEF_REMAIN_BIN_REDUCTION << goRiceParam))
-                                goRiceParam = (goRiceParam + 1) - (goRiceParam >> 2);
-                            X265_CHECK(goRiceParam <= 4, "goRiceParam check failure\n");
-                        }
-                        if (absCoeff[idx] >= 2)
-                            firstCoeff2 = 0;
-                        idx++;
-                    }
-                    while(idx < numNonZero);
+                    // Fast RD path
+                    uint32_t sum = primitives.costCoeffRemain(absCoeff, numNonZero);
                     m_fracBits += ((uint64_t)sum << 15);
                 }
                 else



More information about the x265-devel mailing list