[x265] [PATCH] asm: x64 version findPosLast, 73815c -> 25890c (2.85x)

Min Chen chenm003 at 163.com
Thu Mar 19 22:37:44 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426801058 25200
# Node ID a5cecdf6b94c62c43c935e41d76c91ebfb5a00be
# Parent  82437ab8a38c84a3233d82a8a6906f432aa532d3
asm: x64 version findPosLast, 73815c ->  25890c (2.85x)
---
 source/common/x86/asm-primitives.cpp |    3 ++
 source/common/x86/pixel-util.h       |    2 +
 source/common/x86/pixel-util8.asm    |   66 ++++++++++++++++++++++++++++++++++
 3 files changed, 71 insertions(+), 0 deletions(-)

diff -r 82437ab8a38c -r a5cecdf6b94c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Mar 19 09:56:23 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp	Thu Mar 19 14:37:38 2015 -0700
@@ -1725,6 +1725,9 @@
         p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
+
+        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
+            p.findPosLast = x265_findPosLast_x64;
     }
 #endif
 }
diff -r 82437ab8a38c -r a5cecdf6b94c source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Thu Mar 19 09:56:23 2015 -0500
+++ b/source/common/x86/pixel-util.h	Thu Mar 19 14:37:38 2015 -0700
@@ -77,6 +77,8 @@
 void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
 void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
 
+int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
     void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
     void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  scr1, intptr_t srcStride0, intptr_t srcStride1);
diff -r 82437ab8a38c -r a5cecdf6b94c source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Mar 19 09:56:23 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Thu Mar 19 14:37:38 2015 -0700
@@ -5379,3 +5379,69 @@
     RET
 %endmacro
 
+;int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+;{
+;    int scanPosLast = 0;
+;    do
+;    {
+;        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+;
+;        const uint32_t posLast = scan[scanPosLast++];
+;
+;        const int curCoeff = coeff[posLast];
+;        const uint32_t isNZCoeff = (curCoeff != 0);
+;        numSig -= isNZCoeff;
+;
+;        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
+;        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
+;        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
+;    }
+;    while (numSig > 0);
+;    return scanPosLast - 1;
+;}
+
+INIT_CPUFLAGS
+cglobal findPosLast_x64, 5,12              ; r0=scan, r1=coeff, r2=coeffSign, r3=coeffFlag, r4=coeffNum
+    mov         r5d, r5m                    ; r5d = numSig: non-zero coefficients still to be found
+    xor         r11d, r11d                  ; r11 = cgIdx, index of the 16-coefficient group being scanned
+    xor         r7d, r7d                    ; r7 = non-zero flag; only r7b is rewritten below, so r7 stays 0 or 1
+    ; NOTE(review): r8-r11 are used, so this presumably may only be assembled for x86-64 - confirm the file guards it
+.loop:
+    xor         r8d, r8d                    ; r8  = sign bits of this group  (-> coeffSign[cgIdx])
+    xor         r9d, r9d                    ; r9  = non-zero bitmap of group (-> coeffFlag[cgIdx])
+    xor         r10d, r10d                  ; r10 = non-zero count of group  (-> coeffNum[cgIdx])
+    ; unrolled at assembly time: x is the position inside the current group of 16
+%assign x 0
+%rep 16
+    movzx       r6d, word [r0 + x * 2]      ; r6 = scan[x], the coefficient position to visit
+    movsx       r6d, word [r1 + r6 * 2]     ; r6 = coeff[scan[x]], sign-extended from 16 bits
+    test        r6d, r6d
+    setnz       r7b                         ; r7 = (coeff != 0)
+    shr         r6d, 31                     ; r6 = 1 if the coefficient is negative, else 0
+    shlx        r6d, r6d, r10d              ; place sign at bit index = non-zeros seen so far in this group
+    or          r8d, r6d
+    lea         r9, [r9 * 2 + r7]           ; shift bitmap left and append the non-zero flag as new LSB
+    add         r10d, r7d
+%assign x x+1
+%endrep
+
+    ; store this group's sign bits, non-zero bitmap and non-zero count
+    mov         [r2 + r11 * 2], r8w
+    mov         [r3 + r11 * 2], r9w
+    mov         [r4 + r11], r10b
+    inc         r11d                        ; r11 = number of groups processed so far
+
+    add         r0, 16 * 2                  ; advance scan by 16 uint16_t entries
+    sub         r5d, r10d                   ; numSig -= non-zeros found in this group
+    jnz        .loop                        ; keep going until every non-zero coefficient has been seen
+
+    ; last group was scanned in full: drop the flag bits that follow its final non-zero coefficient
+    tzcnt       r6d, r9d                    ; r6 = count of zero flags after the last non-zero one
+    shrx        r9d, r9d, r6d
+    mov         [r3 + (r11 - 1) * 2], r9w   ; overwrite coeffFlag[] entry of the last group
+
+    ; return value = 16 * (groups processed) - (trailing zero flags) - 1
+    shl         r11d, 4
+    sub         r11d, r6d
+    lea         eax, [r11d - 1]
+    RET



More information about the x265-devel mailing list