[x265] [PATCH] asm: x64 version findPosLast, 73815c -> 25890c (2.85x)

Steve Borho steve at borho.org
Fri Mar 20 17:06:56 CET 2015


On 03/19, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1426807873 25200
> # Node ID 1dbf31b0717604d1e5c34ed6de5aa51f42e32df8
> # Parent  82437ab8a38c84a3233d82a8a6906f432aa532d3
> asm: x64 version findPosLast, 73815c ->  25890c (2.85x)

queued for testing

> ---
>  source/common/x86/asm-primitives.cpp |    3 +
>  source/common/x86/pixel-util.h       |    2 +
>  source/common/x86/pixel-util8.asm    |   68 ++++++++++++++++++++++++++++++++++
>  3 files changed, 73 insertions(+), 0 deletions(-)
> 
> diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp	Thu Mar 19 09:56:23 2015 -0500
> +++ b/source/common/x86/asm-primitives.cpp	Thu Mar 19 16:31:13 2015 -0700
> @@ -1725,6 +1725,9 @@
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
>          p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
> +
> +        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
> +            p.findPosLast = x265_findPosLast_x64;
>      }
>  #endif
>  }
> diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h	Thu Mar 19 09:56:23 2015 -0500
> +++ b/source/common/x86/pixel-util.h	Thu Mar 19 16:31:13 2015 -0700
> @@ -77,6 +77,8 @@
>  void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
>  void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
>  
> +int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
> +
>  #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
>      void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t*  dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
>      void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t*  scr1, intptr_t srcStride0, intptr_t srcStride1);
> diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm	Thu Mar 19 09:56:23 2015 -0500
> +++ b/source/common/x86/pixel-util8.asm	Thu Mar 19 16:31:13 2015 -0700
> @@ -5379,3 +5379,71 @@
>      RET
>  %endmacro
>  
> +;int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
> +;{
> +;    int scanPosLast = 0;
> +;    do
> +;    {
> +;        const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
> +;
> +;        const uint32_t posLast = scan[scanPosLast++];
> +;
> +;        const int curCoeff = coeff[posLast];
> +;        const uint32_t isNZCoeff = (curCoeff != 0);
> +;        numSig -= isNZCoeff;
> +;
> +;        coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
> +;        coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
> +;        coeffNum[cgIdx] += (uint8_t)isNZCoeff;
> +;    }
> +;    while (numSig > 0);
> +;    return scanPosLast - 1;
> +;}
> +
> +%if ARCH_X86_64 == 1                        ; asm form of the C reference loop in the comment above, processing 16 scan positions (one group) per iteration
> +INIT_CPUFLAGS
> +cglobal findPosLast_x64, 5,12               ; r0=scan, r1=coeff, r2=coeffSign, r3=coeffFlag, r4=coeffNum (prototype argument order)
> +    mov         r5d, r5m                    ; r5d = numSig (6th argument, fetched by hand since only 5 are auto-loaded -- presumably; confirm against x86inc)
> +    xor         r11d, r11d                  ; r11 = cgIdx, index of the current 16-coefficient group
> +    xor         r7d, r7d                    ; r7 = isNZCoeff scratch; cleared once so that setnz r7b leaves a clean 0/1 in the full register
> +
> +.loop:
> +    xor         r8d, r8d                    ; r8 = coeffSign[cgIdx] accumulator (sign bits packed by nonzero index)
> +    xor         r9d, r9d                    ; r9 = coeffFlag[cgIdx] accumulator (one bit per scanned position)
> +    xor         r10d, r10d                  ; r10 = coeffNum[cgIdx], nonzero count in this group
> +
> +%assign x 0
> +%rep 16
> +    movzx       r6d, word [r0 + x * 2]      ; r6 = posLast = scan[x] (zero-extended 16-bit)
> +    movsx       r6d, word [r1 + r6 * 2]     ; r6d = curCoeff = coeff[posLast] (sign-extended 16-bit)
> +    test        r6d, r6d
> +    setnz       r7b                         ; r7 = isNZCoeff = (curCoeff != 0)
> +    shr         r6d, 31                     ; r6d = 1 if curCoeff is negative, else 0
> +    shlx        r6d, r6d, r10d              ; move the sign bit to bit position coeffNum (nonzeros seen so far in this group)
> +    or          r8d, r6d                    ; merge into the group's sign mask
> +    lea         r9, [r9 * 2 + r7]           ; coeffFlag = (coeffFlag << 1) + isNZCoeff
> +    add         r10d, r7d                   ; coeffNum += isNZCoeff
> +%assign x x+1
> +%endrep
> +
> +    ; write this group's sign mask, flag mask and nonzero count to the output arrays
> +    mov         [r2 + r11 * 2], r8w
> +    mov         [r3 + r11 * 2], r9w
> +    mov         [r4 + r11], r10b
> +    inc         r11d                        ; advance to the next group; r11 now equals the number of groups processed
> +
> +    add         r0, 16 * 2                  ; step scan pointer past the 16 uint16_t entries just consumed
> +    sub         r5d, r10d                   ; numSig -= nonzeros found in this group
> +    jnz        .loop                        ; keep going until every significant coefficient has been seen
> +
> +    ; the whole last group was scanned, so drop the flag bits for positions past the last nonzero coefficient
> +    tzcnt       r6d, r9d                    ; r6d = number of trailing zero flags = positions scanned after the last nonzero
> +    shrx        r9d, r9d, r6d               ; discard those trailing positions from the flag word
> +    mov         [r3 + (r11 - 1) * 2], r9w   ; rewrite coeffFlag of the last group (r11 was already incremented, hence -1)
> +
> +    ; return value: scan index of the last nonzero coefficient
> +    shl         r11d, 4                     ; r11d = groups processed * 16 = positions scanned
> +    sub         r11d, r6d                   ; minus the positions scanned after the last nonzero
> +    lea         eax, [r11d - 1]             ; eax = scanPosLast - 1
> +    RET
> +%endif
> 
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel

-- 
Steve Borho


More information about the x265-devel mailing list