[x265] [PATCH] asm: x64 version findPosLast, 73815c -> 25890c (2.85x)
Steve Borho
steve at borho.org
Fri Mar 20 17:06:56 CET 2015
On 03/19, Min Chen wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1426807873 25200
> # Node ID 1dbf31b0717604d1e5c34ed6de5aa51f42e32df8
> # Parent 82437ab8a38c84a3233d82a8a6906f432aa532d3
> asm: x64 version findPosLast, 73815c -> 25890c (2.85x)
queued for testing
> ---
> source/common/x86/asm-primitives.cpp | 3 +
> source/common/x86/pixel-util.h | 2 +
> source/common/x86/pixel-util8.asm | 68 ++++++++++++++++++++++++++++++++++
> 3 files changed, 73 insertions(+), 0 deletions(-)
>
> diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Thu Mar 19 09:56:23 2015 -0500
> +++ b/source/common/x86/asm-primitives.cpp Thu Mar 19 16:31:13 2015 -0700
> @@ -1725,6 +1725,9 @@
> p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
> p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
> p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
> +
> + if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
> + p.findPosLast = x265_findPosLast_x64;
> }
> #endif
> }
> diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h Thu Mar 19 09:56:23 2015 -0500
> +++ b/source/common/x86/pixel-util.h Thu Mar 19 16:31:13 2015 -0700
> @@ -77,6 +77,8 @@
> void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
> void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
>
> +int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
> +
> #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
> void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
> void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1);
> diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Thu Mar 19 09:56:23 2015 -0500
> +++ b/source/common/x86/pixel-util8.asm Thu Mar 19 16:31:13 2015 -0700
> @@ -5379,3 +5379,71 @@
> RET
> %endmacro
>
> +;int x265_test_func(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
> +;{
> +; int scanPosLast = 0;
> +; do
> +; {
> +; const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
> +;
> +; const uint32_t posLast = scan[scanPosLast++];
> +;
> +; const int curCoeff = coeff[posLast];
> +; const uint32_t isNZCoeff = (curCoeff != 0);
> +; numSig -= isNZCoeff;
> +;
> +; coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
> +; coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
> +; coeffNum[cgIdx] += (uint8_t)isNZCoeff;
> +; }
> +; while (numSig > 0);
> +; return scanPosLast - 1;
> +;}
> +
> +%if ARCH_X86_64 == 1
> +INIT_CPUFLAGS
> +cglobal findPosLast_x64, 5,12
> + mov r5d, r5m
> + xor r11d, r11d ; cgIdx
> + xor r7d, r7d ; tmp for non-zero flag
> +
> +.loop:
> + xor r8d, r8d ; coeffSign[]
> + xor r9d, r9d ; coeffFlag[]
> + xor r10d, r10d ; coeffNum[]
> +
> +%assign x 0
> +%rep 16
> + movzx r6d, word [r0 + x * 2]
> + movsx r6d, word [r1 + r6 * 2]
> + test r6d, r6d
> + setnz r7b
> + shr r6d, 31
> + shlx r6d, r6d, r10d
> + or r8d, r6d
> + lea r9, [r9 * 2 + r7]
> + add r10d, r7d
> +%assign x x+1
> +%endrep
> +
> + ; store latest group data
> + mov [r2 + r11 * 2], r8w
> + mov [r3 + r11 * 2], r9w
> + mov [r4 + r11], r10b
> + inc r11d
> +
> + add r0, 16 * 2
> + sub r5d, r10d
> + jnz .loop
> +
> + ; store group data
> + tzcnt r6d, r9d
> + shrx r9d, r9d, r6d
> + mov [r3 + (r11 - 1) * 2], r9w
> +
> + ; get posLast
> + shl r11d, 4
> + sub r11d, r6d
> + lea eax, [r11d - 1]
> + RET
> +%endif
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel mailing list