[x265] [PATCH] asm: x64 version findPosLast, 73815c -> 25890c (2.85x)
Min Chen
chenm003 at 163.com
Fri Mar 20 00:31:42 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1426807873 25200
# Node ID 1dbf31b0717604d1e5c34ed6de5aa51f42e32df8
# Parent 82437ab8a38c84a3233d82a8a6906f432aa532d3
asm: x64 version findPosLast, 73815c -> 25890c (2.85x)
---
source/common/x86/asm-primitives.cpp | 3 +
source/common/x86/pixel-util.h | 2 +
source/common/x86/pixel-util8.asm | 68 ++++++++++++++++++++++++++++++++++
3 files changed, 73 insertions(+), 0 deletions(-)
diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 19 09:56:23 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Thu Mar 19 16:31:13 2015 -0700
@@ -1725,6 +1725,9 @@
p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].filter_vss = x265_interp_4tap_vert_ss_24x32_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].filter_vss = x265_interp_4tap_vert_ss_32x8_avx2;
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_vss = x265_interp_4tap_vert_ss_32x16_avx2;
+
+ if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
+ p.findPosLast = x265_findPosLast_x64;
}
#endif
}
diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Thu Mar 19 09:56:23 2015 -0500
+++ b/source/common/x86/pixel-util.h Thu Mar 19 16:31:13 2015 -0700
@@ -77,6 +77,8 @@
void x265_scale1D_128to64_avx2(pixel*, const pixel*, intptr_t);
void x265_scale2D_64to32_ssse3(pixel*, const pixel*, intptr_t);
+int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
void x265_pixel_sub_ps_ ## W ## x ## H ## cpu(int16_t* dest, intptr_t destride, const pixel* src0, const pixel* src1, intptr_t srcstride0, intptr_t srcstride1); \
void x265_pixel_add_ps_ ## W ## x ## H ## cpu(pixel* dest, intptr_t destride, const pixel* src0, const int16_t* scr1, intptr_t srcStride0, intptr_t srcStride1);
diff -r 82437ab8a38c -r 1dbf31b07176 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Thu Mar 19 09:56:23 2015 -0500
+++ b/source/common/x86/pixel-util8.asm Thu Mar 19 16:31:13 2015 -0700
@@ -5379,3 +5379,71 @@
RET
%endmacro
+;int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+;{
+; int scanPosLast = 0;
+; do
+; {
+; const uint32_t cgIdx = (uint32_t)scanPosLast >> MLS_CG_SIZE;
+;
+; const uint32_t posLast = scan[scanPosLast++];
+;
+; const int curCoeff = coeff[posLast];
+; const uint32_t isNZCoeff = (curCoeff != 0);
+; numSig -= isNZCoeff;
+;
+; coeffSign[cgIdx] += (uint16_t)(((uint32_t)curCoeff >> 31) << coeffNum[cgIdx]);
+; coeffFlag[cgIdx] = (coeffFlag[cgIdx] << 1) + (uint16_t)isNZCoeff;
+; coeffNum[cgIdx] += (uint8_t)isNZCoeff;
+; }
+; while (numSig > 0);
+; return scanPosLast - 1;
+;}
+
+%if ARCH_X86_64 == 1
+INIT_CPUFLAGS
+cglobal findPosLast_x64, 5,12 ; r0=scan r1=coeff r2=coeffSign r3=coeffFlag r4=coeffNum (order of the C prototype)
+ mov r5d, r5m ; r5d = numSig, the sixth argument: non-zero coefficients still to be found
+ xor r11d, r11d ; cgIdx: index of the 16-coefficient group being scanned
+ xor r7d, r7d ; cleared once so that setnz r7b below always leaves a clean 0/1 in r7
+
+.loop:
+ xor r8d, r8d ; coeffSign[cgIdx] accumulator, restarted for every group
+ xor r9d, r9d ; coeffFlag[cgIdx] accumulator, restarted for every group
+ xor r10d, r10d ; coeffNum[cgIdx] accumulator, restarted for every group
+
+%assign x 0
+%rep 16 ; fully unrolled: one copy of the body per coefficient of the group
+ movzx r6d, word [r0 + x * 2] ; r6 = scan[x], the position of this coefficient inside coeff[]
+ movsx r6d, word [r1 + r6 * 2] ; r6d = coeff[position], read as a 16-bit value and sign-extended
+ test r6d, r6d
+ setnz r7b ; r7 = isNZCoeff (1 when the coefficient is non-zero, else 0)
+ shr r6d, 31 ; r6d = sign bit of the coefficient (1 when negative)
+ shlx r6d, r6d, r10d ; move the sign to bit coeffNum, i.e. one bit per non-zero coefficient seen so far
+ or r8d, r6d ; merge into the group's sign mask
+ lea r9, [r9 * 2 + r7] ; coeffFlag = (coeffFlag << 1) + isNZCoeff
+ add r10d, r7d ; coeffNum += isNZCoeff
+%assign x x+1
+%endrep
+
+ ; write the finished group's sign mask, significance flags and non-zero count
+ mov [r2 + r11 * 2], r8w
+ mov [r3 + r11 * 2], r9w
+ mov [r4 + r11], r10b
+ inc r11d ; r11d now counts the groups completed
+
+ add r0, 16 * 2 ; advance scan by 16 uint16_t entries to the next group
+ sub r5d, r10d ; numSig -= non-zero coefficients found in this group
+ jnz .loop ; leaves only when the remainder is exactly 0 (the C reference loops while numSig > 0)
+
+ ; the last group was scanned to its end: strip the flags recorded after the last non-zero coefficient
+ tzcnt r6d, r9d ; r6d = number of zero flags that follow the last non-zero coefficient
+ shrx r9d, r9d, r6d ; drop those trailing zero flags
+ mov [r3 + (r11 - 1) * 2], r9w ; rewrite coeffFlag of the last group (r11 was already incremented)
+
+ ; return value: scan index of the last non-zero coefficient
+ shl r11d, 4 ; groups completed * 16 = coefficients scanned
+ sub r11d, r6d ; minus the zero coefficients after the last non-zero one
+ lea eax, [r11d - 1] ; convert the count into a zero-based index
+ RET
+%endif ; ARCH_X86_64 == 1
More information about the x265-devel
mailing list