[x265] [PATCH] asm: generic x64 version of findPosLast
Min Chen
chenm003 at 163.com
Tue Apr 21 08:35:36 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1429598130 -28800
# Node ID 3344fc18ba06f84a1a1e18e7933af622cf1fb405
# Parent 0dc1b16bbb61c2231dd58ca4352cd6d06ce21fc6
asm: generic x64 version of findPosLast
---
source/common/x86/asm-primitives.cpp | 13 +++++++-
source/common/x86/pixel-util.h | 1 +
source/common/x86/pixel-util8.asm | 60 +++++++++++++++++++++++++++++++++-
3 files changed, 72 insertions(+), 2 deletions(-)
diff -r 0dc1b16bbb61 -r 3344fc18ba06 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Apr 21 14:35:26 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp Tue Apr 21 14:35:30 2015 +0800
@@ -800,6 +800,10 @@
#error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
#endif
+#if X86_64
+ p.findPosLast = x265_findPosLast_x64;
+#endif
+
if (cpuMask & X265_CPU_SSE2)
{
/* We do not differentiate CPUs which support MMX and not SSE2. We only check
@@ -1263,12 +1267,19 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = x265_filterPixelToShort_32x32_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_avx2;
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_avx2;
+
+ if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
+ p.findPosLast = x265_findPosLast_x64_bmi2;
}
}
#else // if HIGH_BIT_DEPTH
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp
{
+#if X86_64
+ p.findPosLast = x265_findPosLast_x64;
+#endif
+
if (cpuMask & X265_CPU_SSE2)
{
/* We do not differentiate CPUs which support MMX and not SSE2. We only check
@@ -2375,7 +2386,7 @@
p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
- p.findPosLast = x265_findPosLast_x64;
+ p.findPosLast = x265_findPosLast_x64_bmi2;
}
#endif
}
diff -r 0dc1b16bbb61 -r 3344fc18ba06 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Apr 21 14:35:26 2015 +0800
+++ b/source/common/x86/pixel-util.h Tue Apr 21 14:35:30 2015 +0800
@@ -79,6 +79,7 @@
void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+int x265_findPosLast_x64_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
#define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 0dc1b16bbb61 -r 3344fc18ba06 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Apr 21 14:35:26 2015 +0800
+++ b/source/common/x86/pixel-util8.asm Tue Apr 21 14:35:30 2015 +0800
@@ -5613,7 +5613,7 @@
;}
%if ARCH_X86_64 == 1
-INIT_CPUFLAGS
+INIT_CPUFLAGS bmi2
cglobal findPosLast_x64, 5,12
mov r5d, r5m
xor r11d, r11d ; cgIdx
@@ -5658,6 +5658,64 @@
sub r11d, r6d
lea eax, [r11d - 1]
RET
+
+
+; Generic (non-BMI2) x64 findPosLast. t3 must be ecx, since variable shl/shr take their count in cl.
+%if WIN64
+    DECLARE_REG_TMP 3,1,2,0
+%elif ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3
+%else ; X86_32
+    %error Unsupport platform X86_32
+%endif
+INIT_CPUFLAGS
+cglobal findPosLast_x64, 5,12
+    mov         r10, r3mp                   ; r10 = coeffFlag; r3 is reused as t0 (WIN64) or t3 (UNIX64)
+    movifnidn   t0, r0mp                    ; t0 = scan
+    mov         r5d, r5m                    ; r5d = numSig, counted down by each group's non-zero count
+    xor         r11d, r11d                  ; cgIdx
+    xor         r7d, r7d                    ; tmp for non-zero flag
+
+.loop:
+    xor         r8d, r8d                    ; coeffSign[]
+    xor         r9d, r9d                    ; coeffFlag[]
+    xor         t3d, t3d                    ; coeffNum[]
+
+%assign x 0
+%rep 16
+    movzx       r6d, word [t0 + x * 2]      ; r6 = scan[x]
+    movsx       r6d, word [t1 + r6 * 2]     ; r6d = coeff[scan[x]], sign-extended
+    test        r6d, r6d
+    setnz       r7b                         ; r7 = (coeff != 0)
+    shr         r6d, 31                     ; r6d = 1 when the coeff is negative
+    shl         r6d, t3b                    ; move sign bit to index = non-zero count so far
+    or          r8d, r6d
+    lea         r9, [r9 * 2 + r7]           ; append the non-zero flag as the new LSB
+    add         t3d, r7d
+%assign x x+1
+%endrep
+
+    ; store latest group data
+    mov         [t2 + r11 * 2], r8w         ; coeffSign[cgIdx]
+    mov         [r10 + r11 * 2], r9w        ; coeffFlag[cgIdx]
+    mov         [r4 + r11], t3b             ; coeffNum[cgIdx]
+    inc         r11d
+
+    add         t0, 16 * 2                  ; advance scan by one group of 16 entries
+    sub         r5d, t3d
+    jnz        .loop                        ; repeat until all numSig coeffs are consumed
+
+    ; store group data
+    bsf         t3d, r9d                    ; t3d = number of trailing zero flags in the last group
+    shr         r9d, t3b                    ; drop them so bit 0 is the last non-zero coeff
+    mov         [r10 + (r11 - 1) * 2], r9w
+
+    ; get posLast
+    shl         r11d, 4
+    sub         r11d, t3d                   ; trailing-zero count is in t3d (r6d only holds the last sign bit)
+    lea         eax, [r11d - 1]
+    RET
+IACA_END
%endif
More information about the x265-devel
mailing list