[x265] [PATCH] asm: generic x64 version of findPosLast

Min Chen chenm003 at 163.com
Tue Apr 21 08:35:36 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1429598130 -28800
# Node ID 3344fc18ba06f84a1a1e18e7933af622cf1fb405
# Parent  0dc1b16bbb61c2231dd58ca4352cd6d06ce21fc6
asm: generic x64 version of findPosLast
---
 source/common/x86/asm-primitives.cpp |   13 +++++++-
 source/common/x86/pixel-util.h       |    1 +
 source/common/x86/pixel-util8.asm    |   60 +++++++++++++++++++++++++++++++++-
 3 files changed, 72 insertions(+), 2 deletions(-)

diff -r 0dc1b16bbb61 -r 3344fc18ba06 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Apr 21 14:35:26 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Tue Apr 21 14:35:30 2015 +0800
@@ -800,6 +800,10 @@
 #error "Unsupported build configuration (32bit x86 and HIGH_BIT_DEPTH), you must configure ENABLE_ASSEMBLY=OFF"
 #endif
 
+#if X86_64
+    p.findPosLast = x265_findPosLast_x64;
+#endif
+
     if (cpuMask & X265_CPU_SSE2)
     {
         /* We do not differentiate CPUs which support MMX and not SSE2. We only check
@@ -1263,12 +1267,19 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].p2s = x265_filterPixelToShort_32x32_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_avx2;
+
+        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
+            p.findPosLast = x265_findPosLast_x64_bmi2;
     }
 }
 #else // if HIGH_BIT_DEPTH
 
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask) // 8bpp
 {
+#if X86_64
+    p.findPosLast = x265_findPosLast_x64;
+#endif
+
     if (cpuMask & X265_CPU_SSE2)
     {
         /* We do not differentiate CPUs which support MMX and not SSE2. We only check
@@ -2375,7 +2386,7 @@
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
 
         if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
-            p.findPosLast = x265_findPosLast_x64;
+            p.findPosLast = x265_findPosLast_x64_bmi2;
     }
 #endif
 }
diff -r 0dc1b16bbb61 -r 3344fc18ba06 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Tue Apr 21 14:35:26 2015 +0800
+++ b/source/common/x86/pixel-util.h	Tue Apr 21 14:35:30 2015 +0800
@@ -79,6 +79,7 @@
 void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
 
 int x265_findPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
+int x265_findPosLast_x64_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig);
 uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 0dc1b16bbb61 -r 3344fc18ba06 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Tue Apr 21 14:35:26 2015 +0800
+++ b/source/common/x86/pixel-util8.asm	Tue Apr 21 14:35:30 2015 +0800
@@ -5613,7 +5613,7 @@
 ;}
 
 %if ARCH_X86_64 == 1
-INIT_CPUFLAGS
+INIT_CPUFLAGS bmi2
 cglobal findPosLast_x64, 5,12
     mov         r5d, r5m
     xor         r11d, r11d                  ; cgIdx
@@ -5658,6 +5658,64 @@
     sub         r11d, r6d
     lea         eax, [r11d - 1]
     RET
+
+; int findPosLast_x64(scan, coeff, coeffSign, coeffFlag, coeffNum, numSig) -- generic x64 version without BMI2 instructions
+; t3 must be ecx, since variable shifts (shl/shr by cl) take their count from cl.
+%if WIN64
+    DECLARE_REG_TMP 3,1,2,0
+%elif ARCH_X86_64
+    DECLARE_REG_TMP 0,1,2,3
+%else ; X86_32
+    %error Unsupport platform X86_32
+%endif
+INIT_CPUFLAGS
+cglobal findPosLast_x64, 5,12
+    mov         r10, r3mp                   ; coeffFlag pointer (arg 3), kept in r10 for the stores below
+    movifnidn   t0, r0mp                    ; scan pointer (arg 0)
+    mov         r5d, r5m                    ; numSig, counts down to zero
+    xor         r11d, r11d                  ; cgIdx
+    xor         r7d, r7d                    ; tmp for non-zero flag
+
+.loop:
+    xor         r8d, r8d                    ; coeffSign[]
+    xor         r9d, r9d                    ; coeffFlag[]
+    xor         t3d, t3d                    ; coeffNum[]
+
+%assign x 0
+%rep 16
+    movzx       r6d, word [t0 + x * 2]      ; r6 = scan[x]
+    movsx       r6d, word [t1 + r6 * 2]     ; r6 = coeff[scan[x]], sign-extended
+    test        r6d, r6d
+    setnz       r7b                         ; r7 = (coeff != 0)
+    shr         r6d, 31                     ; isolate the sign bit
+    shl         r6d, t3b                    ; place it at bit index = non-zero count so far
+    or          r8d, r6d
+    lea         r9, [r9 * 2 + r7]           ; shift the non-zero flag into the group mask
+    add         t3d, r7d
+%assign x x+1
+%endrep
+
+    ; store latest group data
+    mov         [t2 + r11 * 2], r8w
+    mov         [r10 + r11 * 2], r9w
+    mov         [r4 + r11], t3b
+    inc         r11d
+
+    add         t0, 16 * 2                  ; advance scan pointer by one group of 16 entries
+    sub         r5d, t3d                    ; numSig -= non-zero count of this group
+    jnz        .loop
+
+    ; store group data
+    bsf         t3d, r9d                    ; t3 = trailing zero flags in the last group (positions after the last non-zero)
+    shr         r9d, t3b                    ; drop those trailing positions from the stored mask
+    mov         [r10 + (r11 - 1) * 2], r9w
+
+    ; get posLast
+    shl         r11d, 4                     ; groups processed * 16
+    sub         r11d, t3d                   ; FIX: use the bsf count in t3d; r6d only holds the last sign term here
+    lea         eax, [r11d - 1]
+    RET
+IACA_END
 %endif
 
 



More information about the x265-devel mailing list