[x265] [PATCH 5 of 6] asm: avx2+bmi2 version of scanPosLast, 27.6k -> 6.8k cycles

Min Chen chenm003 at 163.com
Wed Apr 22 15:31:59 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1429709479 -28800
# Node ID d70698f3f9d9ae6e7a37a5e5aa0b737913cae514
# Parent  563f0e586018fc223d499a6418d9210fb68d4a7e
asm: avx2+bmi2 version of scanPosLast, 27.6k -> 6.8k cycles
---
 source/common/x86/asm-primitives.cpp |    8 +-
 source/common/x86/const-a.asm        |    1 +
 source/common/x86/pixel-util.h       |    2 +-
 source/common/x86/pixel-util8.asm    |  119 ++++++++++++++++++++++-----------
 4 files changed, 85 insertions(+), 45 deletions(-)

diff -r 563f0e586018 -r d70698f3f9d9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Apr 22 21:31:15 2015 +0800
+++ b/source/common/x86/asm-primitives.cpp	Wed Apr 22 21:31:19 2015 +0800
@@ -1268,8 +1268,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].p2s = x265_filterPixelToShort_32x48_avx2;
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].p2s = x265_filterPixelToShort_32x64_avx2;
 
-        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
-            p.scanPosLast = x265_scanPosLast_x64_bmi2;
+        if (cpuMask & X265_CPU_BMI2)
+            p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
 }
 #else // if HIGH_BIT_DEPTH
@@ -2400,8 +2400,8 @@
         p.chroma[X265_CSP_I444].pu[LUMA_32x24].filter_hps = x265_interp_4tap_horiz_ps_32x24_avx2;
         p.chroma[X265_CSP_I444].pu[LUMA_32x8].filter_hps = x265_interp_4tap_horiz_ps_32x8_avx2;
 
-        if ((cpuMask & X265_CPU_BMI1) && (cpuMask & X265_CPU_BMI2))
-            p.scanPosLast = x265_scanPosLast_x64_bmi2;
+        if (cpuMask & X265_CPU_BMI2)
+            p.scanPosLast = x265_scanPosLast_avx2_bmi2;
     }
 #endif
 }
diff -r 563f0e586018 -r d70698f3f9d9 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Wed Apr 22 21:31:15 2015 +0800
+++ b/source/common/x86/const-a.asm	Wed Apr 22 21:31:19 2015 +0800
@@ -37,6 +37,7 @@
 const pb_3,                 times 16 db 3
 const pb_4,                 times 32 db 4
 const pb_8,                 times 32 db 8
+const pb_15,                times 32 db 15
 const pb_16,                times 32 db 16
 const pb_32,                times 32 db 32
 const pb_64,                times 32 db 64
diff -r 563f0e586018 -r d70698f3f9d9 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h	Wed Apr 22 21:31:15 2015 +0800
+++ b/source/common/x86/pixel-util.h	Wed Apr 22 21:31:19 2015 +0800
@@ -79,7 +79,7 @@
 void x265_scale2D_64to32_avx2(pixel*, const pixel*, intptr_t);
 
 int x265_scanPosLast_x64(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
-int x265_scanPosLast_x64_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
+int x265_scanPosLast_avx2_bmi2(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 uint32_t x265_findPosFirstLast_ssse3(const int16_t *dstCoeff, const intptr_t trSize, const uint16_t scanTbl[16]);
 
 #define SETUP_CHROMA_PIXELSUB_PS_FUNC(W, H, cpu) \
diff -r 563f0e586018 -r d70698f3f9d9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Wed Apr 22 21:31:15 2015 +0800
+++ b/source/common/x86/pixel-util8.asm	Wed Apr 22 21:31:19 2015 +0800
@@ -68,6 +68,7 @@
 cextern pb_2
 cextern pb_4
 cextern pb_8
+cextern pb_15
 cextern pb_16
 cextern pb_32
 cextern pb_64
@@ -5591,7 +5592,7 @@
     RET
 %endmacro
 
-;int x265_test_func(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig)
+;int scanPosLast(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize)
 ;{
 ;    int scanPosLast = 0;
 ;    do
@@ -5613,50 +5614,89 @@
 ;}
 
 %if ARCH_X86_64 == 1
-INIT_CPUFLAGS bmi2
-cglobal scanPosLast_x64, 5,12
-    mov         r5d, r5m
-    xor         r11d, r11d                  ; cgIdx
-    xor         r7d, r7d                    ; tmp for non-zero flag
+INIT_XMM avx2,bmi2
+cglobal scanPosLast, 7,11,6
+    ; convert unit of Stride(trSize) to int16_t
+    mov         r7d, r7m
+    add         r7d, r7d
+
+    ; loading scan table and convert to Byte
+    mova        m0, [r6]
+    packuswb    m0, [r6 + mmsize]
+    pxor        m1, m0, [pb_15]
+
+    ; clear CG count
+    xor         r9d, r9d
+
+    ; m0 - Zigzag scan table
+    ; m1 - revert order scan table
+    ; m4 - zero
+    ; m5 - ones
+
+    pxor        m4, m4
+    pcmpeqb     m5, m5
+    lea         r8d, [r7d * 3]
 
 .loop:
-    xor         r8d, r8d                    ; coeffSign[]
-    xor         r9d, r9d                    ; coeffFlag[]
-    xor         r10d, r10d                  ; coeffNum[]
-
-%assign x 0
-%rep 16
-    movzx       r6d, word [r0 + x * 2]
-    movsx       r6d, word [r1 + r6 * 2]
-    test        r6d, r6d
-    setnz       r7b
-    shr         r6d, 31
-    shlx        r6d, r6d, r10d
-    or          r8d, r6d
-    lea         r9, [r9 * 2 + r7]
-    add         r10d, r7d
-%assign x x+1
-%endrep
-
-    ; store latest group data
-    mov         [r2 + r11 * 2], r8w
-    mov         [r3 + r11 * 2], r9w
-    mov         [r4 + r11], r10b
-    inc         r11d
-
+    ; position of current CG
+    movzx       r6d, word [r0]
+    lea         r6, [r6 * 2 + r1]
     add         r0, 16 * 2
-    sub         r5d, r10d
-    jnz        .loop
-
-    ; store group data
-    tzcnt       r6d, r9d
-    shrx        r9d, r9d, r6d
-    mov         [r3 + (r11 - 1) * 2], r9w
-
-    ; get posLast
-    shl         r11d, 4
-    sub         r11d, r6d
-    lea         eax, [r11d - 1]
+
+    ; loading current CG
+    movh        m2, [r6]
+    movhps      m2, [r6 + r7]
+    movh        m3, [r6 + r7 * 2]
+    movhps      m3, [r6 + r8]
+    packsswb    m2, m3
+
+    ; Zigzag
+    pshufb      m3, m2, m0
+    pshufb      m2, m1
+
+    ; get sign
+    pmovmskb    r6d, m3
+    pcmpeqb     m3, m4
+    pmovmskb    r10d, m3
+    not         r10d
+    pext        r6d, r6d, r10d
+    mov         [r2 + r9 * 2], r6w
+
+    ; get non-zero flag
+    ; TODO: reuse above result with reorder
+    pcmpeqb     m2, m4
+    pxor        m2, m5
+    pmovmskb    r6d, m2
+    mov         [r3 + r9 * 2], r6w
+
+    ; get non-zero number, POPCNT is faster
+    pabsb       m2, m2
+    psadbw      m2, m4
+    movhlps     m3, m2
+    paddd       m2, m3
+    movd        r6d, m2
+    mov         [r4 + r9], r6b
+
+    inc         r9d
+    sub         r5d, r6d
+    jg         .loop
+
+    ; fixup last CG non-zero flag
+    dec         r9d
+    movzx       r0d, word [r3 + r9 * 2]
+;%if cpuflag(bmi1)  ; 2uops?
+;    tzcnt       r1d, r0d
+;%else
+    bsf         r1d, r0d
+;%endif
+    shrx        r0d, r0d, r1d
+    mov         [r3 + r9 * 2], r0w
+
+    ; get last pos
+    mov         eax, r9d
+    shl         eax, 4
+    xor         r1d, 15
+    add         eax, r1d
     RET
 
 
@@ -5715,7 +5755,6 @@
     sub         r11d, t3d
     lea         eax, [r11d - 1]
     RET
-IACA_END
 %endif
 
 



More information about the x265-devel mailing list