[x265] [PATCH 3 of 3] asm: cvt16to32_cnt[8x8] for TSkip

Min Chen chenm003 at 163.com
Sat Aug 2 02:56:50 CEST 2014


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1406940997 25200
# Node ID f9861fde0aab9b2974f96153e333243224c115cd
# Parent  a25d83e9037bb62015d5d62f18f8182620a44d8c
asm: cvt16to32_cnt[8x8] for TSkip

diff -r a25d83e9037b -r f9861fde0aab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp	Fri Aug 01 17:56:37 2014 -0700
@@ -1231,6 +1231,7 @@
 
         // TODO: check POPCNT flag!
         p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_sse4;
+        p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_sse4;
 
         HEVC_SATD(sse4);
         SA8D_INTER_FROM_BLOCK(sse4);
@@ -1331,6 +1332,7 @@
         p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
         p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
         p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_avx2;
+        p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_avx2;
     }
 #endif // if HIGH_BIT_DEPTH
 }
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm	Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/blockcopy8.asm	Fri Aug 01 17:56:37 2014 -0700
@@ -30,6 +30,7 @@
 tab_Vm:    db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
 
 cextern pw_4
+cextern pb_8
 
 SECTION .text
 
@@ -3177,3 +3178,155 @@
     not         ax
     popcnt      ax, ax
     RET
+
+
+;--------------------------------------------------------------------------------------
+; uint32_t cvt16to32_cnt_8(int32_t *dst, int16_t *src, intptr_t stride); widens an 8x8 block, returns the non-zero coefficient count
+;--------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal cvt16to32_cnt_8, 3,5,6
+    add         r2, r2                      ; stride: int16_t units -> bytes; full-width add so an intptr_t stride is not truncated to 32 bits
+    pxor        m4, m4                      ; m4 = 0: reference for the zero test and for psadbw
+    mov         r3d, 8/4                    ; 8 rows total, 4 rows handled per iteration
+    lea         r4, [r2 * 3]                ; r4 = byte offset of row 3 relative to r1
+    pxor        m5, m5                      ; m5 = per-byte accumulator: -1 is added for every zero coefficient
+
+.loop:
+    ; row 0: widen 8 words to 8 dwords; the raw words are kept in m2 for the zero test
+    movu        m0, [r1]
+    mova        m2, m0                      ; save row 0 (paired with row 1 below)
+    pmovsxwd    m1, m0                      ; m1 = words 0..3 sign-extended to dwords
+    punpckhwd   m0, m0                      ; words 4..7, each duplicated into both halves of a dword
+    psrad       m0, 16                      ; arithmetic shift -> words 4..7 sign-extended to dwords
+    movu        [r0 + 0 * mmsize], m1
+    movu        [r0 + 1 * mmsize], m0       ; dst rows are packed: 8 dwords = 2 * mmsize bytes per row
+
+    ; row 1: widen, and fold rows 0+1 into the zero counter
+    movu        m0, [r1 + r2]
+    packsswb    m2, m0                      ; 16 saturated bytes (row 0 | row 1); zero stays zero, non-zero stays non-zero
+    pcmpeqb     m2, m4                      ; 0xFF (-1) where the coefficient is zero
+    paddb       m5, m2                      ; accumulate -1 per zero coefficient
+    pmovsxwd    m1, m0
+    punpckhwd   m0, m0
+    psrad       m0, 16
+    movu        [r0 + 2 * mmsize], m1
+    movu        [r0 + 3 * mmsize], m0
+
+    ; row 2: same as row 0
+    movu        m0, [r1 + r2 * 2]
+    mova        m2, m0                      ; save row 2 (paired with row 3 below)
+    pmovsxwd    m1, m0
+    punpckhwd   m0, m0
+    psrad       m0, 16
+    movu        [r0 + 4 * mmsize], m1
+    movu        [r0 + 5 * mmsize], m0
+
+    ; row 3: same as row 1, folding rows 2+3 into the zero counter
+    movu        m0, [r1 + r4]
+    packsswb    m2, m0
+    pcmpeqb     m2, m4
+    paddb       m5, m2
+    pmovsxwd    m1, m0
+    punpckhwd   m0, m0
+    psrad       m0, 16
+    movu        [r0 + 6 * mmsize], m1
+    movu        [r0 + 7 * mmsize], m0
+
+    add         r0, 8 * mmsize              ; advance dst by the 4 rows just written
+    lea         r1, [r1 + r2 * 4]           ; advance src by 4 rows
+    dec         r3d
+    jnz         .loop
+
+    ; get count: each byte of m5 now holds -(zeros seen) for one column of the even (low half) or odd (high half) rows
+    movhlps     m3, m5                      ; low 8 bytes of m3 = high 8 bytes of m5 (upper half of m3 is don't-care)
+    paddb       m3, m5                      ; low 8 bytes = -(zeros per column over all 8 rows)
+
+    paddb       m3, [pb_8]                  ; 8 rows - zeros = non-zero count per column
+    psadbw      m3, m4                      ; sum the 8 low bytes into the low qword
+
+    movd        eax, m3                     ; return value = total non-zero coefficients
+    RET
+
+; uint32_t cvt16to32_cnt_8(int32_t *dst, int16_t *src, intptr_t stride); AVX2: widens an 8x8 int16 block to int32, returns the non-zero count
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal cvt16to32_cnt_8, 3,4,6
+  %define tmpd eax
+%else
+cglobal cvt16to32_cnt_8, 3,5,6
+  %define tmpd r4d
+%endif
+    add         r2, r2                      ; stride: int16_t units -> bytes; full-width add so an intptr_t stride is not truncated to 32 bits
+    pxor        m4, m4                      ; m4 = 0: reference for the zero test
+    lea         r3, [r2 * 3]                ; r3 = byte offset of row 3 relative to r1
+
+    ; row 0: one pmovsxwd widens the whole 8-word row; dst rows are packed, mmsize bytes each
+    movu        xm0, [r1]
+    mova        xm2, xm0                    ; m2 low lane = row 0
+    pmovsxwd    m1, xm0
+    movu        [r0 + 0 * mmsize], m1
+
+    ; row 1
+    movu        xm0, [r1 + r2]
+    vinserti128 m2, m2, xm0, 1              ; m2 = row 0 | row 1
+    pmovsxwd    m1, xm0
+    movu        [r0 + 1 * mmsize], m1
+
+    ; row 2
+    movu        xm0, [r1 + r2 * 2]
+    mova        xm5, xm0                    ; m5 low lane = row 2
+    pmovsxwd    m1, xm0
+    movu        [r0 + 2 * mmsize], m1
+
+    ; row 3, then count the non-zero coefficients of rows 0..3
+    movu        xm0, [r1 + r3]
+    vinserti128 m5, m5, xm0, 1              ; m5 = row 2 | row 3
+    packsswb    m2, m5                      ; packs per 128-bit lane: 32 saturated bytes covering rows 0..3; zero stays zero
+    pcmpeqb     m2, m4                      ; 0xFF where the coefficient is zero
+    pmovmskb    tmpd, m2                    ; one mask bit per coefficient, set when it is zero
+    not         tmpd                        ; set bits now mark the non-zero coefficients
+    popcnt      tmpd, tmpd                  ; tmpd = non-zero count of rows 0..3
+    pmovsxwd    m1, xm0
+    movu        [r0 + 3 * mmsize], m1
+
+    add         r0, 4 * mmsize              ; advance dst by the 4 rows just written
+    lea         r1, [r1 + r2 * 4]           ; advance src by 4 rows
+
+    ; row 4
+    movu        xm0, [r1]
+    mova        xm2, xm0                    ; m2 low lane = row 4
+    pmovsxwd    m1, xm0
+    movu        [r0 + 0 * mmsize], m1
+
+    ; row 5
+    movu        xm0, [r1 + r2]
+    vinserti128 m2, m2, xm0, 1              ; m2 = row 4 | row 5
+    pmovsxwd    m1, xm0
+    movu        [r0 + 1 * mmsize], m1
+
+    ; row 6
+    movu        xm0, [r1 + r2 * 2]
+    mova        xm5, xm0                    ; m5 low lane = row 6
+    pmovsxwd    m1, xm0
+    movu        [r0 + 2 * mmsize], m1
+
+    ; row 7: last store through r0, which is reused as scratch below
+    movu        xm0, [r1 + r3]
+    pmovsxwd    m1, xm0
+    movu        [r0 + 3 * mmsize], m1
+    vinserti128 m5, m5, xm0, 1              ; m5 = row 6 | row 7
+
+    ; get count for rows 4..7, same scheme as for rows 0..3
+    packsswb    m2, m5
+    pcmpeqb     m2, m4
+    pmovmskb    r0d, m2                     ; dst pointer is dead here, r0d becomes the second counter
+    not         r0d
+    popcnt      r0d, r0d                    ; r0d = non-zero count of rows 4..7
+
+%if ARCH_X86_64 == 1
+    add         tmpd, r0d                   ; total accumulated in eax (tmpd)
+%else
+    add         r0d, tmpd                   ; total left in r0d; presumably r0d is eax under the x86-32 x86inc mapping - confirm
+%endif
+    RET
+;IACA_END
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h	Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/blockcopy8.h	Fri Aug 01 17:56:37 2014 -0700
@@ -31,6 +31,9 @@
 uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
 uint32_t x265_cvt16to32_cnt_32_sse4(int32_t * dst, int16_t * src, intptr_t);
 uint32_t x265_cvt16to32_cnt_4_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_8_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_16_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_32_avx2(int32_t * dst, int16_t * src, intptr_t);
 
 #define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
     void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm	Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/const-a.asm	Fri Aug 01 17:56:37 2014 -0700
@@ -53,6 +53,7 @@
 const pb_1,        times 32 db 1
 const pb_a1,       times 16 db 0xa1
 const pb_3,        times 16 db 3
+const pb_8,        times 16 db 8
 const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
 
 const pw_2,        times 8 dw 2



More information about the x265-devel mailing list