[x265] [PATCH 3 of 3] asm: cvt16to32_cnt[8x8] for TSkip
Min Chen
chenm003 at 163.com
Sat Aug 2 02:56:50 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1406940997 25200
# Node ID f9861fde0aab9b2974f96153e333243224c115cd
# Parent a25d83e9037bb62015d5d62f18f8182620a44d8c
asm: cvt16to32_cnt[8x8] for TSkip
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp Fri Aug 01 17:56:37 2014 -0700
@@ -1231,6 +1231,7 @@
// TODO: check POPCNT flag!
p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_sse4;
+ p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_sse4;
HEVC_SATD(sse4);
SA8D_INTER_FROM_BLOCK(sse4);
@@ -1331,6 +1332,7 @@
p.sad_x4[LUMA_16x32] = x265_pixel_sad_x4_16x32_avx2;
p.ssd_s[BLOCK_32x32] = x265_pixel_ssd_s_32_avx2;
p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_avx2;
+ p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/blockcopy8.asm Fri Aug 01 17:56:37 2014 -0700
@@ -30,6 +30,7 @@
tab_Vm: db 0, 2, 4, 6, 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0
cextern pw_4
+cextern pb_8
SECTION .text
@@ -3177,3 +3178,155 @@
not ax
popcnt ax, ax
RET
+
+
+;--------------------------------------------------------------------------------------
+; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+;--------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal cvt16to32_cnt_8, 3,5,6 ; r0 = dst, r1 = src, r2 = stride; widens an 8x8 int16 block to int32, returns the non-zero count in eax
+ add r2, r2 ; stride -> bytes (16-bit elements); native width so a negative intptr_t stride is not zero-extended
+ pxor m4, m4 ; m4 = 0, operand of the zero compare and of the final psadbw
+ mov r3d, 8/4 ; loop counter: 8 rows, 4 rows per iteration
+ lea r4, [r2 * 3] ; r4 = byte offset of row 3
+ pxor m5, m5 ; m5 = per-byte accumulator of the compare masks
+
+.loop:
+ ; row 0
+ movu m0, [r1]
+ mova m2, m0 ; keep row 0, it is zero-tested together with row 1
+ pmovsxwd m1, m0 ; low 4 words -> dwords
+ punpckhwd m0, m0 ; high 4 words duplicated, then shifted arithmetically
+ psrad m0, 16 ; = sign extension of the high 4 words
+ movu [r0 + 0 * mmsize], m1
+ movu [r0 + 1 * mmsize], m0
+
+ ; row 1
+ movu m0, [r1 + r2]
+ packsswb m2, m0 ; rows 0-1 as 16 bytes; signed saturation keeps non-zero words non-zero
+ pcmpeqb m2, m4 ; 0xFF (-1) where the value is zero
+ paddb m5, m2 ; each byte lane decreases by 1 per zero value
+ pmovsxwd m1, m0
+ punpckhwd m0, m0
+ psrad m0, 16
+ movu [r0 + 2 * mmsize], m1
+ movu [r0 + 3 * mmsize], m0
+
+ ; row 2
+ movu m0, [r1 + r2 * 2]
+ mova m2, m0 ; keep row 2, it is zero-tested together with row 3
+ pmovsxwd m1, m0
+ punpckhwd m0, m0
+ psrad m0, 16
+ movu [r0 + 4 * mmsize], m1
+ movu [r0 + 5 * mmsize], m0
+
+ ; row 3
+ movu m0, [r1 + r4]
+ packsswb m2, m0 ; rows 2-3 as 16 bytes
+ pcmpeqb m2, m4
+ paddb m5, m2
+ pmovsxwd m1, m0
+ punpckhwd m0, m0
+ psrad m0, 16
+ movu [r0 + 6 * mmsize], m1
+ movu [r0 + 7 * mmsize], m0
+
+ add r0, 8 * mmsize ; dst advances by the 4 rows just written (8 stores of mmsize bytes)
+ lea r1, [r1 + r2 * 4] ; src advances by 4 rows
+ dec r3d
+ jnz .loop
+
+ ; get count
+ movhlps m3, m5 ; m3 low half = odd-row masks (upper half of m3 is left as is and never read)
+ paddb m3, m5 ; low 8 bytes = -(zeros per column), range -8..0
+
+ paddb m3, [pb_8] ; 8 - zeros = non-zero values per column
+ psadbw m3, m4 ; low qword = sum of the 8 column counts
+
+ movd eax, m3 ; return value
+ RET
+
+
+INIT_YMM avx2
+%if ARCH_X86_64 == 1
+cglobal cvt16to32_cnt_8, 3,4,6 ; r0 = dst, r1 = src, r2 = stride; widens an 8x8 int16 block to int32, returns the non-zero count
+ %define tmpd eax
+%else
+cglobal cvt16to32_cnt_8, 3,5,6 ; same contract; one extra GPR (r4) holds the first partial count
+ %define tmpd r4d
+%endif
+ add r2, r2 ; stride -> bytes (16-bit elements); native width so a negative intptr_t stride is not zero-extended
+ pxor m4, m4 ; m4 = 0 for the zero compare
+ lea r3, [r2 * 3] ; r3 = byte offset of row 3 / row 7
+
+ ; row 0
+ movu xm0, [r1]
+ mova xm2, xm0 ; m2 low lane = row 0
+ pmovsxwd m1, xm0 ; 8 words -> 8 dwords
+ movu [r0 + 0 * mmsize], m1
+
+ ; row 1
+ movu xm0, [r1 + r2]
+ vinserti128 m2, m2, xm0, 1 ; m2 high lane = row 1
+ pmovsxwd m1, xm0
+ movu [r0 + 1 * mmsize], m1
+
+ ; row 2
+ movu xm0, [r1 + r2 * 2]
+ mova xm5, xm0 ; m5 low lane = row 2
+ pmovsxwd m1, xm0
+ movu [r0 + 2 * mmsize], m1
+
+ ; row 3
+ movu xm0, [r1 + r3]
+ vinserti128 m5, m5, xm0, 1 ; m5 high lane = row 3
+ packsswb m2, m5 ; rows 0-3 as 32 bytes; signed saturation keeps non-zero words non-zero
+ pcmpeqb m2, m4 ; 0xFF where the value is zero
+ pmovmskb tmpd, m2 ; one mask bit per value, set when zero
+ not tmpd ; set bits now mark non-zero values
+ popcnt tmpd, tmpd ; tmpd = non-zero count of rows 0-3
+ pmovsxwd m1, xm0
+ movu [r0 + 3 * mmsize], m1
+
+ add r0, 4 * mmsize ; dst advances by the 4 rows just written
+ lea r1, [r1 + r2 * 4] ; src advances by 4 rows
+
+ ; row 4
+ movu xm0, [r1]
+ mova xm2, xm0 ; m2 low lane = row 4
+ pmovsxwd m1, xm0
+ movu [r0 + 0 * mmsize], m1
+
+ ; row 5
+ movu xm0, [r1 + r2]
+ vinserti128 m2, m2, xm0, 1 ; m2 high lane = row 5
+ pmovsxwd m1, xm0
+ movu [r0 + 1 * mmsize], m1
+
+ ; row 6
+ movu xm0, [r1 + r2 * 2]
+ mova xm5, xm0 ; m5 low lane = row 6
+ pmovsxwd m1, xm0
+ movu [r0 + 2 * mmsize], m1
+
+ ; row 7
+ movu xm0, [r1 + r3]
+ pmovsxwd m1, xm0
+ movu [r0 + 3 * mmsize], m1 ; last store through r0; r0 is reused for the count below
+ vinserti128 m5, m5, xm0, 1 ; m5 high lane = row 7
+
+ ; get count
+ packsswb m2, m5 ; rows 4-7 as 32 bytes
+ pcmpeqb m2, m4
+ pmovmskb r0d, m2
+ not r0d
+ popcnt r0d, r0d ; r0d = non-zero count of rows 4-7
+
+%if ARCH_X86_64 == 1
+ add tmpd, r0d ; total accumulated in eax (tmpd)
+%else
+ add r0d, tmpd ; total left in r0d; NOTE(review): relies on r0 being eax in the x86-32 x86inc mapping - confirm
+%endif
+ RET
+;IACA_END
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/blockcopy8.h
--- a/source/common/x86/blockcopy8.h Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/blockcopy8.h Fri Aug 01 17:56:37 2014 -0700
@@ -31,6 +31,9 @@
uint32_t x265_cvt16to32_cnt_16_sse4(int32_t * dst, int16_t * src, intptr_t);
uint32_t x265_cvt16to32_cnt_32_sse4(int32_t * dst, int16_t * src, intptr_t);
uint32_t x265_cvt16to32_cnt_4_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_8_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_16_avx2(int32_t * dst, int16_t * src, intptr_t);
+uint32_t x265_cvt16to32_cnt_32_avx2(int32_t * dst, int16_t * src, intptr_t);
#define SETUP_BLOCKCOPY_FUNC(W, H, cpu) \
void x265_blockcopy_pp_ ## W ## x ## H ## cpu(pixel * a, intptr_t stridea, pixel * b, intptr_t strideb); \
diff -r a25d83e9037b -r f9861fde0aab source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Fri Aug 01 17:56:27 2014 -0700
+++ b/source/common/x86/const-a.asm Fri Aug 01 17:56:37 2014 -0700
@@ -53,6 +53,7 @@
const pb_1, times 32 db 1
const pb_a1, times 16 db 0xa1
const pb_3, times 16 db 3
+const pb_8, times 16 db 8
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
const pw_2, times 8 dw 2
More information about the x265-devel
mailing list