[x265] [PATCH] asm: cvt16to32_cnt[32x32] for TSkip
Min Chen
chenm003 at 163.com
Wed Aug 6 02:35:13 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1407285305 25200
# Node ID ca70276334d293a269d52f79f9b96f831b3411dc
# Parent c473f49e2818a9fd100dfb0c07f149accd17a28a
asm: cvt16to32_cnt[32x32] for TSkip
diff -r c473f49e2818 -r ca70276334d2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 05 21:41:53 2014 +0900
+++ b/source/common/x86/asm-primitives.cpp Tue Aug 05 17:35:05 2014 -0700
@@ -1618,6 +1618,7 @@
p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_sse4;
p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_sse4;
p.cvt16to32_cnt[BLOCK_16x16] = x265_cvt16to32_cnt_16_sse4;
+ p.cvt16to32_cnt[BLOCK_32x32] = x265_cvt16to32_cnt_32_sse4;
HEVC_SATD(sse4);
SA8D_INTER_FROM_BLOCK(sse4);
@@ -1721,6 +1722,7 @@
p.cvt16to32_cnt[BLOCK_4x4] = x265_cvt16to32_cnt_4_avx2;
p.cvt16to32_cnt[BLOCK_8x8] = x265_cvt16to32_cnt_8_avx2;
p.cvt16to32_cnt[BLOCK_16x16] = x265_cvt16to32_cnt_16_avx2;
+ p.cvt16to32_cnt[BLOCK_32x32] = x265_cvt16to32_cnt_32_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r c473f49e2818 -r ca70276334d2 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Aug 05 21:41:53 2014 +0900
+++ b/source/common/x86/blockcopy8.asm Tue Aug 05 17:35:05 2014 -0700
@@ -32,6 +32,7 @@
cextern pw_4
cextern pb_8
cextern pb_32
+cextern pb_128
SECTION .text
@@ -3740,3 +3741,97 @@
psadbw xm0, xm3
movd eax, xm0
RET
+
+
+;--------------------------------------------------------------------------------------
+; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);
+; 32x32 block: sign-extends every int16 of src into int32 in dst (dst rows are stored back
+; to back) and returns how many of the 1024 values are non-zero; stride is in int16 units.
+;--------------------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal cvt16to32_cnt_32, 3,4,8
+    add         r2, r2                      ; stride in bytes; native-width add keeps every bit of the intptr_t
+    mov         r3d, 32                     ; r3 = rows left, one row per iteration
+    pxor        m6, m6                      ; m6 = byte counters, each gets -1 per zero coefficient
+    pxor        m7, m7                      ; m7 = all-zero constant
+
+.loop:
+    movu        m0, [r1 + 0 * mmsize]       ; one source row = 32 int16 = four registers
+    movu        m1, [r1 + 1 * mmsize]
+    movu        m2, [r1 + 2 * mmsize]
+    movu        m3, [r1 + 3 * mmsize]
+    packsswb    m4, m0, m1                  ; signed saturation: a non-zero word never becomes a zero byte
+    packsswb    m5, m2, m3
+    pcmpeqb     m4, m7                      ; 0xFF (-1) in every byte whose coefficient is zero
+    pcmpeqb     m5, m7
+    paddb       m6, m4                      ; accumulate -(number of zeros) per byte lane
+    paddb       m6, m5
+
+    pmovsxwd    m4, m0                      ; low four words of the register already loaded
+    pmovsxwd    m5, [r1 + 0 * mmsize + mmsize/2]    ; high four words, re-read from memory
+    movu        [r0 + 0 * mmsize], m4
+    movu        [r0 + 1 * mmsize], m5
+    pmovsxwd    m4, m1
+    pmovsxwd    m5, [r1 + 1 * mmsize + mmsize/2]
+    movu        [r0 + 2 * mmsize], m4
+    movu        [r0 + 3 * mmsize], m5
+    pmovsxwd    m4, m2
+    pmovsxwd    m5, [r1 + 2 * mmsize + mmsize/2]
+    movu        [r0 + 4 * mmsize], m4
+    movu        [r0 + 5 * mmsize], m5
+    pmovsxwd    m4, m3
+    pmovsxwd    m5, [r1 + 3 * mmsize + mmsize/2]
+    movu        [r0 + 6 * mmsize], m4
+    movu        [r0 + 7 * mmsize], m5
+
+    add         r0, 8 * mmsize              ; dst advances by one packed row (32 int32)
+    add         r1, r2                      ; src advances by the byte stride
+    dec         r3d
+    jnz         .loop
+    ; each byte of m6 now holds -(zeros seen), magnitude <= 64 (2 adds x 32 rows), so no wrap so far
+    movhlps     m0, m6                      ; bring the high half down; the upper half of m0 is never read
+    paddb       m0, m6                      ; low 8 bytes = -(zeros) modulo 256, zeros in 0..128 per byte
+    paddb       m0, [pb_128]                ; bias by 128: each byte = 128 - zeros, exactly, in 0..128
+    psadbw      m0, m7                      ; low qword = sum of the 8 bytes = 1024 - total zeros
+    movd        eax, m0                     ; return value = number of non-zero coefficients
+    RET
+
+
+; uint32_t cvt16to32_cnt(int32_t *dst, int16_t *src, intptr_t stride);  32x32 block, AVX2 build.
+; Sign-extends 32 rows of 32 int16 (stride in int16 units) into packed int32 rows; returns the non-zero count.
+INIT_YMM avx2
+cglobal cvt16to32_cnt_32, 3,4,5
+    add         r2, r2                      ; stride in bytes; native-width add keeps every bit of the intptr_t
+    mov         r3d, 32                     ; r3 = rows left, one row per iteration
+    pxor        m3, m3                      ; m3 = byte counters, each gets -1 per zero coefficient
+    pxor        m4, m4                      ; m4 = all-zero constant
+
+.loop:
+    movu        m0, [r1 + 0 * mmsize]       ; coefficients 0-15 of the row
+    movu        m1, [r1 + 1 * mmsize]       ; coefficients 16-31 of the row
+    packsswb    m2, m0, m1                  ; saturating pack keeps non-zero values non-zero; order is irrelevant for counting
+    pcmpeqb     m2, m4                      ; 0xFF (-1) in every byte whose coefficient is zero
+    paddb       m3, m2                      ; accumulate -(number of zeros) per byte lane
+
+    pmovsxwd    m2, xm0                     ; coefficients 0-7 from the register already loaded
+    pmovsxwd    m0, [r1 + 0 * mmsize + mmsize/2]    ; coefficients 8-15, re-read from memory
+    movu        [r0 + 0 * mmsize], m2
+    movu        [r0 + 1 * mmsize], m0
+    pmovsxwd    m0, xm1                     ; coefficients 16-23
+    pmovsxwd    m1, [r1 + 1 * mmsize + mmsize/2]    ; coefficients 24-31
+    movu        [r0 + 2 * mmsize], m0
+    movu        [r0 + 3 * mmsize], m1
+
+    add         r0, 4 * mmsize              ; dst advances by one packed row (32 int32)
+    add         r1, r2                      ; src advances by the byte stride
+    dec         r3d
+    jnz         .loop
+    ; each byte of m3 now holds -(zeros seen), magnitude <= 32 (1 add x 32 rows), so no wrap so far
+    vextracti128 xm0, m3, 1                 ; upper 128 bits of the counters
+    paddb       xm0, xm3                    ; 16 bytes, magnitude <= 64 each
+    movhlps     xm3, xm0                    ; bring the high 8 bytes down
+    paddb       xm0, xm3                    ; low 8 bytes = -(zeros) modulo 256, zeros in 0..128 per byte
+    paddb       xm0, [pb_128]               ; bias by 128: each byte = 128 - zeros, exactly, in 0..128
+    psadbw      xm0, xm4                    ; low qword = sum of the 8 bytes = 1024 - total zeros
+    movd        eax, xm0                    ; return value = number of non-zero coefficients
+    RET
diff -r c473f49e2818 -r ca70276334d2 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Aug 05 21:41:53 2014 +0900
+++ b/source/common/x86/const-a.asm Tue Aug 05 17:35:05 2014 -0700
@@ -55,6 +55,7 @@
const pb_3, times 16 db 3
const pb_8, times 16 db 8
const pb_32, times 16 db 32
+const pb_128, times 16 db 128
const pb_shuf8x8c, db 0,0,0,0,2,2,2,2,4,4,4,4,6,6,6,6
const pw_2, times 8 dw 2
More information about the x265-devel
mailing list