[x265] [PATCH] copy_cnt_32: avx2 asm code as per new interface, improved 1521.17 cycles -> 934.46 cycles
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Sep 11 16:06:54 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1410444407 -19800
# Node ID 05162453203f955413aec5153a85ccdda1a3f519
# Parent df74723eb9a3861f6bba7f33d09a37efe53932a4
copy_cnt_32: avx2 asm code as per new interface, improved 1521.17 cycles -> 934.46 cycles
diff -r df74723eb9a3 -r 05162453203f source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Sep 11 19:08:24 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Sep 11 19:36:47 2014 +0530
@@ -1732,7 +1732,7 @@
p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
- // p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
+ p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
p.cvt32to16_shl[BLOCK_4x4] = x265_cvt32to16_shl_4_avx2;
diff -r df74723eb9a3 -r 05162453203f source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Sep 11 19:08:24 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Sep 11 19:36:47 2014 +0530
@@ -4268,42 +4268,47 @@
INIT_YMM avx2
-cglobal copy_cnt_32, 3,4,5
+cglobal copy_cnt_32, 3,5,5 ; copy 32 rows of 32 words from r1 (row step r2) to a packed buffer at r0, return the nonzero-word count in eax; only r0-r3 and m0-m4 are referenced below
add r2d, r2d
- mov r3d, 32/1
- xorpd m3, m3
+ mov r3d, 32/2 ; r3d = loop counter: 16 iterations, each handling two rows
+
+ movu m3, [pb_1] ; clamp value for pminub; presumably every byte of pb_1 is 1 (defined outside this hunk - confirm)
xorpd m4, m4
-.loop
+.loop: ; invariant: each byte of m4 holds a running count of nonzero words seen so far
; row 0
- movu m0, [r1 + 0 * mmsize]
- movu m1, [r1 + 1 * mmsize]
- packsswb m2, m0, m1
- pcmpeqb m2, m4
- paddb m3, m2
-
- pmovsxwd m2, xm0
- pmovsxwd m0, [r1 + 0 * mmsize + mmsize/2]
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m0
- pmovsxwd m0, xm1
- pmovsxwd m1, [r1 + 1 * mmsize + mmsize/2]
- movu [r0 + 2 * mmsize], m0
- movu [r0 + 3 * mmsize], m1
-
- add r0, 4 * mmsize
- add r1, r2
+ movu m0, [r1] ; first half of source row 0
+ movu [r0], m0 ; copied to the destination unchanged
+ movu m1, [r1 + 32] ; second half of source row 0
+ movu [r0 + 32], m1
+
+ packsswb m0, m1 ; signed-saturating word->byte pack: a nonzero word always yields a nonzero byte
+ pminub m0, m3 ; unsigned min with pb_1: each byte becomes 1 if its word was nonzero, else 0 (given pb_1 is all ones)
+
+ ; row 1 - same copy and flag steps for the next source row, at [r1 + r2]
+ movu m1, [r1 + r2] ; first half of source row 1
+ movu [r0 + 64], m1 ; destination rows are packed back to back, 64 bytes each
+ movu m2, [r1 + r2 + 32] ; second half of source row 1
+ movu [r0 + 96], m2
+
+ packsswb m1, m2 ; pack row 1 words to bytes, as for row 0
+ pminub m1, m3 ; row 1 flags: 0 or 1 per byte
+ paddb m0, m1 ; combine both rows' flags: each byte is 0..2
+ paddb m4, m0 ; accumulate; 16 iterations * 2 keeps every byte <= 32, so no byte overflow
+
+ add r0, 128 ; advance destination by the two 64-byte rows just written
+ lea r1, [r1 + 2 * r2] ; advance source by two rows
dec r3d
- jnz .loop
+ jnz .loop ; repeat until all 16 row pairs are done
; get count
- vextracti128 xm0, m3, 1
- paddb xm0, xm3
- movhlps xm3, xm0
- paddb xm0, xm3
- paddb xm0, [pb_128]
- psadbw xm0, xm4
- movd eax, xm0
+ xorpd m0, m0 ; zero operand for the psadbw horizontal sum below
+ vextracti128 xm1, m4, 1 ; xm1 = upper 128 bits of the byte counters
+ paddb xm4, xm1 ; fold upper half into lower half: each byte <= 64
+ psadbw xm4, xm0 ; sum of absolute differences against zero = one byte-sum per 64-bit half
+ movhlps xm1, xm4 ; bring the high-half sum down to the low position of xm1
+ paddd xm4, xm1 ; low dword = total number of nonzero words
+ movd eax, xm4 ; return the count in eax
RET
;-----------------------------------------------------------------------------
More information about the x265-devel
mailing list