[x265] [PATCH] copy_cnt_16: avx2 asm code, improved 514.32 cycles -> 313.66 cycles
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Sep 18 13:08:23 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1411038475 -19800
# Node ID 532f798f98d7c7f5c493a819046a45e29b2da16a
# Parent e723ecc1e5c99c451cbc8034514b9dc590a2d4ef
copy_cnt_16: avx2 asm code, improved 514.32 cycles -> 313.66 cycles
diff -r e723ecc1e5c9 -r 532f798f98d7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Sep 18 15:30:18 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Sep 18 16:37:55 2014 +0530
@@ -1730,7 +1730,7 @@
* code is updated, avx2 version will be enabled */
p.copy_cnt[BLOCK_8x8] = x265_copy_cnt_8_avx2;
- // p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
+ p.copy_cnt[BLOCK_16x16] = x265_copy_cnt_16_avx2;
// p.copy_cnt[BLOCK_32x32] = x265_copy_cnt_32_avx2;
diff -r e723ecc1e5c9 -r 532f798f98d7 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Sep 18 15:30:18 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Sep 18 16:37:55 2014 +0530
@@ -4159,69 +4159,48 @@
INIT_YMM avx2
-cglobal copy_cnt_16, 3,5,5
+cglobal copy_cnt_16, 3, 5, 5
add r2d, r2d
- lea r4, [r2 * 3]
- mov r3d, 16/4
- ; NOTE: xorpd is faster than pxor
+ lea r3, [r2 * 3]
+ mov r4d, 16/4
+
+ mova m3, [pb_1]
xorpd m4, m4
- xorpd m3, m3
-
-.loop
- ; row 0
+
+.loop:
+ ; row 0 - 1
movu m0, [r1]
- movu xm1, [r1 + mmsize/2]
- pmovsxwd m2, xm0
- pmovsxwd m1, xm1
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m1
-
- ; row 1
+ movu [r0], m0
movu m1, [r1 + r2]
- movu xm2, [r1 + r2 + mmsize/2]
+ movu [r0 + 32], m1
+
packsswb m0, m1
- pcmpeqb m0, m3
+ pminub m0, m3
+
+ ; row 2 - 3
+ movu m1, [r1 + r2 * 2]
+ movu [r0 + 64], m1
+ movu m2, [r1 + r3]
+ movu [r0 + 96], m2
+
+ packsswb m1, m2
+ pminub m1, m3
+ paddb m0, m1
paddb m4, m0
- pmovsxwd m1, xm1
- pmovsxwd m2, xm2
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m2
-
- ; move output pointer here to avoid 128 bytes offset limit
- add r0, 4 * mmsize
-
- ; row 2
- movu m0, [r1 + r2 * 2]
- movu xm1, [r1 + r2 * 2 + mmsize/2]
- pmovsxwd m2, xm0
- pmovsxwd m1, xm1
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m1
-
- ; row 3
- movu m1, [r1 + r4]
- movu xm2, [r1 + r4 + mmsize/2]
- packsswb m0, m1
- pcmpeqb m0, m3
- paddb m4, m0
- pmovsxwd m1, xm1
- pmovsxwd m2, xm2
- movu [r0 + 2 * mmsize], m1
- movu [r0 + 3 * mmsize], m2
-
- add r0, 4 * mmsize
- lea r1, [r1 + r2 * 4]
- dec r3d
- jnz .loop
+
+ add r0, 128
+ lea r1, [r1 + 4 * r2]
+ dec r4d
+ jnz .loop
; get count
- vextracti128 xm0, m4, 1
- paddb xm0, xm4
- movhlps xm1, xm0
- paddb xm0, xm1
- paddb xm0, [pb_32]
- psadbw xm0, xm3
- movd eax, xm0
+ xorpd m0, m0
+ vextracti128 xm1, m4, 1
+ paddb xm4, xm1
+ psadbw xm4, xm0
+ movhlps xm1, xm4
+ paddd xm4, xm1
+ movd eax, xm4
RET
;--------------------------------------------------------------------------------------
diff -r e723ecc1e5c9 -r 532f798f98d7 source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Thu Sep 18 15:30:18 2014 +0530
+++ b/source/common/x86/const-a.asm Thu Sep 18 16:37:55 2014 +0530
@@ -29,6 +29,8 @@
SECTION_RODATA 32
+const pb_1, times 32 db 1
+
const hsub_mul, times 16 db 1, -1
const pw_1, times 16 dw 1
const pw_16, times 16 dw 16
@@ -53,7 +55,6 @@
const pb_64, times 16 db 64
const pb_01, times 8 db 0,1
const pb_0, times 16 db 0
-const pb_1, times 32 db 1
const pb_a1, times 16 db 0xa1
const pb_3, times 16 db 3
const pb_8, times 16 db 8
More information about the x265-devel mailing list