[x265] [PATCH] copy_cnt_32, AVX2 asm code as per new interface, performance improved from 16.81x to 32.16x on HASWELL-I5
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Wed Sep 10 11:15:43 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1410340533 -19800
# Node ID 7bc4db02ccc728f6e2ddedd036c96e3d37b90f22
# Parent c5b3e04e4eba2fcc4298c225d11ab25e0da82558
copy_cnt_32, AVX2 asm code as per new interface, performance improved from 16.81x to 32.16x on HASWELL-I5
diff -r c5b3e04e4eba -r 7bc4db02ccc7 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Wed Sep 10 13:35:30 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Wed Sep 10 14:45:33 2014 +0530
@@ -4317,42 +4317,68 @@
INIT_YMM avx2
-cglobal copy_cnt_32, 3,4,5
+cglobal copy_cnt_32, 3,5,7 ; copies 32 rows of 64 bytes from [r1] (row pitch r2) to [r0] (contiguous) and returns in eax the number of non-zero 16-bit values; x86inc cglobal: 3 args, 5 GPRs, 7 vector regs
add r2d, r2d
- mov r3d, 32/1
- xorpd m3, m3
- xorpd m4, m4
-
-.loop
+ lea r3, [r2 * 3] ; r3 = 3 * r2 (r2 was doubled just above), offset of the 4th row of each group
+ mov r4d, 32/4 ; loop counter: 32 rows, 4 rows handled per iteration
+
+ xorpd m5, m5 ; m5 = 0, used as the zero operand of psadbw after the loop
+ xorpd m6, m6 ; m6 = running per-byte totals of non-zero flags
+
+.loop: ; each pass copies 4 rows and adds their non-zero flags into m6
; row 0
- movu m0, [r1 + 0 * mmsize]
- movu m1, [r1 + 1 * mmsize]
- packsswb m2, m0, m1
- pcmpeqb m2, m4
- paddb m3, m2
-
- pmovsxwd m2, xm0
- pmovsxwd m0, [r1 + 0 * mmsize + mmsize/2]
- movu [r0 + 0 * mmsize], m2
- movu [r0 + 1 * mmsize], m0
- pmovsxwd m0, xm1
- pmovsxwd m1, [r1 + 1 * mmsize + mmsize/2]
- movu [r0 + 2 * mmsize], m0
- movu [r0 + 3 * mmsize], m1
-
- add r0, 4 * mmsize
- add r1, r2
- dec r3d
- jnz .loop
+ movu m0, [r1] ; first 32 bytes of the row
+ movu [r0], m0 ; stored unchanged
+ movu m1, [r1 + 32] ; second 32 bytes of the row
+ movu [r0 + 32], m1
+
+ vpacksswb m0, m1 ; signed-saturating word -> byte pack: a non-zero word stays a non-zero byte
+ pminub m0, [pb_1] ; clamp every byte to 0 or 1 (assumes pb_1 is a vector of byte 1s defined outside this hunk - confirm)
+
+ ; row 1
+ movu m1, [r1 + r2]
+ movu [r0 + 64], m1
+ movu m2, [r1 + r2 + 32]
+ movu [r0 + 96], m2
+
+ vpacksswb m1, m2
+ pminub m1, [pb_1]
+ paddb m0, m1 ; m0 = flags of row 0 + row 1
+
+ ; row 2
+ movu m2, [r1 + r2 * 2]
+ movu [r0 + 128], m2
+ movu m3, [r1 + r2 * 2 + 32]
+ movu [r0 + 160], m3
+
+ vpacksswb m2, m3
+ pminub m2, [pb_1]
+
+ ; row 3
+ movu m3, [r1 + r3]
+ movu [r0 + 192], m3
+ movu m4, [r1 + r3 + 32]
+ movu [r0 + 224], m4
+
+ vpacksswb m3, m4
+ pminub m3, [pb_1]
+ paddb m2, m3 ; m2 = flags of row 2 + row 3
+
+ paddb m0, m2 ; per-byte flag sum of the 4 rows (at most 4 given the 0/1 clamp)
+ paddb m6, m0 ; accumulate: 8 iterations * 4 = at most 32 per byte, so the 8-bit lanes cannot wrap
+
+ add r0, 256 ; destination advances by the 4 rows * 64 bytes just written
+ lea r1, [r1 + 4 * r2] ; source advances by 4 row pitches
+ dec r4d
+ jnz .loop
; get count
- vextracti128 xm0, m3, 1
- paddb xm0, xm3
- movhlps xm3, xm0
- paddb xm0, xm3
- paddb xm0, [pb_128]
- psadbw xm0, xm4
- movd eax, xm0
+ vextracti128 xm1, m6, 1 ; upper 128-bit lane of the byte totals
+ paddb xm6, xm1 ; fold the two lanes together (at most 64 per byte)
+ psadbw xm6, xm5 ; |x - 0| summed over each group of 8 bytes -> one partial sum per 64-bit half
+ movhlps xm1, xm6 ; move the high half's partial sum to the low half of xm1
+ paddd xm6, xm1 ; low dword = total number of non-zero values
+ movd eax, xm6 ; return the count
RET
;-----------------------------------------------------------------------------
More information about the x265-devel mailing list