[x265] [PATCH] copy_cnt_8 AVX2 asm code, as per new interface
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Tue Sep 9 14:23:25 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1410265389 -19800
# Node ID cbb7e2beff033e441a13cd82bbfd85b362cd6d24
# Parent d011073f35258cb2f0ad95db6038c2d9fb840b27
copy_cnt_8 AVX2 asm code, as per new interface
diff -r d011073f3525 -r cbb7e2beff03 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Tue Sep 09 14:07:14 2014 +0530
+++ b/source/common/x86/blockcopy8.asm Tue Sep 09 17:53:09 2014 +0530
@@ -4076,85 +4076,49 @@
INIT_YMM avx2
-%if ARCH_X86_64 == 1
-cglobal copy_cnt_8, 3,4,6
- %define tmpd eax
-%else
-cglobal copy_cnt_8, 3,5,6
- %define tmpd r4d
-%endif
+cglobal copy_cnt_8, 3,3,6 ; copies 8 rows of 16 bytes from r1 (row pitch r2) to r0 packed back to back, leaves the number of nonzero 16-bit words in eax; presumably x86inc args/gprs/vector regs = 3/3/6 (m0-m5 are used below) - confirm
add r2d, r2d
- pxor m4, m4
- lea r3, [r2 * 3]
-
- ; row 0
+ xorpd m5, m5 ; m5 = 0, the comparand for both vpcmpeqb zero tests below. NOTE(review): the removed code zeroed with integer pxor; xorpd is the floating-point form - confirm this is intended
+
+ ; row 0 - 1: each row is loaded as one 128-bit half, two rows are merged into one ymm and stored with a single 32-byte write
movu xm0, [r1]
- mova xm2, xm0
- pmovsxwd m1, xm0
- movu [r0 + 0 * mmsize], m1
-
- ; row 1
- movu xm0, [r1 + r2]
- vinserti128 m2, m2, xm0, 1
- pmovsxwd m1, xm0
- movu [r0 + 1 * mmsize], m1
-
- ; row 2
- movu xm0, [r1 + r2 * 2]
- mova xm5, xm0
- pmovsxwd m1, xm0
- movu [r0 + 2 * mmsize], m1
-
- ; row 3
- movu xm0, [r1 + r3]
- vinserti128 m5, m5, xm0, 1
- packsswb m2, m5
- pcmpeqb m2, m4
- pmovmskb tmpd, m2
- not tmpd
- popcnt tmpd, tmpd
- pmovsxwd m1, xm0
- movu [r0 + 3 * mmsize], m1
-
- add r0, 4 * mmsize
- lea r1, [r1 + r2 * 4]
-
- ; row 4
- movu xm0, [r1]
- mova xm2, xm0
- pmovsxwd m1, xm0
- movu [r0 + 0 * mmsize], m1
-
- ; row 5
- movu xm0, [r1 + r2]
- vinserti128 m2, m2, xm0, 1
- pmovsxwd m1, xm0
- movu [r0 + 1 * mmsize], m1
-
- ; row 6
- movu xm0, [r1 + r2 * 2]
- mova xm5, xm0
- pmovsxwd m1, xm0
- movu [r0 + 2 * mmsize], m1
-
- ; row 7
- movu xm0, [r1 + r3]
- pmovsxwd m1, xm0
- movu [r0 + 3 * mmsize], m1
- vinserti128 m5, m5, xm0, 1
+ movu xm1, [r1 + r2] ; row 1 (r2 was doubled on entry; presumably a pitch in 16-bit elements turned into bytes - confirm)
+ vinserti128 m0, m0, xm1, 1 ; m0 = row 0 in the low lane, row 1 in the high lane
+ movu [r0], m0 ; rows 0-1 written at r0; m0 stays live for the count below
+
+ ; row 2 - 3
+ movu xm1, [r1 + r2 * 2] ; row 2
+ lea r1, [r1 + r2 * 2] ; advance r1 by two rows so it now addresses row 2
+ movu xm2, [r1 + r2] ; row 3
+ vinserti128 m1, m1, xm2, 1 ; m1 = row 2 low lane, row 3 high lane
+ movu [r0 + 32], m1 ; rows 2-3 follow rows 0-1 directly in the destination
+
+ ; row 4 - 5
+ movu xm2, [r1 + r2 * 2] ; row 4
+ lea r1, [r1 + r2 * 2] ; r1 now addresses row 4
+ movu xm3, [r1 + r2] ; row 5
+ vinserti128 m2, m2, xm3, 1 ; m2 = row 4 low lane, row 5 high lane
+ movu [r0 + 64], m2 ; rows 4-5
+
+ ; row 6 - 7
+ movu xm3, [r1 + r2 * 2] ; row 6
+ lea r1, [r1 + r2 * 2] ; r1 now addresses row 6
+ movu xm4, [r1 + r2] ; row 7
+ vinserti128 m3, m3, xm4, 1 ; m3 = row 6 low lane, row 7 high lane
+ movu [r0 + 96], m3 ; rows 6-7; last access through r0
; get count
- packsswb m2, m5
- pcmpeqb m2, m4
- pmovmskb r0d, m2
- not r0d
- popcnt r0d, r0d
-
-%if ARCH_X86_64 == 1
- add tmpd, r0d
-%else
- add r0d, tmpd
-%endif
+ vpacksswb m0, m1 ; rows 0-3: words -> bytes with signed saturation, so a nonzero word stays a nonzero byte (byte order is irrelevant for counting)
+ vpcmpeqb m0, m5 ; 0xFF in every byte that equals zero
+ vpmovmskb eax, m0 ; one mask bit per byte (32 bits); eax is first written here, after the final store through r0
+ not eax ; set bits now mark the nonzero bytes
+ popcnt eax, eax ; eax = nonzero count for rows 0-3
+ vpacksswb m2, m3 ; rows 4-7: same narrowing as above
+ vpcmpeqb m2, m5 ; 0xFF in every zero byte
+ vpmovmskb r1d, m2 ; r1 is free as scratch: all source loads are finished
+ not r1d ; set bits now mark the nonzero bytes
+ popcnt r1d, r1d ; r1d = nonzero count for rows 4-7
+ add eax, r1d ; eax = total nonzero count over all 8 rows
RET
More information about the x265-devel mailing list