[x265] [PATCH] copy_cnt_8, AVX2 asm code as per new interface, performance improved from 5.13x to 7.59x on HASWELL-I5
chen
chenm003 at 163.com
Wed Sep 10 23:08:00 CEST 2014
It's right.
At 2014-09-10 13:53:52,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1410328371 -19800
># Node ID d29cb300975a491287abdfb6abd2a9d3141e99f0
># Parent 408e2e6f0f709525cedb784a65386a116f2d3d00
>copy_cnt_8, AVX2 asm code as per new interface, performance improved from 5.13x to 7.59x on HASWELL-I5
>
>diff -r 408e2e6f0f70 -r d29cb300975a source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Tue Sep 09 22:23:26 2014 +0200
>+++ b/source/common/x86/blockcopy8.asm Wed Sep 10 11:22:51 2014 +0530
>@@ -4079,85 +4079,44 @@
>
>
> INIT_YMM avx2
>-%if ARCH_X86_64 == 1
>-cglobal copy_cnt_8, 3,4,6
>- %define tmpd eax
>-%else
>-cglobal copy_cnt_8, 3,5,6
>- %define tmpd r4d
>-%endif
>+cglobal copy_cnt_8, 3,4,5
> add r2d, r2d
>- pxor m4, m4
> lea r3, [r2 * 3]
>
>- ; row 0
>+ ; row 0 - 1
> movu xm0, [r1]
>- mova xm2, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 0 * mmsize], m1
>-
>- ; row 1
>- movu xm0, [r1 + r2]
>- vinserti128 m2, m2, xm0, 1
>- pmovsxwd m1, xm0
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 2
>- movu xm0, [r1 + r2 * 2]
>- mova xm5, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 2 * mmsize], m1
>-
>- ; row 3
>- movu xm0, [r1 + r3]
>- vinserti128 m5, m5, xm0, 1
>- packsswb m2, m5
>- pcmpeqb m2, m4
>- pmovmskb tmpd, m2
>- not tmpd
>- popcnt tmpd, tmpd
>- pmovsxwd m1, xm0
>- movu [r0 + 3 * mmsize], m1
>-
>- add r0, 4 * mmsize
>- lea r1, [r1 + r2 * 4]
>-
>- ; row 4
>- movu xm0, [r1]
>- mova xm2, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 0 * mmsize], m1
>-
>- ; row 5
>- movu xm0, [r1 + r2]
>- vinserti128 m2, m2, xm0, 1
>- pmovsxwd m1, xm0
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 6
>- movu xm0, [r1 + r2 * 2]
>- mova xm5, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 2 * mmsize], m1
>-
>- ; row 7
>- movu xm0, [r1 + r3]
>- pmovsxwd m1, xm0
>- movu [r0 + 3 * mmsize], m1
>- vinserti128 m5, m5, xm0, 1
>+ vinserti128 m0, m0, [r1 + r2], 1
>+ movu [r0], m0
>+
>+ ; row 2 - 3
>+ movu xm1, [r1 + r2 * 2]
>+ vinserti128 m1, m1, [r1 + r3], 1
>+ movu [r0 + 32], m1
>+ lea r1, [r1 + r2 * 4]
>+
>+ ; row 4 - 5
>+ movu xm2, [r1]
>+ vinserti128 m2, m2, [r1 + r2], 1
>+ movu [r0 + 64], m2
>+
>+ ; row 6 - 7
>+ movu xm3, [r1 + r2 * 2]
>+ vinserti128 m3, m3, [r1 + r3], 1
>+ movu [r0 + 96], m3
>
> ; get count
>- packsswb m2, m5
>- pcmpeqb m2, m4
>- pmovmskb r0d, m2
>- not r0d
>- popcnt r0d, r0d
>-
>-%if ARCH_X86_64 == 1
>- add tmpd, r0d
>-%else
>- add r0d, tmpd
>-%endif
>+ xorpd m4, m4
>+ vpacksswb m0, m1
>+ vpacksswb m2, m3
>+ pminub m0, [pb_1]
>+ pminub m2, [pb_1]
>+ paddb m0, m2
>+ vextracti128 xm1, m0, 1
>+ paddb xm0, xm1
>+ psadbw xm0, xm4
>+ movhlps xm1, xm0
>+ paddd xm0, xm1
>+ movd eax, xm0
> RET
>
>
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140911/444adf72/attachment.html>
More information about the x265-devel
mailing list