[x265] [PATCH] copy_cnt_8 AVX2 asm code, as per new interface
chen
chenm003 at 163.com
Tue Sep 9 19:15:17 CEST 2014
At 2014-09-09 20:23:25,praveen at multicorewareinc.com wrote:
># HG changeset patch
># User Praveen Tiwari
># Date 1410265389 -19800
># Node ID cbb7e2beff033e441a13cd82bbfd85b362cd6d24
># Parent d011073f35258cb2f0ad95db6038c2d9fb840b27
>copy_cnt_8 AVX2 asm code, as per new interface
>
>diff -r d011073f3525 -r cbb7e2beff03 source/common/x86/blockcopy8.asm
>--- a/source/common/x86/blockcopy8.asm Tue Sep 09 14:07:14 2014 +0530
>+++ b/source/common/x86/blockcopy8.asm Tue Sep 09 17:53:09 2014 +0530
>@@ -4076,85 +4076,49 @@
>
>
> INIT_YMM avx2
>-%if ARCH_X86_64 == 1
>-cglobal copy_cnt_8, 3,4,6
>- %define tmpd eax
>-%else
>-cglobal copy_cnt_8, 3,5,6
>- %define tmpd r4d
>-%endif
>+cglobal copy_cnt_8, 3,3,6
> add r2d, r2d
>- pxor m4, m4
>- lea r3, [r2 * 3]
>-
>- ; row 0
>+ xorpd m5, m5
>+
>+ ; row 0 - 1
> movu xm0, [r1]
>- mova xm2, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 0 * mmsize], m1
>-
>- ; row 1
>- movu xm0, [r1 + r2]
>- vinserti128 m2, m2, xm0, 1
>- pmovsxwd m1, xm0
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 2
>- movu xm0, [r1 + r2 * 2]
>- mova xm5, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 2 * mmsize], m1
>-
>- ; row 3
>- movu xm0, [r1 + r3]
>- vinserti128 m5, m5, xm0, 1
>- packsswb m2, m5
>- pcmpeqb m2, m4
>- pmovmskb tmpd, m2
>- not tmpd
>- popcnt tmpd, tmpd
>- pmovsxwd m1, xm0
>- movu [r0 + 3 * mmsize], m1
>-
>- add r0, 4 * mmsize
>- lea r1, [r1 + r2 * 4]
>-
>- ; row 4
>- movu xm0, [r1]
>- mova xm2, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 0 * mmsize], m1
>-
>- ; row 5
>- movu xm0, [r1 + r2]
>- vinserti128 m2, m2, xm0, 1
>- pmovsxwd m1, xm0
>- movu [r0 + 1 * mmsize], m1
>-
>- ; row 6
>- movu xm0, [r1 + r2 * 2]
>- mova xm5, xm0
>- pmovsxwd m1, xm0
>- movu [r0 + 2 * mmsize], m1
>-
>- ; row 7
>- movu xm0, [r1 + r3]
>- pmovsxwd m1, xm0
>- movu [r0 + 3 * mmsize], m1
>- vinserti128 m5, m5, xm0, 1
>+ movu xm1, [r1 + r2]
>+ vinserti128 m0, m0, xm1, 1
>+ movu [r0], m0
>+
>+ ; row 2 - 3
>+ movu xm1, [r1 + r2 * 2]
>+ lea r1, [r1 + r2 * 2]
>+ movu xm2, [r1 + r2]
>+ vinserti128 m1, m1, xm2, 1
>+ movu [r0 + 32], m1
>+
>+ ; row 4 - 5
>+ movu xm2, [r1 + r2 * 2]
>+ lea r1, [r1 + r2 * 2]
>+ movu xm3, [r1 + r2]
>+ vinserti128 m2, m2, xm3, 1
>+ movu [r0 + 64], m2
>+
>+ ; row 6 - 7
>+ movu xm3, [r1 + r2 * 2]
>+ lea r1, [r1 + r2 * 2]
>+ movu xm4, [r1 + r2]
>+ vinserti128 m3, m3, xm4, 1
>+ movu [r0 + 96], m3
>
> ; get count
>- packsswb m2, m5
>- pcmpeqb m2, m4
>- pmovmskb r0d, m2
>- not r0d
>- popcnt r0d, r0d
>-
>-%if ARCH_X86_64 == 1
>- add tmpd, r0d
>-%else
>- add r0d, tmpd
>-%endif
>+ vpacksswb m0, m1
>+ vpcmpeqb m0, m5
>+ vpmovmskb eax, m0
>+ not eax
>+ popcnt eax, eax
>+ vpacksswb m2, m3
>+ vpcmpeqb m2, m5
>+ vpmovmskb r1d, m2
>+ not r1d
>+ popcnt r1d, r1d
>+ add eax, r1d
> RET
>
>
popcnt is an expensive instruction, and in this case you don't need it;
much of the algorithm also needs to be modified. See the reference code below:
; copy_cnt_8 (AVX2): copy eight 16-byte rows from the source at r1 to 128
; contiguous bytes at r0, and return in eax how many of the 64 16-bit words
; copied are non-zero.
;   r0 = destination (rows stored back to back, 16 bytes each)
;   r1 = source
;   r2 = source stride; doubled on entry, so presumably passed in 16-bit
;        elements rather than bytes -- TODO confirm against the C prototype
; Scratch: r3, m0-m4. Relies on the constant pb_1, which is defined outside
; this snippet (assumed to be a vector with every byte equal to 1).
INIT_YMM avx2
cglobal copy_cnt_8, 3,4,5
add r2d, r2d                            ; r2 = 2 * stride (byte stride, see note above)
lea r3, [r2 * 3]                        ; r3 = 3 * r2, offset of rows 3 and 7 from the row base
; row 0 - 1 (row 0 in the low 128 bits of m0, row 1 in the high 128 bits)
movu xm0, [r1]
vinserti128 m0, m0, [r1 + r2], 1
movu [r0], m0
; row 2 - 3 (kept in m1 for the count below)
movu xm1, [r1 + r2 * 2]
vinserti128 m1, m1, [r1 + r3], 1
movu [r0 + 32], m1
lea r1, [r1 + r2 * 4]                   ; advance the source pointer by four rows
; row 4 - 5 (kept in m2)
movu xm2, [r1]
vinserti128 m2, m2, [r1 + r2], 1
movu [r0 + 64], m2
; row 6 - 7 (kept in m3)
movu xm3, [r1 + r2 * 2]
vinserti128 m3, m3, [r1 + r3], 1
movu [r0 + 96], m3
; get count
; Strategy: turn every word into a 0/1 byte flag, then add all 64 flags.
xorpd m4, m4                            ; m4 = 0, the zero operand for psadbw
vpacksswb m0, m1                        ; rows 0-3: words -> signed-saturated bytes; non-zero words stay non-zero
vpacksswb m2, m3                        ; rows 4-7: same (per-lane byte order is irrelevant for counting)
pminub m0, [pb_1]                       ; unsigned min with 1: each byte becomes 0 or 1
pminub m2, [pb_1]
paddb m0, m2                            ; per-byte sums, at most 2
vextracti128 xm1, m0, 1                 ; xm1 = high 128 bits of m0
paddb xm0, xm1                          ; fold to 16 bytes, each at most 4 (no byte overflow)
psadbw xm0, xm4                         ; sum each group of 8 bytes into its 64-bit half
movhlps xm1, xm0                        ; xm1 low half = upper partial sum
paddd xm0, xm1                          ; low dword = total number of non-zero words
movd eax, xm0                           ; return the count
RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140910/72e2bdde/attachment-0001.html>
More information about the x265-devel
mailing list