[x265] [PATCH] asm: chroma_vpp[64x64] for colorspace i444 in avx2: improve 15351c->7344c
chen
chenm003 at 163.com
Fri Dec 12 19:28:50 CET 2014
Sorry, the earlier email failed to send, so I am sending it to you again.
At 2014-12-10 19:10:13,"Divya Manivannan" <divya at multicorewareinc.com> wrote:
># HG changeset patch
># User Divya Manivannan <divya at multicorewareinc.com>
># Date 1418209768 -19800
># Wed Dec 10 16:39:28 2014 +0530
># Node ID 016eb6ee9606a8d160b4c33c40e39b85c419d890
># Parent 45a3df8b653c78b1c1d1cde5bd4dcab628eb537f
>asm: chroma_vpp[64x64] for colorspace i444 in avx2: improve 15351c->7344c
>
>diff -r 45a3df8b653c -r 016eb6ee9606 source/common/x86/ipfilter8.asm
>--- a/source/common/x86/ipfilter8.asm Wed Dec 10 16:12:52 2014 +0530
>+++ b/source/common/x86/ipfilter8.asm Wed Dec 10 16:39:28 2014 +0530
>@@ -4235,6 +4235,103 @@
> FILTER_V4_W16n_H2 48, 64
> FILTER_V4_W16n_H2 64, 16
>
>+INIT_YMM avx2
>+%if ARCH_X86_64 == 1
>+cglobal interp_4tap_vert_pp_64x64, 4, 9, 13
>+ mov r4d, r4m
>+ shl r4d, 6
>+
>+%ifdef PIC
>+ lea r5, [tab_ChromaCoeffVer_32]
>+ add r5, r4
>+%else
>+ lea r5, [tab_ChromaCoeffVer_32 + r4]
>+%endif
>+
>+ mova m10, [r5]
>+ mova m11, [r5 + mmsize]
>+ lea r4, [r1 * 3]
>+ sub r0, r1
>+ lea r5, [r3 * 3]
>+ mova m12, [pw_512]
>+ mov r6d, 16
>+.loopH:
>+ mov r8d, 2
This loop runs only 2 times, so we can use %rep to unroll it.
>+.loopW:
>+ movu m0, [r0] ; m0 = row 0
>+ movu m1, [r0 + r1] ; m1 = row 1
>+ punpcklbw m2, m0, m1
>+ punpckhbw m3, m0, m1
>+ pmaddubsw m2, m10
>+ pmaddubsw m3, m10
>+ movu m0, [r0 + r1 * 2] ; m0 = row 2
>+ punpcklbw m4, m1, m0
>+ punpckhbw m5, m1, m0
>+ pmaddubsw m4, m10
>+ pmaddubsw m5, m10
>+ movu m1, [r0 + r4] ; m1 = row 3
>+ punpcklbw m6, m0, m1
>+ punpckhbw m7, m0, m1
>+ pmaddubsw m8, m6, m11
>+ pmaddubsw m9, m7, m11
>+ pmaddubsw m6, m10
>+ pmaddubsw m7, m10
>+ paddw m2, m8
>+ paddw m3, m9
>+ pmulhrsw m2, m12
>+ pmulhrsw m3, m12
>+ packuswb m2, m3
>+ movu [r2], m2
>+
>+ lea r7, [r0 + r1 * 4]
You need r7 only because the inner loop is not unrolled, so r0 has to be preserved.
>+ movu m0, [r7] ; m0 = row 4
>+ punpcklbw m2, m1, m0
>+ punpckhbw m3, m1, m0
>+ pmaddubsw m8, m2, m11
>+ pmaddubsw m9, m3, m11
>+ pmaddubsw m2, m10
>+ pmaddubsw m3, m10
>+ paddw m4, m8
>+ paddw m5, m9
>+ pmulhrsw m4, m12
>+ pmulhrsw m5, m12
>+ packuswb m4, m5
>+ movu [r2 + r3], m4
>+
>+ movu m1, [r7 + r1] ; m1 = row 5
>+ punpcklbw m4, m0, m1
>+ punpckhbw m5, m0, m1
>+ pmaddubsw m4, m11
>+ pmaddubsw m5, m11
>+ paddw m6, m4
>+ paddw m7, m5
>+ pmulhrsw m6, m12
>+ pmulhrsw m7, m12
>+ packuswb m6, m7
>+ movu [r2 + r3 * 2], m6
>+
>+ movu m0, [r7 + r1 * 2] ; m0 = row 6
>+ punpcklbw m6, m1, m0
>+ punpckhbw m7, m1, m0
>+ pmaddubsw m6, m11
>+ pmaddubsw m7, m11
>+ paddw m2, m6
>+ paddw m3, m7
>+ pmulhrsw m2, m12
>+ pmulhrsw m3, m12
>+ packuswb m2, m3
>+ movu [r2 + r5], m2
>+
>+ add r2, 32
>+ add r0, 32
>+ dec r8d
>+ jnz .loopW
>+ lea r0, [r7 - 32]
>+ lea r2, [r2 + r3 * 4 - 64]
>+ dec r6d
>+ jnz .loopH
The first (inner) pass is horizontal and the second (outer) pass is vertical; with this ordering we can't reuse already-loaded row pixels to reduce memory operations, so I suggest you modify your algorithm.
>+ RET
>+%endif
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20141213/ca939d44/attachment-0001.html>
More information about the x265-devel
mailing list