[x265] [PATCH] asm: avx2 assembly code for 8bpp transpose32x32 module
Murugan Vairavel
murugan at multicorewareinc.com
Thu Oct 9 07:18:41 CEST 2014
Ok Min, I will update this and send a new patch.
On Wed, Oct 8, 2014 at 9:37 PM, chen <chenm003 at 163.com> wrote:
>
>
>
> At 2014-10-08 21:45:03, murugan at multicorewareinc.com wrote:
> ># HG changeset patch
> ># User Murugan Vairavel <murugan at multicorewareinc.com>
> ># Date 1412775097 -19800
> ># Wed Oct 08 19:01:37 2014 +0530
> ># Node ID a184652f22c6db55242dc2ec824463142bc05338
> ># Parent 52677ba0c69441688fbe83c926ae39d4d1a5422c
> >asm: avx2 assembly code for 8bpp transpose32x32 module
> >--- a/source/common/x86/pixel-util8.asm Wed Oct 08 14:24:30 2014 +0530
> >+++ b/source/common/x86/pixel-util8.asm Wed Oct 08 19:01:37 2014 +0530
> >@@ -1670,8 +1670,8 @@
> > ;-----------------------------------------------------------------
> > ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
> > ;-----------------------------------------------------------------
> >+%if HIGH_BIT_DEPTH
> > INIT_XMM sse2
> >-%if HIGH_BIT_DEPTH
> > cglobal transpose32, 3, 7, 4, dest, src, stride
> > add r2, r2
> > mov r3, r0
> >@@ -1739,7 +1739,9 @@
> > lea r0, [r6 + 24 * 64 + 48]
> > mov r3, r0
> > call transpose8_internal
> >+ RET
> > %else
> >+INIT_XMM sse2
> > cglobal transpose32, 3, 7, 8, dest, src, stride
> > mov r3, r0
> > mov r4, r1
> >@@ -1758,8 +1760,205 @@
> > lea r0, [r3 + 16 * 32 + 16]
> > mov r5, r0
> > call transpose16_internal
> >+ RET
> >+
> >+INIT_YMM avx2
> >+cglobal transpose32, 3, 5, 16
>
> this is X64 only
>
>
>
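Yes, the routine uses xmm8-xmm15, which only exist in 64-bit mode, so it
needs the 64-bit guard. In the updated patch I will wrap the AVX2 routine
roughly like this (a sketch, assuming the usual ARCH_X86_64 macro from
x86inc.asm):

%if ARCH_X86_64 == 1
INIT_YMM avx2
cglobal transpose32, 3, 5, 16
    ; ... transpose loop below, unchanged ...
%endif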
> >+ lea r3, [r2 * 3]
> >+ mov r4d, 2
> >+
> >+.loop:
> >+ movu m0, [r1]
> >+ movu m1, [r1 + r2]
> >+ movu m2, [r1 + 2 * r2]
> >+ movu m3, [r1 + r3]
> >+ lea r1, [r1 + 4 * r2]
> >+
> >+ movu m4, [r1]
> >+ movu m5, [r1 + r2]
> >+ movu m6, [r1 + 2 * r2]
> >+ movu m7, [r1 + r3]
> >+
> >+ punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
> >+ punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
> >+
> >+ punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
> >+ punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
> >+
> >+ punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
> >+ punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
> >+
> >+ punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
> >+ punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
> >+
> >+ punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
> >+ punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]
> >+
> >+ punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
> >+ punpckhwd m3, m5 ;[5 - 8 ; 20 - 24][5 6 7 8]
> >+
> >+ punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
> >+ punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]
> >+
> >+ punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
> >+ punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]
> >+
> >+ punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
> >+ punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
> >+
> >+ punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
> >+ punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
> >+
> >+ punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
> >+ punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
> >+
> >+ punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
> >+ punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
> >+
> >+ movq [r0 + 0 * 32], xm6
> >+ movhps [r0 + 1 * 32], xm6
> >+ vextracti128 xm4, m6, 1
> >+ movq [r0 + 16 * 32], xm4
> >+ movhps [r0 + 17 * 32], xm4
> >+
> >+ lea r1, [r1 + 4 * r2]
> >+ movu m9, [r1]
> >+ movu m10, [r1 + r2]
> >+ movu m11, [r1 + 2 * r2]
> >+ movu m12, [r1 + r3]
> >+ lea r1, [r1 + 4 * r2]
> >+
> >+ movu m13, [r1]
> >+ movu m14, [r1 + r2]
> >+ movu m15, [r1 + 2 * r2]
> >+ movu m6, [r1 + r3]
> >+
> >+ punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
> >+ punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
> >+
> >+ punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
> >+ punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
> >+
> >+ punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
> >+ punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
> >+
> >+ punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
> >+ punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
> >+
> >+ punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
> >+ punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]
> >+
> >+ punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
> >+ punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]
> >+
> >+ punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
> >+ punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
> >+
> >+ punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
> >+ punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
> >+
> >+ punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
> >+ punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
> >+
> >+ punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
> >+ punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
> >+
> >+ punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
> >+ punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
> >+
> >+ punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
> >+ punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
> >+
> >+
> >+ punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+ punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
> >+
> >+ movq [r0 + 0 * 32 + 8], xm15
> >+ movhps [r0 + 1 * 32 + 8], xm15
> >+ vextracti128 xm9, m15, 1
> >+ movq [r0 + 16 * 32 + 8], xm9
> >+ movhps [r0 + 17 * 32 + 8], xm9
> >+
> >+ movu [r0 + 2 * 32], xm13
> >+ vextracti128 xm9, m13, 1
> >+ movu [r0 + 18 * 32], xm9
> >+
> >+ movu [r0 + 3 * 32], xm7
> >+ vextracti128 xm9, m7, 1
> >+ movu [r0 + 19 * 32], xm9
>
> buf_trans is aligned to 32 bytes, so we can combine the vextracti128 and movu
>
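Good point: vextracti128 also accepts a memory operand as its destination,
so each of these store pairs can drop the xm9 temporary. For the m13 store
above, it would become something like:

    movu         [r0 + 2 * 32], xm13
    vextracti128 [r0 + 18 * 32], m13, 1

and the same for the remaining registers; I will fold that into the
updated patch as well.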
> >+
> >+ movu [r0 + 4 * 32], xm6
> >+ vextracti128 xm9, m6, 1
> >+ movu [r0 + 20 * 32], xm9
> >+
> >+ movu [r0 + 5 * 32], xm1
> >+ vextracti128 xm9, m1, 1
> >+ movu [r0 + 21 * 32], xm9
> >+
> >+ movu [r0 + 6 * 32], xm10
> >+ vextracti128 xm9, m10, 1
> >+ movu [r0 + 22 * 32], xm9
> >+
> >+ movu [r0 + 7 * 32], xm8
> >+ vextracti128 xm9, m8, 1
> >+ movu [r0 + 23 * 32], xm9
> >+
> >+ movu [r0 + 8 * 32], xm4
> >+ vextracti128 xm9, m4, 1
> >+ movu [r0 + 24 * 32], xm9
> >+
> >+ movu [r0 + 9 * 32], xm3
> >+ vextracti128 xm9, m3, 1
> >+ movu [r0 + 25 * 32], xm9
> >+
> >+ movu [r0 + 10 * 32], xm12
> >+ vextracti128 xm9, m12, 1
> >+ movu [r0 + 26 * 32], xm9
> >+
> >+ movu [r0 + 11 * 32], xm5
> >+ vextracti128 xm9, m5, 1
> >+ movu [r0 + 27 * 32], xm9
> >+
> >+ movu [r0 + 12 * 32], xm14
> >+ vextracti128 xm9, m14, 1
> >+ movu [r0 + 28 * 32], xm9
> >+
> >+ movu [r0 + 13 * 32], xm2
> >+ vextracti128 xm9, m2, 1
> >+ movu [r0 + 29 * 32], xm9
> >+
> >+ movu [r0 + 14 * 32], xm11
> >+ vextracti128 xm9, m11, 1
> >+ movu [r0 + 30 * 32], xm9
> >+
> >+ movu [r0 + 15 * 32], xm0
> >+ vextracti128 xm9, m0, 1
> >+ movu [r0 + 31 * 32], xm9
> >+
> >+ add r0, 16
> >+ lea r1, [r1 + 4 * r2]
> >+ dec r4d
> >+ jnz .loop
> >+ RET
> > %endif
> >- RET
>
--
With Regards,
Murugan. V
+919659287478