At 2014-10-08 21:45:03, murugan@multicorewareinc.com wrote:
># HG changeset patch
># User Murugan Vairavel <murugan@multicorewareinc.com>
># Date 1412775097 -19800
># Wed Oct 08 19:01:37 2014 +0530
># Node ID a184652f22c6db55242dc2ec824463142bc05338
># Parent 52677ba0c69441688fbe83c926ae39d4d1a5422c
>asm: avx2 assembly code for 8bpp transpose32x32 module
>
>--- a/source/common/x86/pixel-util8.asm Wed Oct 08 14:24:30 2014 +0530
>+++ b/source/common/x86/pixel-util8.asm Wed Oct 08 19:01:37 2014 +0530
>@@ -1670,8 +1670,8 @@
> ;-----------------------------------------------------------------
> ; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)
> ;-----------------------------------------------------------------
>+%if HIGH_BIT_DEPTH
> INIT_XMM sse2
>-%if HIGH_BIT_DEPTH
> cglobal transpose32, 3, 7, 4, dest, src, stride
> add r2, r2
> mov r3, r0
>@@ -1739,7 +1739,9 @@
> lea r0, [r6 + 24 * 64 + 48]
> mov r3, r0
> call transpose8_internal
>+ RET
> %else
>+INIT_XMM sse2
> cglobal transpose32, 3, 7, 8, dest, src, stride
> mov r3, r0
> mov r4, r1
>@@ -1758,8 +1760,205 @@
> lea r0, [r3 + 16 * 32 + 16]
> mov r5, r0
> call transpose16_internal
>+ RET
>+
>+INIT_YMM avx2
>+cglobal transpose32, 3, 5, 16
>+ lea r3, [r2 * 3]

this is X64 only (it asks for 16 SIMD registers and uses ymm8 - ymm15, which only exist in 64-bit mode), so it needs a 64-bit guard.
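
A minimal sketch of the guard, assuming the usual x86inc ARCH_X86_64 macro (the existing loop body stays unchanged where the dots are):

    %if ARCH_X86_64
    INIT_YMM avx2
    cglobal transpose32, 3, 5, 16
        ; ymm8 - ymm15 and the 16-register request are only legal in 64-bit mode
        ...
        RET
    %endif
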
>+ mov r4d, 2
>+
>+.loop:
>+ movu m0, [r1]
>+ movu m1, [r1 + r2]
>+ movu m2, [r1 + 2 * r2]
>+ movu m3, [r1 + r3]
>+ lea r1, [r1 + 4 * r2]
>+
>+ movu m4, [r1]
>+ movu m5, [r1 + r2]
>+ movu m6, [r1 + 2 * r2]
>+ movu m7, [r1 + r3]
>+
>+ punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]
>+ punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]
>+
>+ punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]
>+ punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]
>+
>+ punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]
>+ punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]
>+
>+ punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]
>+ punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]
>+
>+ punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]
>+ punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]
>+
>+ punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]
>+ punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8]
>+
>+ punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]
>+ punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]
>+
>+ punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]
>+ punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]
>+
>+ punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]
>+ punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]
>+
>+ punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]
>+ punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]
>+
>+ punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]
>+ punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]
>+
>+ punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]
>+ punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]
>+
>+ movq [r0 + 0 * 32], xm6
>+ movhps [r0 + 1 * 32], xm6
>+ vextracti128 xm4, m6, 1
>+ movq [r0 + 16 * 32], xm4
>+ movhps [r0 + 17 * 32], xm4
>+
>+ lea r1, [r1 + 4 * r2]
>+ movu m9, [r1]
>+ movu m10, [r1 + r2]
>+ movu m11, [r1 + 2 * r2]
>+ movu m12, [r1 + r3]
>+ lea r1, [r1 + 4 * r2]
>+
>+ movu m13, [r1]
>+ movu m14, [r1 + r2]
>+ movu m15, [r1 + 2 * r2]
>+ movu m6, [r1 + r3]
>+
>+ punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]
>+ punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]
>+
>+ punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]
>+ punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]
>+
>+ punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]
>+ punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]
>+
>+ punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]
>+ punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]
>+
>+ punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]
>+ punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]
>+
>+ punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]
>+ punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]
>+
>+ punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]
>+ punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]
>+
>+ punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]
>+ punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]
>+
>+ punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]
>+ punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]
>+
>+ punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]
>+ punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]
>+
>+ punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]
>+ punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]
>+
>+ punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]
>+ punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]
>+
>+
>+ punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+ punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]
>+
>+ movq [r0 + 0 * 32 + 8], xm15
>+ movhps [r0 + 1 * 32 + 8], xm15
>+ vextracti128 xm9, m15, 1
>+ movq [r0 + 16 * 32 + 8], xm9
>+ movhps [r0 + 17 * 32 + 8], xm9
>+
>+ movu [r0 + 2 * 32], xm13
>+ vextracti128 xm9, m13, 1
>+ movu [r0 + 18 * 32], xm9
>+
>+ movu [r0 + 3 * 32], xm7
>+ vextracti128 xm9, m7, 1
>+ movu [r0 + 19 * 32], xm9
>+

buf_trans is aligned to 32 bytes, so we can combine the vextracti128 and movu into a single store.
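
For example (a sketch only, not tested), a pair like

    vextracti128    xm9,              m7,     1
    movu            [r0 + 19 * 32],   xm9

can become one vextracti128 that writes the upper half of m7 straight to memory:

    vextracti128    [r0 + 19 * 32],   m7,     1

The same applies to the rest of the vextracti128/movu pairs below, and it also frees xm9 as a scratch register in this part of the loop.
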
>+ movu [r0 + 4 * 32], xm6
>+ vextracti128 xm9, m6, 1
>+ movu [r0 + 20 * 32], xm9
>+
>+ movu [r0 + 5 * 32], xm1
>+ vextracti128 xm9, m1, 1
>+ movu [r0 + 21 * 32], xm9
>+
>+ movu [r0 + 6 * 32], xm10
>+ vextracti128 xm9, m10, 1
>+ movu [r0 + 22 * 32], xm9
>+
>+ movu [r0 + 7 * 32], xm8
>+ vextracti128 xm9, m8, 1
>+ movu [r0 + 23 * 32], xm9
>+
>+ movu [r0 + 8 * 32], xm4
>+ vextracti128 xm9, m4, 1
>+ movu [r0 + 24 * 32], xm9
>+
>+ movu [r0 + 9 * 32], xm3
>+ vextracti128 xm9, m3, 1
>+ movu [r0 + 25 * 32], xm9
>+
>+ movu [r0 + 10 * 32], xm12
>+ vextracti128 xm9, m12, 1
>+ movu [r0 + 26 * 32], xm9
>+
>+ movu [r0 + 11 * 32], xm5
>+ vextracti128 xm9, m5, 1
>+ movu [r0 + 27 * 32], xm9
>+
>+ movu [r0 + 12 * 32], xm14
>+ vextracti128 xm9, m14, 1
>+ movu [r0 + 28 * 32], xm9
>+
>+ movu [r0 + 13 * 32], xm2
>+ vextracti128 xm9, m2, 1
>+ movu [r0 + 29 * 32], xm9
>+
>+ movu [r0 + 14 * 32], xm11
>+ vextracti128 xm9, m11, 1
>+ movu [r0 + 30 * 32], xm9
>+
>+ movu [r0 + 15 * 32], xm0
>+ vextracti128 xm9, m0, 1
>+ movu [r0 + 31 * 32], xm9
>+
>+ add r0, 16
>+ lea r1, [r1 + 4 * r2]
>+ dec r4d
>+ jnz .loop
>+ RET
> %endif
>- RET