<div dir="ltr">Please ignore this patch; I will send a new one with some improvements.<div><br></div></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Oct 9, 2014 at 5:20 PM, <span dir="ltr">&lt;<a href="mailto:murugan@multicorewareinc.com" target="_blank">murugan@multicorewareinc.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Murugan Vairavel &lt;<a href="mailto:murugan@multicorewareinc.com">murugan@multicorewareinc.com</a>&gt;<br>
# Date 1412855249 -19800<br>
# Thu Oct 09 17:17:29 2014 +0530<br>
# Node ID 0c36fca591aefdf1df620126e3e7d2ba327609cb<br>
# Parent 85203bb459124dc5eb4bc929450c655b196aeb0e<br>
asm: avx2 assembly code for 8bpp transpose16x16 module<br>
<br>
diff -r 85203bb45912 -r 0c36fca591ae source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Thu Oct 09 15:59:58 2014 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 09 17:17:29 2014 +0530<br>
@@ -1792,6 +1792,7 @@<br>
p.idct[IDCT_8x8] = x265_idct8_avx2;<br>
p.idct[IDCT_16x16] = x265_idct16_avx2;<br>
p.idct[IDCT_32x32] = x265_idct32_avx2;<br>
+ p.transpose[BLOCK_16x16] = x265_transpose16_avx2;<br>
p.transpose[BLOCK_32x32] = x265_transpose32_avx2;<br>
#endif<br>
}<br>
diff -r 85203bb45912 -r 0c36fca591ae source/common/x86/pixel-util.h<br>
--- a/source/common/x86/pixel-util.h Thu Oct 09 15:59:58 2014 +0530<br>
+++ b/source/common/x86/pixel-util.h Thu Oct 09 17:17:29 2014 +0530<br>
@@ -44,6 +44,7 @@<br>
void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);<br>
void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);<br>
<br>
+void x265_transpose16_avx2(pixel *dest, pixel *src, intptr_t stride);<br>
void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);<br>
<br>
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);<br>
diff -r 85203bb45912 -r 0c36fca591ae source/common/x86/pixel-util8.asm<br>
--- a/source/common/x86/pixel-util8.asm Thu Oct 09 15:59:58 2014 +0530<br>
+++ b/source/common/x86/pixel-util8.asm Thu Oct 09 17:17:29 2014 +0530<br>
@@ -1632,8 +1632,105 @@<br>
lea r0, [r6 + 8 * r5 + 16]<br>
mov r3, r0<br>
call transpose8_internal<br>
-<br>
+ RET<br>
%else<br>
+%if ARCH_X86_64 == 1<br>
+INIT_YMM avx2<br>
+cglobal transpose16, 3, 5, 16<br>
+ lea r3, [r2 * 3]<br>
+ lea r4, [r1 + 8 * r2]<br>
+<br>
+ movu xm0, [r1]<br>
+ movu xm1, [r1 + r2]<br>
+ movu xm2, [r1 + 2 * r2]<br>
+ movu xm3, [r1 + r3]<br>
+ vinserti128 m0, m0, [r4], 1<br>
+ vinserti128 m1, m1, [r4 + r2], 1<br>
+ vinserti128 m2, m2, [r4 + 2 * r2], 1<br>
+ vinserti128 m3, m3, [r4 + r3], 1<br>
+ lea r1, [r1 + 4 * r2]<br>
+ lea r4, [r4 + 4 * r2]<br>
+<br>
+ movu xm4, [r1]<br>
+ movu xm5, [r1 + r2]<br>
+ movu xm6, [r1 + 2 * r2]<br>
+ movu xm7, [r1 + r3]<br>
+ vinserti128 m4, m4, [r4], 1<br>
+ vinserti128 m5, m5, [r4 + r2], 1<br>
+ vinserti128 m6, m6, [r4 + 2 * r2], 1<br>
+ vinserti128 m7, m7, [r4 + r3], 1<br>
+<br>
+ punpcklbw m8, m0, m1 ;[1 - 8 ; 1 - 8 ][1 2 9 10]<br>
+ punpckhbw m0, m1 ;[9 - 16; 9 - 16][1 2 9 10]<br>
+<br>
+ punpcklbw m1, m2, m3 ;[1 - 8 ; 1 - 8 ][3 4 11 12]<br>
+ punpckhbw m2, m3 ;[9 - 16; 9 - 16][3 4 11 12]<br>
+<br>
+ punpcklbw m3, m4, m5 ;[1 - 8 ; 1 - 8 ][5 6 13 14]<br>
+ punpckhbw m4, m5 ;[9 - 16; 9 - 16][5 6 13 14]<br>
+<br>
+ punpcklbw m5, m6, m7 ;[1 - 8 ; 1 - 8 ][7 8 15 16]<br>
+ punpckhbw m6, m7 ;[9 - 16; 9 - 16][7 8 15 16]<br>
+<br>
+ punpcklwd m7, m8, m1 ;[1 - 4 ; 1 - 4][1 2 3 4 9 10 11 12]<br>
+ punpckhwd m8, m1 ;[5 - 8 ; 5 - 8][1 2 3 4 9 10 11 12]<br>
+<br>
+ punpcklwd m1, m3, m5 ;[1 - 4 ; 1 - 4][5 6 7 8 13 14 15 16]<br>
+ punpckhwd m3, m5 ;[5 - 8 ; 5 - 8][5 6 7 8 13 14 15 16]<br>
+<br>
+ punpcklwd m5, m0, m2 ;[9 - 12; 9 - 12][1 2 3 4 9 10 11 12]<br>
+ punpckhwd m0, m2 ;[13- 16; 13 - 16][1 2 3 4 9 10 11 12]<br>
+<br>
+ punpcklwd m2, m4, m6 ;[9 - 12; 9 - 12][5 6 7 8 13 14 15 16]<br>
+ punpckhwd m4, m6 ;[13- 16; 13 - 16][5 6 7 8 13 14 15 16]<br>
+<br>
+ punpckldq m6, m7, m1 ;[1 - 2 ; 1 - 2][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhdq m7, m1 ;[3 - 4 ; 3 - 4][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpckldq m1, m8, m3 ;[5 - 6 ; 5 - 6][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhdq m8, m3 ;[7 - 8 ; 7 - 8][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpckldq m3, m5, m2 ;[9 - 10; 9 - 10][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhdq m5, m2 ;[11- 12; 11 - 12][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpckldq m2, m0, m4 ;[13- 14; 13 - 14][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhdq m0, m4 ;[15- 16; 15 - 16][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ vpermq m6, m6, 0xD8<br>
+ vpermq m7, m7, 0xD8<br>
+ vpermq m1, m1, 0xD8<br>
+ vpermq m8, m8, 0xD8<br>
+ vpermq m3, m3, 0xD8<br>
+ vpermq m5, m5, 0xD8<br>
+ vpermq m2, m2, 0xD8<br>
+ vpermq m0, m0, 0xD8<br>
+<br>
+ movu [r0 + 0 * 16], xm6<br>
+ vextracti128 [r0 + 1 * 16], m6, 1<br>
+<br>
+ movu [r0 + 2 * 16], xm7<br>
+ vextracti128 [r0 + 3 * 16], m7, 1<br>
+<br>
+ movu [r0 + 4 * 16], xm1<br>
+ vextracti128 [r0 + 5 * 16], m1, 1<br>
+<br>
+ movu [r0 + 6 * 16], xm8<br>
+ vextracti128 [r0 + 7 * 16], m8, 1<br>
+<br>
+ movu [r0 + 8 * 16], xm3<br>
+ vextracti128 [r0 + 9 * 16], m3, 1<br>
+<br>
+ movu [r0 + 10 * 16], xm5<br>
+ vextracti128 [r0 + 11 * 16], m5, 1<br>
+<br>
+ movu [r0 + 12 * 16], xm2<br>
+ vextracti128 [r0 + 13 * 16], m2, 1<br>
+<br>
+ movu [r0 + 14 * 16], xm0<br>
+ vextracti128 [r0 + 15 * 16], m0, 1<br>
+ RET<br>
+%endif<br>
+INIT_XMM sse2<br>
cglobal transpose16, 3, 5, 8, dest, src, stride<br>
mov r3, r0<br>
mov r4, r1<br>
@@ -1647,8 +1744,8 @@<br>
lea r1, [r1 + 2 * r2]<br>
lea r0, [r3 + 8 * 16 + 8]<br>
TRANSPOSE_8x8 16<br>
+ RET<br>
%endif<br>
- RET<br>
<br>
cglobal transpose16_internal<br>
TRANSPOSE_8x8 r6<br>
@@ -1761,6 +1858,7 @@<br>
mov r5, r0<br>
call transpose16_internal<br>
RET<br>
+<br>
%if ARCH_X86_64 == 1<br>
INIT_YMM avx2<br>
cglobal transpose32, 3, 5, 16<br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br><div dir="ltr">With Regards,<div><br></div><div>Murugan. V</div><div>+919659287478</div></div>
</div>