<div dir="ltr">Ignore this patch. Missed one (movu + vextracti128), will send a new one.<div><br></div></div><div class="gmail_extra"><br><div class="gmail_quote">On Thu, Oct 9, 2014 at 2:43 PM, <span dir="ltr"><<a href="mailto:murugan@multicorewareinc.com" target="_blank">murugan@multicorewareinc.com</a>></span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><span class=""># HG changeset patch<br>
# User Murugan Vairavel <<a href="mailto:murugan@multicorewareinc.com">murugan@multicorewareinc.com</a>><br>
</span># Date 1412838641 -19800<br>
# Thu Oct 09 12:40:41 2014 +0530<br>
# Node ID 1588689ff25cf63159a8a5ff032b350f80156b15<br>
# Parent 96609efaa87744168c613122d716c21d07b49af5<br>
<span class="">asm: avx2 assembly code for 8bpp transpose32x32 module<br>
<br>
</span>diff -r 96609efaa877 -r 1588689ff25c source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Wed Oct 08 18:52:12 2014 +0530<br>
+++ b/source/common/x86/asm-primitives.cpp Thu Oct 09 12:40:41 2014 +0530<br>
<span class="">@@ -1792,6 +1792,7 @@<br>
p.idct[IDCT_8x8] = x265_idct8_avx2;<br>
p.idct[IDCT_16x16] = x265_idct16_avx2;<br>
p.idct[IDCT_32x32] = x265_idct32_avx2;<br>
+ p.transpose[BLOCK_32x32] = x265_transpose32_avx2;<br>
#endif<br>
}<br>
#endif // if HIGH_BIT_DEPTH<br>
</span>diff -r 96609efaa877 -r 1588689ff25c source/common/x86/pixel-util.h<br>
--- a/source/common/x86/pixel-util.h Wed Oct 08 18:52:12 2014 +0530<br>
+++ b/source/common/x86/pixel-util.h Thu Oct 09 12:40:41 2014 +0530<br>
<span class="">@@ -44,6 +44,8 @@<br>
void x265_transpose32_sse2(pixel *dest, pixel *src, intptr_t stride);<br>
void x265_transpose64_sse2(pixel *dest, pixel *src, intptr_t stride);<br>
<br>
+void x265_transpose32_avx2(pixel *dest, pixel *src, intptr_t stride);<br>
+<br>
uint32_t x265_quant_sse4(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);<br>
uint32_t x265_quant_avx2(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int16_t *qCoef, int qBits, int add, int numCoeff);<br>
uint32_t x265_nquant_sse4(int32_t *coef, int32_t *quantCoeff, int16_t *qCoef, int qBits, int add, int numCoeff);<br>
</span>diff -r 96609efaa877 -r 1588689ff25c source/common/x86/pixel-util8.asm<br>
--- a/source/common/x86/pixel-util8.asm Wed Oct 08 18:52:12 2014 +0530<br>
+++ b/source/common/x86/pixel-util8.asm Thu Oct 09 12:40:41 2014 +0530<br>
<span class="">@@ -1670,8 +1670,8 @@<br>
;-----------------------------------------------------------------<br>
; void transpose_32x32(pixel *dst, pixel *src, intptr_t stride)<br>
;-----------------------------------------------------------------<br>
+%if HIGH_BIT_DEPTH<br>
INIT_XMM sse2<br>
-%if HIGH_BIT_DEPTH<br>
cglobal transpose32, 3, 7, 4, dest, src, stride<br>
add r2, r2<br>
mov r3, r0<br>
@@ -1739,7 +1739,9 @@<br>
lea r0, [r6 + 24 * 64 + 48]<br>
mov r3, r0<br>
call transpose8_internal<br>
+ RET<br>
%else<br>
+INIT_XMM sse2<br>
cglobal transpose32, 3, 7, 8, dest, src, stride<br>
mov r3, r0<br>
mov r4, r1<br>
</span>@@ -1758,8 +1760,193 @@<br>
<span class=""> lea r0, [r3 + 16 * 32 + 16]<br>
mov r5, r0<br>
call transpose16_internal<br>
+ RET<br>
</span>+%if ARCH_X86_64 == 1<br>
<span class="">+INIT_YMM avx2<br>
+cglobal transpose32, 3, 5, 16<br>
</span><div><div class="h5">+ lea r3, [r2 * 3]<br>
+ mov r4d, 2<br>
+<br>
+.loop:<br>
+ movu m0, [r1]<br>
+ movu m1, [r1 + r2]<br>
+ movu m2, [r1 + 2 * r2]<br>
+ movu m3, [r1 + r3]<br>
+ lea r1, [r1 + 4 * r2]<br>
+<br>
+ movu m4, [r1]<br>
+ movu m5, [r1 + r2]<br>
+ movu m6, [r1 + 2 * r2]<br>
+ movu m7, [r1 + r3]<br>
+<br>
+ punpcklbw m8, m0, m1 ;[1 - 8 ; 17 - 24][1 2]<br>
+ punpckhbw m0, m1 ;[9 - 16; 25 - 32][1 2]<br>
+<br>
+ punpcklbw m1, m2, m3 ;[1 - 8 ; 17 - 24][3 4]<br>
+ punpckhbw m2, m3 ;[9 - 16; 25 - 32][3 4]<br>
+<br>
+ punpcklbw m3, m4, m5 ;[1 - 8 ; 17 - 24][5 6]<br>
+ punpckhbw m4, m5 ;[9 - 16; 25 - 32][5 6]<br>
+<br>
+ punpcklbw m5, m6, m7 ;[1 - 8 ; 17 - 24][7 8]<br>
+ punpckhbw m6, m7 ;[9 - 16; 25 - 32][7 8]<br>
+<br>
+ punpcklwd m7, m8, m1 ;[1 - 4 ; 17 - 20][1 2 3 4]<br>
+ punpckhwd m8, m1 ;[5 - 8 ; 21 - 24][1 2 3 4]<br>
+<br>
+ punpcklwd m1, m3, m5 ;[1 - 4 ; 17 - 20][5 6 7 8]<br>
+ punpckhwd m3, m5 ;[5 - 8 ; 21 - 24][5 6 7 8]<br>
+<br>
+ punpcklwd m5, m0, m2 ;[9 - 12; 25 - 28][1 2 3 4]<br>
+ punpckhwd m0, m2 ;[13- 16; 29 - 32][1 2 3 4]<br>
+<br>
+ punpcklwd m2, m4, m6 ;[9 - 12; 25 - 28][5 6 7 8]<br>
+ punpckhwd m4, m6 ;[13- 16; 29 - 32][5 6 7 8]<br>
+<br>
+ punpckldq m6, m7, m1 ;[1 - 2 ; 17 - 18][1 2 3 4 5 6 7 8]<br>
+ punpckhdq m7, m1 ;[3 - 4 ; 19 - 20][1 2 3 4 5 6 7 8]<br>
+<br>
+ punpckldq m1, m8, m3 ;[5 - 6 ; 21 - 22][1 2 3 4 5 6 7 8]<br>
+ punpckhdq m8, m3 ;[7 - 8 ; 23 - 24][1 2 3 4 5 6 7 8]<br>
+<br>
+ punpckldq m3, m5, m2 ;[9 - 10; 25 - 26][1 2 3 4 5 6 7 8]<br>
+ punpckhdq m5, m2 ;[11- 12; 27 - 28][1 2 3 4 5 6 7 8]<br>
+<br>
+ punpckldq m2, m0, m4 ;[13- 14; 29 - 30][1 2 3 4 5 6 7 8]<br>
+ punpckhdq m0, m4 ;[15- 16; 31 - 32][1 2 3 4 5 6 7 8]<br>
+<br>
+ movq [r0 + 0 * 32], xm6<br>
+ movhps [r0 + 1 * 32], xm6<br>
+ vextracti128 xm4, m6, 1<br>
+ movq [r0 + 16 * 32], xm4<br>
+ movhps [r0 + 17 * 32], xm4<br>
+<br>
+ lea r1, [r1 + 4 * r2]<br>
+ movu m9, [r1]<br>
+ movu m10, [r1 + r2]<br>
+ movu m11, [r1 + 2 * r2]<br>
+ movu m12, [r1 + r3]<br>
+ lea r1, [r1 + 4 * r2]<br>
+<br>
+ movu m13, [r1]<br>
+ movu m14, [r1 + r2]<br>
+ movu m15, [r1 + 2 * r2]<br>
+ movu m6, [r1 + r3]<br>
+<br>
+ punpcklbw m4, m9, m10 ;[1 - 8 ; 17 - 24][9 10]<br>
+ punpckhbw m9, m10 ;[9 - 16; 25 - 32][9 10]<br>
+<br>
+ punpcklbw m10, m11, m12 ;[1 - 8 ; 17 - 24][11 12]<br>
+ punpckhbw m11, m12 ;[9 - 16; 25 - 32][11 12]<br>
+<br>
+ punpcklbw m12, m13, m14 ;[1 - 8 ; 17 - 24][13 14]<br>
+ punpckhbw m13, m14 ;[9 - 16; 25 - 32][13 14]<br>
+<br>
+ punpcklbw m14, m15, m6 ;[1 - 8 ; 17 - 24][15 16]<br>
+ punpckhbw m15, m6 ;[9 - 16; 25 - 32][15 16]<br>
+<br>
+ punpcklwd m6, m4, m10 ;[1 - 4 ; 17 - 20][9 10 11 12]<br>
+ punpckhwd m4, m10 ;[5 - 8 ; 21 - 24][9 10 11 12]<br>
+<br>
+ punpcklwd m10, m12, m14 ;[1 - 4 ; 17 - 20][13 14 15 16]<br>
+ punpckhwd m12, m14 ;[5 - 8 ; 21 - 24][13 14 15 16]<br>
+<br>
+ punpcklwd m14, m9, m11 ;[9 - 12; 25 - 28][9 10 11 12]<br>
+ punpckhwd m9, m11 ;[13- 16; 29 - 32][9 10 11 12]<br>
+<br>
+ punpcklwd m11, m13, m15 ;[9 - 12; 25 - 28][13 14 15 16]<br>
+ punpckhwd m13, m15 ;[13- 16; 29 - 32][13 14 15 16]<br>
+<br>
+ punpckldq m15, m6, m10 ;[1 - 2 ; 17 - 18][9 10 11 12 13 14 15 16]<br>
+ punpckhdq m6, m10 ;[3 - 4 ; 19 - 20][9 10 11 12 13 14 15 16]<br>
+<br>
+ punpckldq m10, m4, m12 ;[5 - 6 ; 21 - 22][9 10 11 12 13 14 15 16]<br>
+ punpckhdq m4, m12 ;[7 - 8 ; 23 - 24][9 10 11 12 13 14 15 16]<br>
+<br>
+ punpckldq m12, m14, m11 ;[9 - 10; 25 - 26][9 10 11 12 13 14 15 16]<br>
+ punpckhdq m14, m11 ;[11- 12; 27 - 28][9 10 11 12 13 14 15 16]<br>
+<br>
+ punpckldq m11, m9, m13 ;[13- 14; 29 - 30][9 10 11 12 13 14 15 16]<br>
+ punpckhdq m9, m13 ;[15- 16; 31 - 32][9 10 11 12 13 14 15 16]<br>
+<br>
+<br>
+ punpcklqdq m13, m7, m6 ;[3 ; 19][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m7, m6 ;[4 ; 20][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpcklqdq m6, m1, m10 ;[5 ; 21][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m1, m10 ;[6 ; 22][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpcklqdq m10, m8, m4 ;[7 ; 23][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m8, m4 ;[8 ; 24][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpcklqdq m4, m3, m12 ;[9 ; 25][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m3, m12 ;[10; 26][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpcklqdq m12, m5, m14 ;[11; 27][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m5, m14 ;[12; 28][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpcklqdq m14, m2, m11 ;[13; 29][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m2, m11 ;[14; 30][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ punpcklqdq m11, m0, m9 ;[15; 31][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+ punpckhqdq m0, m9 ;[16; 32][1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16]<br>
+<br>
+ movq [r0 + 0 * 32 + 8], xm15<br>
+ movhps [r0 + 1 * 32 + 8], xm15<br>
+ vextracti128 xm9, m15, 1<br>
+ movq [r0 + 16 * 32 + 8], xm9<br>
+ movhps [r0 + 17 * 32 + 8], xm9<br>
+<br>
+ movu [r0 + 2 * 32], xm13<br>
+ vextracti128 xm9, m13, 1<br>
+ movu [r0 + 18 * 32], xm9<br>
+<br>
+ movu [r0 + 3 * 32], xm7<br>
</div></div>+ vextracti128 [r0 + 19 * 32], m7, 1<br>
<span class="">+<br>
+ movu [r0 + 4 * 32], xm6<br>
</span>+ vextracti128 [r0 + 20 * 32], m6, 1<br>
<span class="">+<br>
+ movu [r0 + 5 * 32], xm1<br>
</span>+ vextracti128 [r0 + 21 * 32], m1, 1<br>
<span class="">+<br>
+ movu [r0 + 6 * 32], xm10<br>
</span>+ vextracti128 [r0 + 22 * 32], m10, 1<br>
<span class="">+<br>
+ movu [r0 + 7 * 32], xm8<br>
</span>+ vextracti128 [r0 + 23 * 32], m8, 1<br>
<span class="">+<br>
+ movu [r0 + 8 * 32], xm4<br>
</span>+ vextracti128 [r0 + 24 * 32], m4, 1<br>
<span class="">+<br>
+ movu [r0 + 9 * 32], xm3<br>
</span>+ vextracti128 [r0 + 25 * 32], m3, 1<br>
<span class="">+<br>
+ movu [r0 + 10 * 32], xm12<br>
</span>+ vextracti128 [r0 + 26 * 32], m12, 1<br>
<span class="">+<br>
+ movu [r0 + 11 * 32], xm5<br>
</span>+ vextracti128 [r0 + 27 * 32], m5, 1<br>
<span class="">+<br>
+ movu [r0 + 12 * 32], xm14<br>
</span>+ vextracti128 [r0 + 28 * 32], m14, 1<br>
<span class="">+<br>
+ movu [r0 + 13 * 32], xm2<br>
</span>+ vextracti128 [r0 + 29 * 32], m2, 1<br>
<span class="">+<br>
+ movu [r0 + 14 * 32], xm11<br>
</span>+ vextracti128 [r0 + 30 * 32], m11, 1<br>
<span class="">+<br>
+ movu [r0 + 15 * 32], xm0<br>
</span>+ vextracti128 [r0 + 31 * 32], m0, 1<br>
<span class="">+<br>
+ add r0, 16<br>
+ lea r1, [r1 + 4 * r2]<br>
+ dec r4d<br>
+ jnz .loop<br>
+ RET<br>
%endif<br>
- RET<br>
</span>+%endif<br>
<br>
;-----------------------------------------------------------------<br>
; void transpose_64x64(pixel *dst, pixel *src, intptr_t stride)<br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br><div dir="ltr">With Regards,<div><br></div><div>Murugan. V</div><div>+919659287478</div></div>
</div>