<div style="line-height:1.7;color:#000000;font-size:14px;font-family:arial"><div>Yes, that testbench buffer has an alignment problem; we may modify the asm code soon.<br></div>At 2014-09-24 01:34:34,"Deepthi Nandakumar" &lt;deepthi@multicorewareinc.com&gt; wrote:<br> <blockquote id="isReplyContent" style="margin: 0px 0px 0px 0.8ex; padding-left: 1ex; border-left-color: rgb(204, 204, 204); border-left-width: 1px; border-left-style: solid;"><div dir="ltr"><div>Thanks, Min. Pushed. However, I still get the testbench error message - quantcoeff/dequantcoeff buffer not aligned. Does the above change need to be reflected to quant/dequant also?<br><br></div>Thanks,<br>Deepthi<br></div><div class="gmail_extra"><br><div class="gmail_quote">On Wed, Sep 24, 2014 at 12:50 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin: 0px 0px 0px 0.8ex; padding-left: 1ex; border-left-color: rgb(204, 204, 204); border-left-width: 1px; border-left-style: solid;"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1411499911 25200<br>
# Node ID 439637e2e34800ba31dbfe28946946264af39380<br>
# Parent ee76b64fd051b529cc57c4fae7d8b7e0b6f8463e<br>
asm: replace mova by movu to avoid AVX2 testbench crash in dct16, dct32, denoise_dct, its same speed on Haswell<br>
<br>
diff -r ee76b64fd051 -r 439637e2e348 source/common/x86/dct8.asm<br>
--- a/source/common/x86/dct8.asm Mon Sep 22 21:28:59 2014 +0900<br>
+++ b/source/common/x86/dct8.asm Tue Sep 23 12:18:31 2014 -0700<br>
@@ -1108,17 +1108,17 @@<br>
pxor m5, m5<br>
shr r3d, 3<br>
.loop:<br>
- mova m0, [r0]<br>
+ movu m0, [r0]<br>
pabsd m1, m0<br>
- mova m2, [r1]<br>
+ movu m2, [r1]<br>
paddd m2, m1<br>
- mova [r1], m2<br>
+ movu [r1], m2<br>
pmovzxwd m3, [r2]<br>
psubd m1, m3<br>
pcmpgtd m4, m1, m5<br>
pand m1, m4<br>
psignd m1, m0<br>
- mova [r0], m1<br>
+ movu [r0], m1<br>
add r0, 32<br>
add r1, 32<br>
add r2, 16<br>
@@ -1197,10 +1197,10 @@<br>
cglobal dct16, 3, 9, 15, 0-16*mmsize<br>
%if BIT_DEPTH == 10<br>
%define DCT_SHIFT 5<br>
- vpbroadcastd m9, [pd_16]<br>
+ vbroadcasti128 m9, [pd_16]<br>
%elif BIT_DEPTH == 8<br>
%define DCT_SHIFT 3<br>
- vpbroadcastd m9, [pd_4]<br>
+ vbroadcasti128 m9, [pd_4]<br>
%else<br>
%error Unsupported BIT_DEPTH!<br>
%endif<br>
@@ -1219,23 +1219,23 @@<br>
.pass1:<br>
lea r6, [r0 + r2 * 4]<br>
<br>
- mova m2, [r0]<br>
- mova m1, [r6]<br>
+ movu m2, [r0]<br>
+ movu m1, [r6]<br>
vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]<br>
vperm2i128 m1, m2, m1, 0x31 ; [row0hi row4hi]<br>
<br>
- mova m4, [r0 + r2]<br>
- mova m3, [r6 + r2]<br>
+ movu m4, [r0 + r2]<br>
+ movu m3, [r6 + r2]<br>
vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]<br>
vperm2i128 m3, m4, m3, 0x31 ; [row1hi row5hi]<br>
<br>
- mova m6, [r0 + r2 * 2]<br>
- mova m5, [r6 + r2 * 2]<br>
+ movu m6, [r0 + r2 * 2]<br>
+ movu m5, [r6 + r2 * 2]<br>
vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]<br>
vperm2i128 m5, m6, m5, 0x31 ; [row2hi row6hi]<br>
<br>
- mova m8, [r0 + r3]<br>
- mova m7, [r6 + r3]<br>
+ movu m8, [r0 + r3]<br>
+ movu m7, [r6 + r3]<br>
vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]<br>
vperm2i128 m7, m8, m7, 0x31 ; [row3hi row7hi]<br>
<br>
@@ -1296,7 +1296,7 @@<br>
mov r4d, 2<br>
mov r2d, 64<br>
lea r3, [r2 * 3]<br>
- vpbroadcastd m9, [pd_512]<br>
+ vbroadcasti128 m9, [pd_512]<br>
<br>
.pass2:<br>
mova m0, [r5 + 0 * 32] ; [row0lo row4lo]<br>
@@ -1312,43 +1312,43 @@<br>
mova m7, [r5 + 11 * 32] ; [row3hi row7hi]<br>
<br>
DCT16_PASS_2 -8 * 16<br>
- mova [r1], m10<br>
+ movu [r1], m10<br>
DCT16_PASS_2 -7 * 16<br>
- mova [r1 + r2], m10<br>
+ movu [r1 + r2], m10<br>
DCT16_PASS_2 -6 * 16<br>
- mova [r1 + r2 * 2], m10<br>
+ movu [r1 + r2 * 2], m10<br>
DCT16_PASS_2 -5 * 16<br>
- mova [r1 + r3], m10<br>
+ movu [r1 + r3], m10<br>
<br>
lea r6, [r1 + r2 * 4]<br>
DCT16_PASS_2 -4 * 16<br>
- mova [r6], m10<br>
+ movu [r6], m10<br>
DCT16_PASS_2 -3 * 16<br>
- mova [r6 + r2], m10<br>
+ movu [r6 + r2], m10<br>
DCT16_PASS_2 -2 * 16<br>
- mova [r6 + r2 * 2], m10<br>
+ movu [r6 + r2 * 2], m10<br>
DCT16_PASS_2 -1 * 16<br>
- mova [r6 + r3], m10<br>
+ movu [r6 + r3], m10<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT16_PASS_2 0 * 16<br>
- mova [r6], m10<br>
+ movu [r6], m10<br>
DCT16_PASS_2 1 * 16<br>
- mova [r6 + r2], m10<br>
+ movu [r6 + r2], m10<br>
DCT16_PASS_2 2 * 16<br>
- mova [r6 + r2 * 2], m10<br>
+ movu [r6 + r2 * 2], m10<br>
DCT16_PASS_2 3 * 16<br>
- mova [r6 + r3], m10<br>
+ movu [r6 + r3], m10<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT16_PASS_2 4 * 16<br>
- mova [r6], m10<br>
+ movu [r6], m10<br>
DCT16_PASS_2 5 * 16<br>
- mova [r6 + r2], m10<br>
+ movu [r6 + r2], m10<br>
DCT16_PASS_2 6 * 16<br>
- mova [r6 + r2 * 2], m10<br>
+ movu [r6 + r2 * 2], m10<br>
DCT16_PASS_2 7 * 16<br>
- mova [r6 + r3], m10<br>
+ movu [r6 + r3], m10<br>
<br>
add r1, 32<br>
add r5, 128<br>
@@ -1442,15 +1442,15 @@<br>
mova m15, [dct16_shuf1]<br>
<br>
.pass1:<br>
- mova m2, [r0]<br>
- mova m1, [r0 + 32]<br>
+ movu m2, [r0]<br>
+ movu m1, [r0 + 32]<br>
pshufb m1, m15<br>
vpermq m1, m1, 0x4E<br>
psubw m7, m2, m1<br>
paddw m2, m1<br>
<br>
- mova m1, [r0 + r2 * 2]<br>
- mova m0, [r0 + r2 * 2 + 32]<br>
+ movu m1, [r0 + r2 * 2]<br>
+ movu m0, [r0 + r2 * 2 + 32]<br>
pshufb m0, m15<br>
vpermq m0, m0, 0x4E<br>
psubw m8, m1, m0<br>
@@ -1465,15 +1465,15 @@<br>
vperm2i128 m6, m7, m8, 0x31 ; [row0hi row2hi] for O<br>
<br>
<br>
- mova m4, [r0 + r2]<br>
- mova m2, [r0 + r2 + 32]<br>
+ movu m4, [r0 + r2]<br>
+ movu m2, [r0 + r2 + 32]<br>
pshufb m2, m15<br>
vpermq m2, m2, 0x4E<br>
psubw m10, m4, m2<br>
paddw m4, m2<br>
<br>
- mova m3, [r0 + r3]<br>
- mova m2, [r0 + r3 + 32]<br>
+ movu m3, [r0 + r3]<br>
+ movu m2, [r0 + r3 + 32]<br>
pshufb m2, m15<br>
vpermq m2, m2, 0x4E<br>
psubw m11, m3, m2<br>
@@ -1531,83 +1531,83 @@<br>
mova m7, [r5 + 3 * 64 + 32]<br>
<br>
DCT32_PASS_2 0 * 32<br>
- mova [r1], xm11<br>
+ movu [r1], xm11<br>
DCT32_PASS_2 1 * 32<br>
- mova [r1 + r2], xm11<br>
+ movu [r1 + r2], xm11<br>
DCT32_PASS_2 2 * 32<br>
- mova [r1 + r2 * 2], xm11<br>
+ movu [r1 + r2 * 2], xm11<br>
DCT32_PASS_2 3 * 32<br>
- mova [r1 + r3], xm11<br>
+ movu [r1 + r3], xm11<br>
<br>
lea r6, [r1 + r2 * 4]<br>
DCT32_PASS_2 4 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 5 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 6 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 7 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT32_PASS_2 8 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 9 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 10 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 11 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT32_PASS_2 12 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 13 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 14 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 15 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT32_PASS_2 16 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 17 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 18 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 19 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT32_PASS_2 20 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 21 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 22 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 23 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT32_PASS_2 24 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 25 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 26 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 27 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
lea r6, [r6 + r2 * 4]<br>
DCT32_PASS_2 28 * 32<br>
- mova [r6], xm11<br>
+ movu [r6], xm11<br>
DCT32_PASS_2 29 * 32<br>
- mova [r6 + r2], xm11<br>
+ movu [r6 + r2], xm11<br>
DCT32_PASS_2 30 * 32<br>
- mova [r6 + r2 * 2], xm11<br>
+ movu [r6 + r2 * 2], xm11<br>
DCT32_PASS_2 31 * 32<br>
- mova [r6 + r3], xm11<br>
+ movu [r6 + r3], xm11<br>
<br>
add r5, 256<br>
add r1, 16<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br></div>
</blockquote></div>