<div dir="ltr"><br><div class="gmail_extra"><br><br><div class="gmail_quote">On Thu, Nov 14, 2013 at 2:48 AM, Min Chen <span dir="ltr">&lt;<a href="mailto:chenm003@163.com" target="_blank">chenm003@163.com</a>&gt;</span> wrote:<br>
<blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"># HG changeset patch<br>
# User Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>
# Date 1384418720 -28800<br>
# Node ID 493981f517c44293fd1134707a910b53cc688015<br>
# Parent 8e22129119d6d8049996ed5f487625e4801b0a50<br>
asm: residual buffer is alignment to size, so we can use alignment load instruction<br></blockquote><div><br></div><div>an older version of this was pushed a couple of days ago</div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
<br>
diff -r 8e22129119d6 -r 493981f517c4 source/Lib/TLibEncoder/TEncSearch.cpp<br>
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 16:45:03 2013 +0800<br>
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 16:45:20 2013 +0800<br>
@@ -501,6 +501,8 @@<br>
primitives.blockfill_s[size](resiTmp, stride, 0);<br>
}<br>
<br>
+ assert(((uint32_t)residual & (width - 1)) == 0);<br>
+ assert(width &lt;= 32);<br>
//===== reconstruction =====<br>
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);<br>
<br>
diff -r 8e22129119d6 -r 493981f517c4 source/common/x86/pixel-util.asm<br>
--- a/source/common/x86/pixel-util.asm Thu Nov 14 16:45:03 2013 +0800<br>
+++ b/source/common/x86/pixel-util.asm Thu Nov 14 16:45:20 2013 +0800<br>
@@ -239,10 +239,10 @@<br>
cglobal calcRecons16<br>
%if ARCH_X86_64 == 1<br>
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<br>
- PROLOGUE 6,9,5<br>
+ PROLOGUE 6,9,3<br>
%else<br>
DECLARE_REG_TMP 0,1,2,3,4,5<br>
- PROLOGUE 6,7,5<br>
+ PROLOGUE 6,7,3<br>
%define t6 r6m<br>
%define t6d r6d<br>
%define t7 r7m<br>
@@ -265,10 +265,8 @@<br>
movu m2, [t0]<br>
pmovzxbw m1, m2<br>
punpckhbw m2, m0<br>
- movu m3, [t1]<br>
- movu m4, [t1 + 16]<br>
- paddw m1, m3<br>
- paddw m2, m4<br>
+ paddw m1, [t1]<br>
+ paddw m2, [t1 + 16]<br>
packuswb m1, m2<br>
<br>
; store recon[] and recipred[]<br>
@@ -296,10 +294,10 @@<br>
cglobal calcRecons32<br>
%if ARCH_X86_64 == 1<br>
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<br>
- PROLOGUE 6,9,7<br>
+ PROLOGUE 6,9,5<br>
%else<br>
DECLARE_REG_TMP 0,1,2,3,4,5<br>
- PROLOGUE 6,7,7<br>
+ PROLOGUE 6,7,5<br>
%define t6 r6m<br>
%define t6d r6d<br>
%define t7 r7m<br>
@@ -326,16 +324,12 @@<br>
pmovzxbw m3, m4<br>
punpckhbw m4, m0<br>
<br>
- movu m5, [t1 + 0 * 16]<br>
- movu m6, [t1 + 1 * 16]<br>
- paddw m1, m5<br>
- paddw m2, m6<br>
+ paddw m1, [t1 + 0 * 16]<br>
+ paddw m2, [t1 + 1 * 16]<br>
packuswb m1, m2<br>
<br>
- movu m5, [t1 + 2 * 16]<br>
- movu m6, [t1 + 3 * 16]<br>
- paddw m3, m5<br>
- paddw m4, m6<br>
+ paddw m3, [t1 + 2 * 16]<br>
+ paddw m4, [t1 + 3 * 16]<br>
packuswb m3, m4<br>
<br>
; store recon[] and recipred[]<br>
@@ -369,10 +363,10 @@<br>
cglobal calcRecons64<br>
%if ARCH_X86_64 == 1<br>
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8<br>
- PROLOGUE 6,9,7<br>
+ PROLOGUE 6,9,5<br>
%else<br>
DECLARE_REG_TMP 0,1,2,3,4,5<br>
- PROLOGUE 6,7,7<br>
+ PROLOGUE 6,7,5<br>
%define t6 r6m<br>
%define t6d r6d<br>
%define t7 r7m<br>
@@ -400,16 +394,12 @@<br>
pmovzxbw m3, m4<br>
punpckhbw m4, m0<br>
<br>
- movu m5, [t1 + 0 * 16]<br>
- movu m6, [t1 + 1 * 16]<br>
- paddw m1, m5<br>
- paddw m2, m6<br>
+ paddw m1, [t1 + 0 * 16]<br>
+ paddw m2, [t1 + 1 * 16]<br>
packuswb m1, m2<br>
<br>
- movu m5, [t1 + 2 * 16]<br>
- movu m6, [t1 + 3 * 16]<br>
- paddw m3, m5<br>
- paddw m4, m6<br>
+ paddw m3, [t1 + 2 * 16]<br>
+ paddw m4, [t1 + 3 * 16]<br>
packuswb m3, m4<br>
<br>
; store recon[] and recipred[]<br>
@@ -436,16 +426,12 @@<br>
pmovzxbw m3, m4<br>
punpckhbw m4, m0<br>
<br>
- movu m5, [t1 + 4 * 16]<br>
- movu m6, [t1 + 5 * 16]<br>
- paddw m1, m5<br>
- paddw m2, m6<br>
+ paddw m1, [t1 + 4 * 16]<br>
+ paddw m2, [t1 + 5 * 16]<br>
packuswb m1, m2<br>
<br>
- movu m5, [t1 + 6 * 16]<br>
- movu m6, [t1 + 7 * 16]<br>
- paddw m3, m5<br>
- paddw m4, m6<br>
+ paddw m3, [t1 + 6 * 16]<br>
+ paddw m4, [t1 + 7 * 16]<br>
packuswb m3, m4<br>
<br>
; store recon[] and recipred[]<br>
<br>
_______________________________________________<br>
x265-devel mailing list<br>
<a href="mailto:x265-devel@videolan.org">x265-devel@videolan.org</a><br>
<a href="https://mailman.videolan.org/listinfo/x265-devel" target="_blank">https://mailman.videolan.org/listinfo/x265-devel</a><br>
</blockquote></div><br><br clear="all"><div><br></div>-- <br>Steve Borho
</div></div>