[x265] [PATCH] asm: residual buffer is aligned to size, so we can use aligned load instructions

Min Chen chenm003 at 163.com
Thu Nov 14 09:48:39 CET 2013


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384418720 -28800
# Node ID 493981f517c44293fd1134707a910b53cc688015
# Parent  8e22129119d6d8049996ed5f487625e4801b0a50
asm: residual buffer is aligned to size, so we can use aligned load instructions

diff -r 8e22129119d6 -r 493981f517c4 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Nov 14 16:45:03 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp	Thu Nov 14 16:45:20 2013 +0800
@@ -501,6 +501,8 @@
         primitives.blockfill_s[size](resiTmp, stride, 0);
     }
 
+    assert(((uint32_t)residual & (width - 1)) == 0);
+    assert(width <= 32);
     //===== reconstruction =====
     primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
 
diff -r 8e22129119d6 -r 493981f517c4 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm	Thu Nov 14 16:45:03 2013 +0800
+++ b/source/common/x86/pixel-util.asm	Thu Nov 14 16:45:20 2013 +0800
@@ -239,10 +239,10 @@
 cglobal calcRecons16
 %if ARCH_X86_64 == 1
     DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
-    PROLOGUE 6,9,5
+    PROLOGUE 6,9,3
 %else
     DECLARE_REG_TMP 0,1,2,3,4,5
-    PROLOGUE 6,7,5
+    PROLOGUE 6,7,3
     %define t6      r6m
     %define t6d     r6d
     %define t7      r7m
@@ -265,10 +265,8 @@
     movu        m2, [t0]
     pmovzxbw    m1, m2
     punpckhbw   m2, m0
-    movu        m3, [t1]
-    movu        m4, [t1 + 16]
-    paddw       m1, m3
-    paddw       m2, m4
+    paddw       m1, [t1]
+    paddw       m2, [t1 + 16]
     packuswb    m1, m2
 
     ; store recon[] and recipred[]
@@ -296,10 +294,10 @@
 cglobal calcRecons32
 %if ARCH_X86_64 == 1
     DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
-    PROLOGUE 6,9,7
+    PROLOGUE 6,9,5
 %else
     DECLARE_REG_TMP 0,1,2,3,4,5
-    PROLOGUE 6,7,7
+    PROLOGUE 6,7,5
     %define t6      r6m
     %define t6d     r6d
     %define t7      r7m
@@ -326,16 +324,12 @@
     pmovzxbw    m3, m4
     punpckhbw   m4, m0
 
-    movu        m5, [t1 + 0 * 16]
-    movu        m6, [t1 + 1 * 16]
-    paddw       m1, m5
-    paddw       m2, m6
+    paddw       m1, [t1 + 0 * 16]
+    paddw       m2, [t1 + 1 * 16]
     packuswb    m1, m2
 
-    movu        m5, [t1 + 2 * 16]
-    movu        m6, [t1 + 3 * 16]
-    paddw       m3, m5
-    paddw       m4, m6
+    paddw       m3, [t1 + 2 * 16]
+    paddw       m4, [t1 + 3 * 16]
     packuswb    m3, m4
 
     ; store recon[] and recipred[]
@@ -369,10 +363,10 @@
 cglobal calcRecons64
 %if ARCH_X86_64 == 1
     DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
-    PROLOGUE 6,9,7
+    PROLOGUE 6,9,5
 %else
     DECLARE_REG_TMP 0,1,2,3,4,5
-    PROLOGUE 6,7,7
+    PROLOGUE 6,7,5
     %define t6      r6m
     %define t6d     r6d
     %define t7      r7m
@@ -400,16 +394,12 @@
     pmovzxbw    m3, m4
     punpckhbw   m4, m0
 
-    movu        m5, [t1 + 0 * 16]
-    movu        m6, [t1 + 1 * 16]
-    paddw       m1, m5
-    paddw       m2, m6
+    paddw       m1, [t1 + 0 * 16]
+    paddw       m2, [t1 + 1 * 16]
     packuswb    m1, m2
 
-    movu        m5, [t1 + 2 * 16]
-    movu        m6, [t1 + 3 * 16]
-    paddw       m3, m5
-    paddw       m4, m6
+    paddw       m3, [t1 + 2 * 16]
+    paddw       m4, [t1 + 3 * 16]
     packuswb    m3, m4
 
     ; store recon[] and recipred[]
@@ -436,16 +426,12 @@
     pmovzxbw    m3, m4
     punpckhbw   m4, m0
 
-    movu        m5, [t1 + 4 * 16]
-    movu        m6, [t1 + 5 * 16]
-    paddw       m1, m5
-    paddw       m2, m6
+    paddw       m1, [t1 + 4 * 16]
+    paddw       m2, [t1 + 5 * 16]
     packuswb    m1, m2
 
-    movu        m5, [t1 + 6 * 16]
-    movu        m6, [t1 + 7 * 16]
-    paddw       m3, m5
-    paddw       m4, m6
+    paddw       m3, [t1 + 6 * 16]
+    paddw       m4, [t1 + 7 * 16]
     packuswb    m3, m4
 
     ; store recon[] and recipred[]



More information about the x265-devel mailing list