[x265] [PATCH] asm: residual buffer is aligned to its size, so we can use aligned load instructions
Min Chen
chenm003 at 163.com
Thu Nov 14 09:48:39 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384418720 -28800
# Node ID 493981f517c44293fd1134707a910b53cc688015
# Parent 8e22129119d6d8049996ed5f487625e4801b0a50
asm: residual buffer is aligned to its size, so we can use aligned load instructions
diff -r 8e22129119d6 -r 493981f517c4 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 16:45:03 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Thu Nov 14 16:45:20 2013 +0800
@@ -501,6 +501,8 @@
primitives.blockfill_s[size](resiTmp, stride, 0);
}
+ assert(((uint32_t)residual & (width - 1)) == 0);
+ assert(width <= 32);
//===== reconstruction =====
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
diff -r 8e22129119d6 -r 493981f517c4 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Thu Nov 14 16:45:03 2013 +0800
+++ b/source/common/x86/pixel-util.asm Thu Nov 14 16:45:20 2013 +0800
@@ -239,10 +239,10 @@
cglobal calcRecons16
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,5
+ PROLOGUE 6,9,3
%else
DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,5
+ PROLOGUE 6,7,3
%define t6 r6m
%define t6d r6d
%define t7 r7m
@@ -265,10 +265,8 @@
movu m2, [t0]
pmovzxbw m1, m2
punpckhbw m2, m0
- movu m3, [t1]
- movu m4, [t1 + 16]
- paddw m1, m3
- paddw m2, m4
+ paddw m1, [t1]
+ paddw m2, [t1 + 16]
packuswb m1, m2
; store recon[] and recipred[]
@@ -296,10 +294,10 @@
cglobal calcRecons32
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,7
+ PROLOGUE 6,9,5
%else
DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,7
+ PROLOGUE 6,7,5
%define t6 r6m
%define t6d r6d
%define t7 r7m
@@ -326,16 +324,12 @@
pmovzxbw m3, m4
punpckhbw m4, m0
- movu m5, [t1 + 0 * 16]
- movu m6, [t1 + 1 * 16]
- paddw m1, m5
- paddw m2, m6
+ paddw m1, [t1 + 0 * 16]
+ paddw m2, [t1 + 1 * 16]
packuswb m1, m2
- movu m5, [t1 + 2 * 16]
- movu m6, [t1 + 3 * 16]
- paddw m3, m5
- paddw m4, m6
+ paddw m3, [t1 + 2 * 16]
+ paddw m4, [t1 + 3 * 16]
packuswb m3, m4
; store recon[] and recipred[]
@@ -369,10 +363,10 @@
cglobal calcRecons64
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,7
+ PROLOGUE 6,9,5
%else
DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,7
+ PROLOGUE 6,7,5
%define t6 r6m
%define t6d r6d
%define t7 r7m
@@ -400,16 +394,12 @@
pmovzxbw m3, m4
punpckhbw m4, m0
- movu m5, [t1 + 0 * 16]
- movu m6, [t1 + 1 * 16]
- paddw m1, m5
- paddw m2, m6
+ paddw m1, [t1 + 0 * 16]
+ paddw m2, [t1 + 1 * 16]
packuswb m1, m2
- movu m5, [t1 + 2 * 16]
- movu m6, [t1 + 3 * 16]
- paddw m3, m5
- paddw m4, m6
+ paddw m3, [t1 + 2 * 16]
+ paddw m4, [t1 + 3 * 16]
packuswb m3, m4
; store recon[] and recipred[]
@@ -436,16 +426,12 @@
pmovzxbw m3, m4
punpckhbw m4, m0
- movu m5, [t1 + 4 * 16]
- movu m6, [t1 + 5 * 16]
- paddw m1, m5
- paddw m2, m6
+ paddw m1, [t1 + 4 * 16]
+ paddw m2, [t1 + 5 * 16]
packuswb m1, m2
- movu m5, [t1 + 6 * 16]
- movu m6, [t1 + 7 * 16]
- paddw m3, m5
- paddw m4, m6
+ paddw m3, [t1 + 6 * 16]
+ paddw m4, [t1 + 7 * 16]
packuswb m3, m4
; store recon[] and recipred[]
More information about the x265-devel
mailing list