[x265] [PATCH] asm: residual buffer is aligned to its size, so we can use aligned load instructions
Min Chen
chenm003 at 163.com
Wed Nov 13 11:47:20 CET 2013
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1384339631 -28800
# Node ID d4ab573033d4f4e3985f12a31a76114ff965d933
# Parent 8962d6fc534ae0597f7f4547f3be3a732bda439d
asm: residual buffer is aligned to its size, so we can use aligned load instructions
diff -r 8962d6fc534a -r d4ab573033d4 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Wed Nov 13 18:19:43 2013 +0800
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Wed Nov 13 18:47:11 2013 +0800
@@ -501,6 +501,8 @@
primitives.blockfill_s[size](resiTmp, stride, 0);
}
+ assert(((uint32_t)residual & (width - 1)) == 0);
+ assert(width <= 32);
//===== reconstruction =====
primitives.calcrecon[size](pred, residual, recon, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
diff -r 8962d6fc534a -r d4ab573033d4 source/common/x86/pixel-util.asm
--- a/source/common/x86/pixel-util.asm Wed Nov 13 18:19:43 2013 +0800
+++ b/source/common/x86/pixel-util.asm Wed Nov 13 18:47:11 2013 +0800
@@ -239,10 +239,10 @@
cglobal calcRecons16
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,5
+ PROLOGUE 6,9,3
%else
DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,5
+ PROLOGUE 6,7,3
%define t6 r6m
%define t6d r6d
%define t7 r7m
@@ -265,10 +265,8 @@
movu m2, [t0]
pmovzxbw m1, m2
punpckhbw m2, m0
- movu m3, [t1]
- movu m4, [t1 + 16]
- paddw m1, m3
- paddw m2, m4
+ paddw m1, [t1]
+ paddw m2, [t1 + 16]
packuswb m1, m2
; store recon[] and recipred[]
@@ -296,10 +294,10 @@
cglobal calcRecons32
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,7
+ PROLOGUE 6,9,5
%else
DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,7
+ PROLOGUE 6,7,5
%define t6 r6m
%define t6d r6d
%define t7 r7m
@@ -326,16 +324,12 @@
pmovzxbw m3, m4
punpckhbw m4, m0
- movu m5, [t1 + 0 * 16]
- movu m6, [t1 + 1 * 16]
- paddw m1, m5
- paddw m2, m6
+ paddw m1, [t1 + 0 * 16]
+ paddw m2, [t1 + 1 * 16]
packuswb m1, m2
- movu m5, [t1 + 2 * 16]
- movu m6, [t1 + 3 * 16]
- paddw m3, m5
- paddw m4, m6
+ paddw m3, [t1 + 2 * 16]
+ paddw m4, [t1 + 3 * 16]
packuswb m3, m4
; store recon[] and recipred[]
@@ -369,10 +363,10 @@
cglobal calcRecons64
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,7
+ PROLOGUE 6,9,5
%else
DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,7
+ PROLOGUE 6,7,5
%define t6 r6m
%define t6d r6d
%define t7 r7m
@@ -400,16 +394,12 @@
pmovzxbw m3, m4
punpckhbw m4, m0
- movu m5, [t1 + 0 * 16]
- movu m6, [t1 + 1 * 16]
- paddw m1, m5
- paddw m2, m6
+ paddw m1, [t1 + 0 * 16]
+ paddw m2, [t1 + 1 * 16]
packuswb m1, m2
- movu m5, [t1 + 2 * 16]
- movu m6, [t1 + 3 * 16]
- paddw m3, m5
- paddw m4, m6
+ paddw m3, [t1 + 2 * 16]
+ paddw m4, [t1 + 3 * 16]
packuswb m3, m4
; store recon[] and recipred[]
@@ -436,16 +426,12 @@
pmovzxbw m3, m4
punpckhbw m4, m0
- movu m5, [t1 + 4 * 16]
- movu m6, [t1 + 5 * 16]
- paddw m1, m5
- paddw m2, m6
+ paddw m1, [t1 + 4 * 16]
+ paddw m2, [t1 + 5 * 16]
packuswb m1, m2
- movu m5, [t1 + 6 * 16]
- movu m6, [t1 + 7 * 16]
- paddw m3, m5
- paddw m4, m6
+ paddw m3, [t1 + 6 * 16]
+ paddw m4, [t1 + 7 * 16]
packuswb m3, m4
; store recon[] and recipred[]
More information about the x265-devel
mailing list