[x265] [PATCH 1 of 2] remove unused parameter *recon from assembly code
Min Chen
chenm003 at 163.com
Wed Apr 2 22:26:07 CEST 2014
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1396469570 25200
# Node ID 4348a3ed1b3201bc18d80ed51bfc0fccc24d3fcf
# Parent 0206822d9fea295c199a0ad192e8fc5e1f2b9124
remove unused parameter *recon from assembly code
diff -r 0206822d9fea -r 4348a3ed1b32 source/Lib/TLibEncoder/TEncSearch.cpp
--- a/source/Lib/TLibEncoder/TEncSearch.cpp Tue Apr 01 23:28:32 2014 +0530
+++ b/source/Lib/TLibEncoder/TEncSearch.cpp Wed Apr 02 13:12:50 2014 -0700
@@ -465,7 +465,7 @@
assert(width <= 32);
//===== reconstruction =====
- primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
+ primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
//===== update distortion =====
outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
}
@@ -587,7 +587,7 @@
assert(((intptr_t)residual & (width - 1)) == 0);
assert(width <= 32);
//===== reconstruction =====
- primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
+ primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
//===== update distortion =====
uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
if (ttype == TEXT_CHROMA_U)
diff -r 0206822d9fea -r 4348a3ed1b32 source/common/pixel.cpp
--- a/source/common/pixel.cpp Tue Apr 01 23:28:32 2014 +0530
+++ b/source/common/pixel.cpp Wed Apr 02 13:12:50 2014 -0700
@@ -460,9 +460,7 @@
}
template<int blockSize>
-void calcRecons(pixel* pred, int16_t* residual,
- pixel*,
- int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
+void calcRecons(pixel* pred, int16_t* residual, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
{
for (int y = 0; y < blockSize; y++)
{
diff -r 0206822d9fea -r 4348a3ed1b32 source/common/primitives.h
--- a/source/common/primitives.h Tue Apr 01 23:28:32 2014 +0530
+++ b/source/common/primitives.h Wed Apr 02 13:12:50 2014 -0700
@@ -125,7 +125,7 @@
typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
-typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
diff -r 0206822d9fea -r 4348a3ed1b32 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Apr 01 23:28:32 2014 +0530
+++ b/source/common/x86/pixel-util.h Wed Apr 02 13:12:50 2014 -0700
@@ -24,12 +24,12 @@
#ifndef X265_PIXEL_UTIL_H
#define X265_PIXEL_UTIL_H
-void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
-void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
diff -r 0206822d9fea -r 4348a3ed1b32 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Apr 01 23:28:32 2014 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Apr 02 13:12:50 2014 -0700
@@ -58,590 +58,452 @@
cextern pw_pixel_max
;-----------------------------------------------------------------------------
-; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
+; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal calcRecons4
%if HIGH_BIT_DEPTH
%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,6
+cglobal calcRecons4, 5,8,4
+ %define t7b r7b
%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,6
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
+cglobal calcRecons4, 5,7,4,0-1
+ %define t7b byte [rsp]
%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
- add t7, t7
-%endif
+ mov r6d, r6m
+ add r4d, r4d
+ add r5d, r5d
+ add r6d, r6d
pxor m4, m4
mova m5, [pw_pixel_max]
- add t5, t5
- mov t8d, 4/2
+ mov t7b, 4/2
.loop:
- movh m0, [t0]
- movh m1, [t0 + t5]
+ movh m0, [r0]
+ movh m1, [r0 + r4]
punpcklqdq m0, m1
- movh m2, [t1]
- movh m3, [t1 + t5]
+ movh m2, [r1]
+ movh m3, [r1 + r4]
punpcklqdq m2, m3
paddw m0, m2
CLIPW m0, m4, m5
- ; store recon[] and recipred[]
- movh [t4], m0
-%if ARCH_X86_64 == 0
- add t4, t7
- add t4, t7
- movhps [t4], m0
- add t4, t7
- add t4, t7
+ ; store recipred[]
+ movh [r3], m0
+ movhps [r3 + r6], m0
+
+ ; store recqt[]
+ movh [r2], m0
+ movhps [r2 + r5], m0
+
+ lea r0, [r0 + r4 * 2]
+ lea r1, [r1 + r4 * 2]
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r6 * 2]
+
+ dec t7b
+ jnz .loop
+ RET
+%else ;HIGH_BIT_DEPTH
+
+%if ARCH_X86_64 == 1
+cglobal calcRecons4, 5,8,4
+ %define t7b r7b
%else
- movhps [t4 + t7], m0
- lea t4, [t4 + t7 * 2]
+cglobal calcRecons4, 5,7,4,0-1
+ %define t7b byte [rsp]
%endif
-
- ; store recqt[]
- movh [t3], m0
- add t3, t6
- movhps [t3], m0
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 2]
-
- dec t8d
- jnz .loop
-
-%else ;HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,4
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,4
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
+ mov r6d, r6m
+ add r5d, r5d
pxor m0, m0
- mov t8d, 4/2
+ mov t7b, 4/2
.loop:
- movd m1, [t0]
- movd m2, [t0 + t5]
+ movd m1, [r0]
+ movd m2, [r0 + r4]
punpckldq m1, m2
punpcklbw m1, m0
- movh m2, [t1]
- movh m3, [t1 + t5 * 2]
+ movh m2, [r1]
+ movh m3, [r1 + r4 * 2]
punpcklqdq m2, m3
paddw m1, m2
packuswb m1, m1
; store recon[] and recipred[]
- movd [t4], m1
- add t4, t7
+ movd [r3], m1
pshufd m2, m1, 1
- movd [t4], m2
- add t4, t7
+ movd [r3 + r6], m2
; store recqt[]
punpcklbw m1, m0
- movlps [t3], m1
- add t3, t6
- movhps [t3], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 4]
-
- dec t8d
+ movlps [r2], m1
+ movhps [r2 + r5], m1
+
+ lea r0, [r0 + r4 * 2]
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r6 * 2]
+
+ dec t7b
jnz .loop
+ RET
%endif ;HIGH_BIT_DEPTH
- RET
INIT_XMM sse2
-cglobal calcRecons8
+%if ARCH_X86_64 == 1
+cglobal calcRecons8, 5,8,4
+ %define t7b r7b
+%else
+cglobal calcRecons8, 5,7,4,0-1
+ %define t7b byte [rsp]
+%endif
+
%if HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,6
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,6
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
- add t7, t7
-%endif
+ mov r6d, r6m
+ add r4d, r4d
+ add r5d, r5d
+ add r6d, r6d
pxor m4, m4
mova m5, [pw_pixel_max]
- add t5, t5
- mov t8d, 8/2
+ mov t7b, 8/2
.loop:
- movu m0, [t0]
- movu m1, [t0 + t5]
- movu m2, [t1]
- movu m3, [t1 + t5]
+ movu m0, [r0]
+ movu m1, [r0 + r4]
+ movu m2, [r1]
+ movu m3, [r1 + r4]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
- ; store recon[] and recipred[]
- movu [t4], m0
-%if ARCH_X86_64 == 0
- add t4, t7
- add t4, t7
- movu [t4], m1
- add t4, t7
- add t4, t7
-%else
- movu [t4 + t7], m1
- lea t4, [t4 + t7 * 2]
-%endif
+ ; store recipred[]
+ movu [r3], m0
+ movu [r3 + r6], m1
; store recqt[]
- movu [t3], m0
- add t3, t6
- movu [t3], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 2]
-
- dec t8d
+ movu [r2], m0
+ movu [r2 + r5], m1
+
+ lea r0, [r0 + r4 * 2]
+ lea r1, [r1 + r4 * 2]
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r6 * 2]
+
+ dec t7b
jnz .loop
+ RET
%else ;HIGH_BIT_DEPTH
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,5
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,5
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
+ mov r6d, r6m
+ add r5d, r5d
pxor m0, m0
- mov t8d, 8/2
+ mov t7b, 8/2
.loop:
- movh m1, [t0]
- movh m2, [t0 + t5]
+ movh m1, [r0]
+ movh m2, [r0 + r4]
punpcklbw m1, m0
punpcklbw m2, m0
- movu m3, [t1]
- movu m4, [t1 + t5 * 2]
+ movu m3, [r1]
+ movu m4, [r1 + r4 * 2]
paddw m1, m3
paddw m2, m4
packuswb m1, m2
; store recon[] and recipred[]
- movlps [t4], m1
-%if ARCH_X86_64 == 0
- add t4, t7
- movhps [t4], m1
- add t4, t7
-%else
- movhps [t4 + t7], m1
- lea t4, [t4 + t7 * 2]
-%endif
+ movlps [r3], m1
+ movhps [r3 + r6], m1
; store recqt[]
punpcklbw m2, m1, m0
punpckhbw m1, m0
- movu [t3], m2
- add t3, t6
- movu [t3], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 4]
-
- dec t8d
+ movu [r2], m2
+ movu [r2 + r5], m1
+
+ lea r0, [r0 + r4 * 2]
+ lea r1, [r1 + r4 * 4]
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r6 * 2]
+
+ dec t7b
jnz .loop
+ RET
%endif ;HIGH_BIT_DEPTH
- RET
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal calcRecons16
%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,6
+cglobal calcRecons16, 5,8,4
+ %define t7b r7b
%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,6
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
+cglobal calcRecons16, 5,7,4,0-1
+ %define t7b byte [rsp]
%endif
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
- add t7, t7
-%endif
+ mov r6d, r6m
+ add r4d, r4d
+ add r5d, r5d
+ add r6d, r6d
pxor m4, m4
mova m5, [pw_pixel_max]
- add t5, t5
- mov t8d, 16/2
+ mov t7b, 16/2
.loop:
- movu m0, [t0]
- movu m1, [t0 + 16]
- movu m2, [t1]
- movu m3, [t1 + 16]
+ movu m0, [r0]
+ movu m1, [r0 + 16]
+ movu m2, [r1]
+ movu m3, [r1 + 16]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
- ; store recon[] and recipred[]
- movu [t4], m0
- movu [t4 + 16], m1
-%if ARCH_X86_64 == 0
- add t4, t7
- add t4, t7
-%endif
+ ; store recipred[]
+ movu [r3], m0
+ movu [r3 + 16], m1
; store recqt[]
- movu [t3], m0
- movu [t3 + 16], m1
- add t3, t6
-
- movu m0, [t0 + t5]
- movu m1, [t0 + t5 + 16]
- movu m2, [t1 + t5]
- movu m3, [t1 + t5 + 16]
+ movu [r2], m0
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + r4]
+ movu m1, [r0 + r4 + 16]
+ movu m2, [r1 + r4]
+ movu m3, [r1 + r4 + 16]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
; store recon[] and recipred[]
-%if ARCH_X86_64 == 0
- movu [t4], m0
- movu [t4 + 16], m1
- add t4, t7
- add t4, t7
+ movu [r3 + r6], m0
+ movu [r3 + r6 + 16], m1
+
+ ; store recqt[]
+ movu [r2 + r5], m0
+ movu [r2 + r5 + 16], m1
+
+ lea r0, [r0 + r4 * 2]
+ lea r1, [r1 + r4 * 2]
+ lea r2, [r2 + r5 * 2]
+ lea r3, [r3 + r6 * 2]
+
+ dec t7b
+ jnz .loop
+ RET
+%else ;HIGH_BIT_DEPTH
+
+INIT_XMM sse4
+%if ARCH_X86_64 == 1
+cglobal calcRecons16, 5,8,4
+ %define t7b r7b
%else
- movu [t4 + t7], m0
- movu [t4 + t7 + 16], m1
- lea t4, [t4 + t7 * 2]
+cglobal calcRecons16, 5,7,4,0-1
+ %define t7b byte [rsp]
%endif
- ; store recqt[]
- movu [t3], m0
- movu [t3 + 16], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 2]
-
- dec t8d
- jnz .loop
-%else ;HIGH_BIT_DEPTH
-INIT_XMM sse4
-cglobal calcRecons16
-%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,3
-%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,3
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
-%endif
-
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
+ mov r6d, r6m
+ add r5d, r5d
pxor m0, m0
- mov t8d, 16
+ mov t7b, 16
.loop:
- movu m2, [t0]
+ movu m2, [r0]
pmovzxbw m1, m2
punpckhbw m2, m0
- paddw m1, [t1]
- paddw m2, [t1 + 16]
+ paddw m1, [r1]
+ paddw m2, [r1 + 16]
packuswb m1, m2
; store recon[] and recipred[]
- movu [t4], m1
+ movu [r3], m1
; store recqt[]
pmovzxbw m2, m1
punpckhbw m1, m0
- movu [t3], m2
- movu [t3 + 16], m1
-
- add t3, t6
- add t4, t7
- add t0, t5
- lea t1, [t1 + t5 * 2]
-
- dec t8d
+ movu [r2], m2
+ movu [r2 + 16], m1
+
+ add r2, r5
+ add r3, r6
+ add r0, r4
+ lea r1, [r1 + r4 * 2]
+
+ dec t7b
jnz .loop
+ RET
%endif ;HIGH_BIT_DEPTH
- RET
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal calcRecons32
%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,6
+cglobal calcRecons32, 5,8,4
+ %define t7b r7b
%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,6
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
+cglobal calcRecons32, 5,7,4,0-1
+ %define t7b byte [rsp]
%endif
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
- add t7, t7
-%endif
+ mov r6d, r6m
+ add r4d, r4d
+ add r5d, r5d
+ add r6d, r6d
pxor m4, m4
mova m5, [pw_pixel_max]
- add t5, t5
- mov t8d, 32/2
+ mov t7b, 32/2
.loop:
- movu m0, [t0]
- movu m1, [t0 + 16]
- movu m2, [t1]
- movu m3, [t1 + 16]
+ movu m0, [r0]
+ movu m1, [r0 + 16]
+ movu m2, [r1]
+ movu m3, [r1 + 16]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
- ; store recon[] and recipred[]
- movu [t4], m0
- movu [t4 + 16], m1
+ ; store recipred[]
+ movu [r3], m0
+ movu [r3 + 16], m1
; store recqt[]
- movu [t3], m0
- movu [t3 + 16], m1
-
- movu m0, [t0 + 32]
- movu m1, [t0 + 48]
- movu m2, [t1 + 32]
- movu m3, [t1 + 48]
+ movu [r2], m0
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + 32]
+ movu m1, [r0 + 48]
+ movu m2, [r1 + 32]
+ movu m3, [r1 + 48]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
; store recon[] and recipred[]
- movu [t4 + 32], m0
- movu [t4 + 48], m1
-%if ARCH_X86_64 == 0
- add t4, t7
- add t4, t7
-%endif
+ movu [r3 + 32], m0
+ movu [r3 + 48], m1
; store recqt[]
- movu [t3 + 32], m0
- movu [t3 + 48], m1
- add t3, t6
-
- movu m0, [t0 + t5]
- movu m1, [t0 + t5 + 16]
- movu m2, [t1 + t5]
- movu m3, [t1 + t5 + 16]
+ movu [r2 + 32], m0
+ movu [r2 + 48], m1
+ add r2, r5
+
+ movu m0, [r0 + r4]
+ movu m1, [r0 + r4 + 16]
+ movu m2, [r1 + r4]
+ movu m3, [r1 + r4 + 16]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
; store recon[] and recipred[]
-%if ARCH_X86_64 == 0
- movu [t4], m0
- movu [t4 + 16], m1
-%else
- movu [t4 + t7], m0
- movu [t4 + t7 + 16], m1
-%endif
+ movu [r3 + r6], m0
+ movu [r3 + r6 + 16], m1
; store recqt[]
- movu [t3], m0
- movu [t3 + 16], m1
-
- movu m0, [t0 + t5 + 32]
- movu m1, [t0 + t5 + 48]
- movu m2, [t1 + t5 + 32]
- movu m3, [t1 + t5 + 48]
+ movu [r2], m0
+ movu [r2 + 16], m1
+
+ movu m0, [r0 + r4 + 32]
+ movu m1, [r0 + r4 + 48]
+ movu m2, [r1 + r4 + 32]
+ movu m3, [r1 + r4 + 48]
paddw m0, m2
paddw m1, m3
CLIPW m0, m4, m5
CLIPW m1, m4, m5
; store recon[] and recipred[]
-%if ARCH_X86_64 == 0
- movu [t4 + 32], m0
- movu [t4 + 48], m1
- add t4, t7
- add t4, t7
-%else
- movu [t4 + t7 + 32], m0
- movu [t4 + t7 + 48], m1
- lea t4, [t4 + t7 * 2]
-%endif
+ movu [r3 + r6 + 32], m0
+ movu [r3 + r6 + 48], m1
+ lea r3, [r3 + r6 * 2]
; store recqt[]
- movu [t3 + 32], m0
- movu [t3 + 48], m1
- add t3, t6
-
- lea t0, [t0 + t5 * 2]
- lea t1, [t1 + t5 * 2]
-
- dec t8d
+ movu [r2 + 32], m0
+ movu [r2 + 48], m1
+ add r2, r5
+
+ lea r0, [r0 + r4 * 2]
+ lea r1, [r1 + r4 * 2]
+
+ dec t7b
jnz .loop
+ RET
%else ;HIGH_BIT_DEPTH
INIT_XMM sse4
-cglobal calcRecons32
%if ARCH_X86_64 == 1
- DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
- PROLOGUE 6,9,5
+cglobal calcRecons32, 5,8,4
+ %define t7b r7b
%else
- DECLARE_REG_TMP 0,1,2,3,4,5
- PROLOGUE 6,7,5
- %define t6 r6m
- %define t6d r6d
- %define t7 r7m
- %define t8d r6d
+cglobal calcRecons32, 5,7,4,0-1
+ %define t7b byte [rsp]
%endif
- mov t6d, r6m
-%if ARCH_X86_64 == 0
- add t6d, t6d
- mov r6m, t6d
-%else
+ mov r4d, r4m
mov r5d, r5m
- mov r7d, r7m
- add t6d, t6d
-%endif
+ mov r6d, r6m
+ add r5d, r5d
pxor m0, m0
- mov t8d, 32
+ mov t7b, 32
.loop:
- movu m2, [t0]
- movu m4, [t0 + 16]
+ movu m2, [r0]
+ movu m4, [r0 + 16]
pmovzxbw m1, m2
punpckhbw m2, m0
pmovzxbw m3, m4
punpckhbw m4, m0
- paddw m1, [t1 + 0 * 16]
- paddw m2, [t1 + 1 * 16]
+ paddw m1, [r1 + 0 * 16]
+ paddw m2, [r1 + 1 * 16]
packuswb m1, m2
- paddw m3, [t1 + 2 * 16]
- paddw m4, [t1 + 3 * 16]
+ paddw m3, [r1 + 2 * 16]
+ paddw m4, [r1 + 3 * 16]
packuswb m3, m4
; store recon[] and recipred[]
- movu [t4], m1
- movu [t4 + 16], m3
+ movu [r3], m1
+ movu [r3 + 16], m3
; store recqt[]
pmovzxbw m2, m1
punpckhbw m1, m0
- movu [t3 + 0 * 16], m2
- movu [t3 + 1 * 16], m1
+ movu [r2 + 0 * 16], m2
+ movu [r2 + 1 * 16], m1
pmovzxbw m4, m3
punpckhbw m3, m0
- movu [t3 + 2 * 16], m4
- movu [t3 + 3 * 16], m3
-
- add t3, t6
- add t4, t7
- add t0, t5
- lea t1, [t1 + t5 * 2]
-
- dec t8d
+ movu [r2 + 2 * 16], m4
+ movu [r2 + 3 * 16], m3
+
+ add r2, r5
+ add r3, r6
+ add r0, r4
+ lea r1, [r1 + r4 * 2]
+
+ dec t7b
jnz .loop
+ RET
%endif ;HIGH_BIT_DEPTH
- RET
;-----------------------------------------------------------------------------
diff -r 0206822d9fea -r 4348a3ed1b32 source/test/pixelharness.cpp
--- a/source/test/pixelharness.cpp Tue Apr 01 23:28:32 2014 +0530
+++ b/source/test/pixelharness.cpp Wed Apr 02 13:12:50 2014 -0700
@@ -354,10 +354,8 @@
int stride = STRIDE;
int index1 = rand() % TEST_CASES;
int index2 = rand() % TEST_CASES;
- ref(pixel_test_buff[index1] + j, short_test_buff[index2] + j,
- ref_reco, ref_recq, ref_pred, stride, stride, stride);
- opt(pixel_test_buff[index1] + j, short_test_buff[index2] + j,
- opt_reco, opt_recq, opt_pred, stride, stride, stride);
+ ref(pixel_test_buff[index1] + j, short_test_buff[index2] + j, ref_recq, ref_pred, stride, stride, stride);
+ opt(pixel_test_buff[index1] + j, short_test_buff[index2] + j, opt_recq, opt_pred, stride, stride, stride);
if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(int16_t)))
{
@@ -1609,7 +1607,7 @@
if (opt.calcrecon[i])
{
HEADER("recon[%dx%d]", 4 << i, 4 << i);
- REPORT_SPEEDUP(opt.calcrecon[i], ref.calcrecon[i], pbuf1, sbuf1, pbuf2, sbuf1, pbuf1, 64, 64, 64);
+ REPORT_SPEEDUP(opt.calcrecon[i], ref.calcrecon[i], pbuf1, sbuf1, sbuf1, pbuf1, 64, 64, 64);
}
if (opt.blockfill_s[i])
More information about the x265-devel
mailing list