[x265] [PATCH 1 of 2] remove unused parameter *recon from assembly code
Steve Borho
steve at borho.org
Wed Apr 2 20:38:34 CEST 2014
On Wed, Apr 2, 2014 at 3:34 PM, Min Chen <chenm003 at 163.com> wrote:
> # HG changeset patch
> # User Min Chen <chenm003 at 163.com>
> # Date 1396469570 25200
> # Node ID 4348a3ed1b3201bc18d80ed51bfc0fccc24d3fcf
> # Parent 0206822d9fea295c199a0ad192e8fc5e1f2b9124
> remove unused parwameter *recon from assembly code
queued with this typo fixed ^
>
> diff -r 0206822d9fea -r 4348a3ed1b32 source/Lib/TLibEncoder/TEncSearch.cpp
> --- a/source/Lib/TLibEncoder/TEncSearch.cpp Tue Apr 01 23:28:32 2014 +0530
> +++ b/source/Lib/TLibEncoder/TEncSearch.cpp Wed Apr 02 13:12:50 2014 -0700
> @@ -465,7 +465,7 @@
>
> assert(width <= 32);
> //===== reconstruction =====
> - primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> + primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, MAX_CU_SIZE, reconIPredStride);
> //===== update distortion =====
> outDist += primitives.sse_sp[part](reconQt, MAX_CU_SIZE, fenc, stride);
> }
> @@ -587,7 +587,7 @@
> assert(((intptr_t)residual & (width - 1)) == 0);
> assert(width <= 32);
> //===== reconstruction =====
> - primitives.calcrecon[size](pred, residual, 0, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> + primitives.calcrecon[size](pred, residual, reconQt, reconIPred, stride, reconQtStride, reconIPredStride);
> //===== update distortion =====
> uint32_t dist = primitives.sse_sp[part](reconQt, reconQtStride, fenc, stride);
> if (ttype == TEXT_CHROMA_U)
> diff -r 0206822d9fea -r 4348a3ed1b32 source/common/pixel.cpp
> --- a/source/common/pixel.cpp Tue Apr 01 23:28:32 2014 +0530
> +++ b/source/common/pixel.cpp Wed Apr 02 13:12:50 2014 -0700
> @@ -460,9 +460,7 @@
> }
>
> template<int blockSize>
> -void calcRecons(pixel* pred, int16_t* residual,
> - pixel*,
> - int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
> +void calcRecons(pixel* pred, int16_t* residual, int16_t* recqt, pixel* recipred, int stride, int qtstride, int ipredstride)
> {
> for (int y = 0; y < blockSize; y++)
> {
> diff -r 0206822d9fea -r 4348a3ed1b32 source/common/primitives.h
> --- a/source/common/primitives.h Tue Apr 01 23:28:32 2014 +0530
> +++ b/source/common/primitives.h Wed Apr 02 13:12:50 2014 -0700
> @@ -125,7 +125,7 @@
> typedef void (*dct_t)(int16_t *src, int32_t *dst, intptr_t stride);
> typedef void (*idct_t)(int32_t *src, int16_t *dst, intptr_t stride);
> typedef void (*calcresidual_t)(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> -typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +typedef void (*calcrecon_t)(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> typedef void (*transpose_t)(pixel* dst, pixel* src, intptr_t stride);
> typedef uint32_t (*quant_t)(int32_t *coef, int32_t *quantCoeff, int32_t *deltaU, int32_t *qCoef, int qBits, int add, int numCoeff, int32_t* lastPos);
> typedef void (*dequant_scaling_t)(const int32_t* src, const int32_t *dequantCoef, int32_t* dst, int num, int mcqp_miper, int shift);
> diff -r 0206822d9fea -r 4348a3ed1b32 source/common/x86/pixel-util.h
> --- a/source/common/x86/pixel-util.h Tue Apr 01 23:28:32 2014 +0530
> +++ b/source/common/x86/pixel-util.h Wed Apr 02 13:12:50 2014 -0700
> @@ -24,12 +24,12 @@
> #ifndef X265_PIXEL_UTIL_H
> #define X265_PIXEL_UTIL_H
>
> -void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> -void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> -void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> -void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> -void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> -void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
> +void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
>
> void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
> diff -r 0206822d9fea -r 4348a3ed1b32 source/common/x86/pixel-util8.asm
> --- a/source/common/x86/pixel-util8.asm Tue Apr 01 23:28:32 2014 +0530
> +++ b/source/common/x86/pixel-util8.asm Wed Apr 02 13:12:50 2014 -0700
> @@ -58,590 +58,452 @@
> cextern pw_pixel_max
>
> ;-----------------------------------------------------------------------------
> -; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> +; void calcrecon(pixel* pred, int16_t* residual, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
> ;-----------------------------------------------------------------------------
> INIT_XMM sse2
> -cglobal calcRecons4
> %if HIGH_BIT_DEPTH
> %if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,6
> +cglobal calcRecons4, 5,8,4
> + %define t7b r7b
> %else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,6
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> +cglobal calcRecons4, 5,7,4,0-1
> + %define t7b byte [rsp]
> %endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> - add t7, t7
> -%endif
> + mov r6d, r6m
> + add r4d, r4d
> + add r5d, r5d
> + add r6d, r6d
>
> pxor m4, m4
> mova m5, [pw_pixel_max]
> - add t5, t5
> - mov t8d, 4/2
> + mov t7b, 4/2
> .loop:
> - movh m0, [t0]
> - movh m1, [t0 + t5]
> + movh m0, [r0]
> + movh m1, [r0 + r4]
> punpcklqdq m0, m1
> - movh m2, [t1]
> - movh m3, [t1 + t5]
> + movh m2, [r1]
> + movh m3, [r1 + r4]
> punpcklqdq m2, m3
> paddw m0, m2
> CLIPW m0, m4, m5
>
> - ; store recon[] and recipred[]
> - movh [t4], m0
> -%if ARCH_X86_64 == 0
> - add t4, t7
> - add t4, t7
> - movhps [t4], m0
> - add t4, t7
> - add t4, t7
> + ; store recipred[]
> + movh [r3], m0
> + movhps [r3 + r6], m0
> +
> + ; store recqt[]
> + movh [r2], m0
> + movhps [r2 + r5], m0
> +
> + lea r0, [r0 + r4 * 2]
> + lea r1, [r1 + r4 * 2]
> + lea r2, [r2 + r5 * 2]
> + lea r3, [r3 + r6 * 2]
> +
> + dec t7b
> + jnz .loop
> + RET
> +%else ;HIGH_BIT_DEPTH
> +
> +%if ARCH_X86_64 == 1
> +cglobal calcRecons4, 5,8,4
> + %define t7b r7b
> %else
> - movhps [t4 + t7], m0
> - lea t4, [t4 + t7 * 2]
> +cglobal calcRecons4, 5,7,4,0-1
> + %define t7b byte [rsp]
> %endif
> -
> - ; store recqt[]
> - movh [t3], m0
> - add t3, t6
> - movhps [t3], m0
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 2]
> -
> - dec t8d
> - jnz .loop
> -
> -%else ;HIGH_BIT_DEPTH
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,4
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,4
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> + mov r6d, r6m
> + add r5d, r5d
>
> pxor m0, m0
> - mov t8d, 4/2
> + mov t7b, 4/2
> .loop:
> - movd m1, [t0]
> - movd m2, [t0 + t5]
> + movd m1, [r0]
> + movd m2, [r0 + r4]
> punpckldq m1, m2
> punpcklbw m1, m0
> - movh m2, [t1]
> - movh m3, [t1 + t5 * 2]
> + movh m2, [r1]
> + movh m3, [r1 + r4 * 2]
> punpcklqdq m2, m3
> paddw m1, m2
> packuswb m1, m1
>
> ; store recon[] and recipred[]
> - movd [t4], m1
> - add t4, t7
> + movd [r3], m1
> pshufd m2, m1, 1
> - movd [t4], m2
> - add t4, t7
> + movd [r3 + r6], m2
>
> ; store recqt[]
> punpcklbw m1, m0
> - movlps [t3], m1
> - add t3, t6
> - movhps [t3], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 4]
> -
> - dec t8d
> + movlps [r2], m1
> + movhps [r2 + r5], m1
> +
> + lea r0, [r0 + r4 * 2]
> + lea r1, [r1 + r4 * 4]
> + lea r2, [r2 + r5 * 2]
> + lea r3, [r3 + r6 * 2]
> +
> + dec t7b
> jnz .loop
> + RET
> %endif ;HIGH_BIT_DEPTH
> - RET
>
>
> INIT_XMM sse2
> -cglobal calcRecons8
> +%if ARCH_X86_64 == 1
> +cglobal calcRecons8, 5,8,4
> + %define t7b r7b
> +%else
> +cglobal calcRecons8, 5,7,4,0-1
> + %define t7b byte [rsp]
> +%endif
> +
> %if HIGH_BIT_DEPTH
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,6
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,6
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> - add t7, t7
> -%endif
> + mov r6d, r6m
> + add r4d, r4d
> + add r5d, r5d
> + add r6d, r6d
>
> pxor m4, m4
> mova m5, [pw_pixel_max]
> - add t5, t5
> - mov t8d, 8/2
> + mov t7b, 8/2
> .loop:
> - movu m0, [t0]
> - movu m1, [t0 + t5]
> - movu m2, [t1]
> - movu m3, [t1 + t5]
> + movu m0, [r0]
> + movu m1, [r0 + r4]
> + movu m2, [r1]
> + movu m3, [r1 + r4]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> - ; store recon[] and recipred[]
> - movu [t4], m0
> -%if ARCH_X86_64 == 0
> - add t4, t7
> - add t4, t7
> - movu [t4], m1
> - add t4, t7
> - add t4, t7
> -%else
> - movu [t4 + t7], m1
> - lea t4, [t4 + t7 * 2]
> -%endif
> + ; store recipred[]
> + movu [r3], m0
> + movu [r3 + r6], m1
>
> ; store recqt[]
> - movu [t3], m0
> - add t3, t6
> - movu [t3], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 2]
> -
> - dec t8d
> + movu [r2], m0
> + movu [r2 + r5], m1
> +
> + lea r0, [r0 + r4 * 2]
> + lea r1, [r1 + r4 * 2]
> + lea r2, [r2 + r5 * 2]
> + lea r3, [r3 + r6 * 2]
> +
> + dec t7b
> jnz .loop
> + RET
> %else ;HIGH_BIT_DEPTH
>
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,5
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,5
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> + mov r6d, r6m
> + add r5d, r5d
>
> pxor m0, m0
> - mov t8d, 8/2
> + mov t7b, 8/2
> .loop:
> - movh m1, [t0]
> - movh m2, [t0 + t5]
> + movh m1, [r0]
> + movh m2, [r0 + r4]
> punpcklbw m1, m0
> punpcklbw m2, m0
> - movu m3, [t1]
> - movu m4, [t1 + t5 * 2]
> + movu m3, [r1]
> + movu m4, [r1 + r4 * 2]
> paddw m1, m3
> paddw m2, m4
> packuswb m1, m2
>
> ; store recon[] and recipred[]
> - movlps [t4], m1
> -%if ARCH_X86_64 == 0
> - add t4, t7
> - movhps [t4], m1
> - add t4, t7
> -%else
> - movhps [t4 + t7], m1
> - lea t4, [t4 + t7 * 2]
> -%endif
> + movlps [r3], m1
> + movhps [r3 + r6], m1
>
> ; store recqt[]
> punpcklbw m2, m1, m0
> punpckhbw m1, m0
> - movu [t3], m2
> - add t3, t6
> - movu [t3], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 4]
> -
> - dec t8d
> + movu [r2], m2
> + movu [r2 + r5], m1
> +
> + lea r0, [r0 + r4 * 2]
> + lea r1, [r1 + r4 * 4]
> + lea r2, [r2 + r5 * 2]
> + lea r3, [r3 + r6 * 2]
> +
> + dec t7b
> jnz .loop
> + RET
> %endif ;HIGH_BIT_DEPTH
> - RET
>
>
>
> %if HIGH_BIT_DEPTH
> INIT_XMM sse2
> -cglobal calcRecons16
> %if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,6
> +cglobal calcRecons16, 5,8,4
> + %define t7b r7b
> %else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,6
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> +cglobal calcRecons16, 5,7,4,0-1
> + %define t7b byte [rsp]
> %endif
>
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> - add t7, t7
> -%endif
> + mov r6d, r6m
> + add r4d, r4d
> + add r5d, r5d
> + add r6d, r6d
>
> pxor m4, m4
> mova m5, [pw_pixel_max]
> - add t5, t5
> - mov t8d, 16/2
> + mov t7b, 16/2
> .loop:
> - movu m0, [t0]
> - movu m1, [t0 + 16]
> - movu m2, [t1]
> - movu m3, [t1 + 16]
> + movu m0, [r0]
> + movu m1, [r0 + 16]
> + movu m2, [r1]
> + movu m3, [r1 + 16]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> - ; store recon[] and recipred[]
> - movu [t4], m0
> - movu [t4 + 16], m1
> -%if ARCH_X86_64 == 0
> - add t4, t7
> - add t4, t7
> -%endif
> + ; store recipred[]
> + movu [r3], m0
> + movu [r3 + 16], m1
>
> ; store recqt[]
> - movu [t3], m0
> - movu [t3 + 16], m1
> - add t3, t6
> -
> - movu m0, [t0 + t5]
> - movu m1, [t0 + t5 + 16]
> - movu m2, [t1 + t5]
> - movu m3, [t1 + t5 + 16]
> + movu [r2], m0
> + movu [r2 + 16], m1
> +
> + movu m0, [r0 + r4]
> + movu m1, [r0 + r4 + 16]
> + movu m2, [r1 + r4]
> + movu m3, [r1 + r4 + 16]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> -%if ARCH_X86_64 == 0
> - movu [t4], m0
> - movu [t4 + 16], m1
> - add t4, t7
> - add t4, t7
> + movu [r3 + r6], m0
> + movu [r3 + r6 + 16], m1
> +
> + ; store recqt[]
> + movu [r2 + r5], m0
> + movu [r2 + r5 + 16], m1
> +
> + lea r0, [r0 + r4 * 2]
> + lea r1, [r1 + r4 * 2]
> + lea r2, [r2 + r5 * 2]
> + lea r3, [r3 + r6 * 2]
> +
> + dec t7b
> + jnz .loop
> + RET
> +%else ;HIGH_BIT_DEPTH
> +
> +INIT_XMM sse4
> +%if ARCH_X86_64 == 1
> +cglobal calcRecons16, 5,8,4
> + %define t7b r7b
> %else
> - movu [t4 + t7], m0
> - movu [t4 + t7 + 16], m1
> - lea t4, [t4 + t7 * 2]
> +cglobal calcRecons16, 5,7,4,0-1
> + %define t7b byte [rsp]
> %endif
>
> - ; store recqt[]
> - movu [t3], m0
> - movu [t3 + 16], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 2]
> -
> - dec t8d
> - jnz .loop
> -%else ;HIGH_BIT_DEPTH
> -INIT_XMM sse4
> -cglobal calcRecons16
> -%if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,3
> -%else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,3
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> -%endif
> -
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> + mov r6d, r6m
> + add r5d, r5d
>
> pxor m0, m0
> - mov t8d, 16
> + mov t7b, 16
> .loop:
> - movu m2, [t0]
> + movu m2, [r0]
> pmovzxbw m1, m2
> punpckhbw m2, m0
> - paddw m1, [t1]
> - paddw m2, [t1 + 16]
> + paddw m1, [r1]
> + paddw m2, [r1 + 16]
> packuswb m1, m2
>
> ; store recon[] and recipred[]
> - movu [t4], m1
> + movu [r3], m1
>
> ; store recqt[]
> pmovzxbw m2, m1
> punpckhbw m1, m0
> - movu [t3], m2
> - movu [t3 + 16], m1
> -
> - add t3, t6
> - add t4, t7
> - add t0, t5
> - lea t1, [t1 + t5 * 2]
> -
> - dec t8d
> + movu [r2], m2
> + movu [r2 + 16], m1
> +
> + add r2, r5
> + add r3, r6
> + add r0, r4
> + lea r1, [r1 + r4 * 2]
> +
> + dec t7b
> jnz .loop
> + RET
> %endif ;HIGH_BIT_DEPTH
> - RET
>
> %if HIGH_BIT_DEPTH
> INIT_XMM sse2
> -cglobal calcRecons32
> %if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,6
> +cglobal calcRecons32, 5,8,4
> + %define t7b r7b
> %else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,6
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> +cglobal calcRecons32, 5,7,4,0-1
> + %define t7b byte [rsp]
> %endif
>
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> - add t7, t7
> -%endif
> + mov r6d, r6m
> + add r4d, r4d
> + add r5d, r5d
> + add r6d, r6d
>
> pxor m4, m4
> mova m5, [pw_pixel_max]
> - add t5, t5
> - mov t8d, 32/2
> + mov t7b, 32/2
> .loop:
>
> - movu m0, [t0]
> - movu m1, [t0 + 16]
> - movu m2, [t1]
> - movu m3, [t1 + 16]
> + movu m0, [r0]
> + movu m1, [r0 + 16]
> + movu m2, [r1]
> + movu m3, [r1 + 16]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> - ; store recon[] and recipred[]
> - movu [t4], m0
> - movu [t4 + 16], m1
> + ; store recipred[]
> + movu [r3], m0
> + movu [r3 + 16], m1
>
> ; store recqt[]
> - movu [t3], m0
> - movu [t3 + 16], m1
> -
> - movu m0, [t0 + 32]
> - movu m1, [t0 + 48]
> - movu m2, [t1 + 32]
> - movu m3, [t1 + 48]
> + movu [r2], m0
> + movu [r2 + 16], m1
> +
> + movu m0, [r0 + 32]
> + movu m1, [r0 + 48]
> + movu m2, [r1 + 32]
> + movu m3, [r1 + 48]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> - movu [t4 + 32], m0
> - movu [t4 + 48], m1
> -%if ARCH_X86_64 == 0
> - add t4, t7
> - add t4, t7
> -%endif
> + movu [r3 + 32], m0
> + movu [r3 + 48], m1
>
> ; store recqt[]
> - movu [t3 + 32], m0
> - movu [t3 + 48], m1
> - add t3, t6
> -
> - movu m0, [t0 + t5]
> - movu m1, [t0 + t5 + 16]
> - movu m2, [t1 + t5]
> - movu m3, [t1 + t5 + 16]
> + movu [r2 + 32], m0
> + movu [r2 + 48], m1
> + add r2, r5
> +
> + movu m0, [r0 + r4]
> + movu m1, [r0 + r4 + 16]
> + movu m2, [r1 + r4]
> + movu m3, [r1 + r4 + 16]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> -%if ARCH_X86_64 == 0
> - movu [t4], m0
> - movu [t4 + 16], m1
> -%else
> - movu [t4 + t7], m0
> - movu [t4 + t7 + 16], m1
> -%endif
> + movu [r3 + r6], m0
> + movu [r3 + r6 + 16], m1
>
> ; store recqt[]
> - movu [t3], m0
> - movu [t3 + 16], m1
> -
> - movu m0, [t0 + t5 + 32]
> - movu m1, [t0 + t5 + 48]
> - movu m2, [t1 + t5 + 32]
> - movu m3, [t1 + t5 + 48]
> + movu [r2], m0
> + movu [r2 + 16], m1
> +
> + movu m0, [r0 + r4 + 32]
> + movu m1, [r0 + r4 + 48]
> + movu m2, [r1 + r4 + 32]
> + movu m3, [r1 + r4 + 48]
> paddw m0, m2
> paddw m1, m3
> CLIPW m0, m4, m5
> CLIPW m1, m4, m5
>
> ; store recon[] and recipred[]
> -%if ARCH_X86_64 == 0
> - movu [t4 + 32], m0
> - movu [t4 + 48], m1
> - add t4, t7
> - add t4, t7
> -%else
> - movu [t4 + t7 + 32], m0
> - movu [t4 + t7 + 48], m1
> - lea t4, [t4 + t7 * 2]
> -%endif
> + movu [r3 + r6 + 32], m0
> + movu [r3 + r6 + 48], m1
> + lea r3, [r3 + r6 * 2]
>
> ; store recqt[]
> - movu [t3 + 32], m0
> - movu [t3 + 48], m1
> - add t3, t6
> -
> - lea t0, [t0 + t5 * 2]
> - lea t1, [t1 + t5 * 2]
> -
> - dec t8d
> + movu [r2 + 32], m0
> + movu [r2 + 48], m1
> + add r2, r5
> +
> + lea r0, [r0 + r4 * 2]
> + lea r1, [r1 + r4 * 2]
> +
> + dec t7b
> jnz .loop
> + RET
> %else ;HIGH_BIT_DEPTH
> INIT_XMM sse4
> -cglobal calcRecons32
> %if ARCH_X86_64 == 1
> - DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
> - PROLOGUE 6,9,5
> +cglobal calcRecons32, 5,8,4
> + %define t7b r7b
> %else
> - DECLARE_REG_TMP 0,1,2,3,4,5
> - PROLOGUE 6,7,5
> - %define t6 r6m
> - %define t6d r6d
> - %define t7 r7m
> - %define t8d r6d
> +cglobal calcRecons32, 5,7,4,0-1
> + %define t7b byte [rsp]
> %endif
>
> - mov t6d, r6m
> -%if ARCH_X86_64 == 0
> - add t6d, t6d
> - mov r6m, t6d
> -%else
> + mov r4d, r4m
> mov r5d, r5m
> - mov r7d, r7m
> - add t6d, t6d
> -%endif
> + mov r6d, r6m
> + add r5d, r5d
>
> pxor m0, m0
> - mov t8d, 32
> + mov t7b, 32
> .loop:
> - movu m2, [t0]
> - movu m4, [t0 + 16]
> + movu m2, [r0]
> + movu m4, [r0 + 16]
> pmovzxbw m1, m2
> punpckhbw m2, m0
> pmovzxbw m3, m4
> punpckhbw m4, m0
>
> - paddw m1, [t1 + 0 * 16]
> - paddw m2, [t1 + 1 * 16]
> + paddw m1, [r1 + 0 * 16]
> + paddw m2, [r1 + 1 * 16]
> packuswb m1, m2
>
> - paddw m3, [t1 + 2 * 16]
> - paddw m4, [t1 + 3 * 16]
> + paddw m3, [r1 + 2 * 16]
> + paddw m4, [r1 + 3 * 16]
> packuswb m3, m4
>
> ; store recon[] and recipred[]
> - movu [t4], m1
> - movu [t4 + 16], m3
> + movu [r3], m1
> + movu [r3 + 16], m3
>
> ; store recqt[]
> pmovzxbw m2, m1
> punpckhbw m1, m0
> - movu [t3 + 0 * 16], m2
> - movu [t3 + 1 * 16], m1
> + movu [r2 + 0 * 16], m2
> + movu [r2 + 1 * 16], m1
> pmovzxbw m4, m3
> punpckhbw m3, m0
> - movu [t3 + 2 * 16], m4
> - movu [t3 + 3 * 16], m3
> -
> - add t3, t6
> - add t4, t7
> - add t0, t5
> - lea t1, [t1 + t5 * 2]
> -
> - dec t8d
> + movu [r2 + 2 * 16], m4
> + movu [r2 + 3 * 16], m3
> +
> + add r2, r5
> + add r3, r6
> + add r0, r4
> + lea r1, [r1 + r4 * 2]
> +
> + dec t7b
> jnz .loop
> + RET
> %endif ;HIGH_BIT_DEPTH
> - RET
>
>
> ;-----------------------------------------------------------------------------
> diff -r 0206822d9fea -r 4348a3ed1b32 source/test/pixelharness.cpp
> --- a/source/test/pixelharness.cpp Tue Apr 01 23:28:32 2014 +0530
> +++ b/source/test/pixelharness.cpp Wed Apr 02 13:12:50 2014 -0700
> @@ -354,10 +354,8 @@
> int stride = STRIDE;
> int index1 = rand() % TEST_CASES;
> int index2 = rand() % TEST_CASES;
> - ref(pixel_test_buff[index1] + j, short_test_buff[index2] + j,
> - ref_reco, ref_recq, ref_pred, stride, stride, stride);
> - opt(pixel_test_buff[index1] + j, short_test_buff[index2] + j,
> - opt_reco, opt_recq, opt_pred, stride, stride, stride);
> + ref(pixel_test_buff[index1] + j, short_test_buff[index2] + j, ref_recq, ref_pred, stride, stride, stride);
> + opt(pixel_test_buff[index1] + j, short_test_buff[index2] + j, opt_recq, opt_pred, stride, stride, stride);
>
> if (memcmp(ref_recq, opt_recq, 64 * 64 * sizeof(int16_t)))
> {
> @@ -1609,7 +1607,7 @@
> if (opt.calcrecon[i])
> {
> HEADER("recon[%dx%d]", 4 << i, 4 << i);
> - REPORT_SPEEDUP(opt.calcrecon[i], ref.calcrecon[i], pbuf1, sbuf1, pbuf2, sbuf1, pbuf1, 64, 64, 64);
> + REPORT_SPEEDUP(opt.calcrecon[i], ref.calcrecon[i], pbuf1, sbuf1, sbuf1, pbuf1, 64, 64, 64);
> }
>
> if (opt.blockfill_s[i])
>
> _______________________________________________
> x265-devel mailing list
> x265-devel at videolan.org
> https://mailman.videolan.org/listinfo/x265-devel
--
Steve Borho
More information about the x265-devel
mailing list