[x265] [PATCH] asm: 10bpp code for calcrecon_4x4 and 8x8
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Wed Dec 11 09:03:33 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386748798 -19800
# Wed Dec 11 13:29:58 2013 +0530
# Node ID 15d12e33cbf8e2766aeb2b79fed578323a66a93f
# Parent 1686a3f8b3d04a33dfff3da8ca55c0acc1684bc1
asm: 10bpp code for calcrecon_4x4 and 8x8
diff -r 1686a3f8b3d0 -r 15d12e33cbf8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 11 12:46:46 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 11 13:29:58 2013 +0530
@@ -676,6 +676,9 @@
p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
+
+ p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
+ p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 1686a3f8b3d0 -r 15d12e33cbf8 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Dec 11 12:46:46 2013 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Dec 11 13:29:58 2013 +0530
@@ -55,12 +55,80 @@
cextern pw_1
cextern pw_00ff
cextern pw_2000
+cextern pw_pixel_max
;-----------------------------------------------------------------------------
; void calcrecon(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal calcRecons4
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,6 ; m0-m5 are used below; NOTE(review): third argument is presumably the XMM register count per x86inc
+%else ; x86-32 build: t6/t7 name the r6m/r7m argument slots, while t6d and t8d both name r6d
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,6
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+ ; High-bit-depth path: pred and residual are read as 16-bit words, two rows of 4 per loop pass.
+ mov t6d, r6m ; t6d = strideqt (argument 6 in the signature comment)
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt * 2: element count -> byte offset for the int16_t reconqt buffer
+ mov r6m, t6d ; write the doubled value back so "add t3, t6" reads it from memory; r6d is then reused as the row counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit write, zero-extends stride into the full r5
+ mov r7d, r7m ; r7d = strideipred (argument 7)
+ add t6d, t6d ; strideqt * 2
+ add t7, t7 ; strideipred * 2 (samples are treated as 2-byte words here)
+%endif
+
+ pxor m4, m4 ; m4 = 0, handed to CLIPW as its second operand
+ mova m5, [pw_pixel_max] ; m5 = pw_pixel_max constant, handed to CLIPW as its third operand
+ add t5, t5 ; stride * 2, the row offset used for pred, residual and recon
+ mov t8d, 4/2 ; loop count: 4 rows, 2 rows per iteration
+.loop:
+ movh m0, [t0] ; pred row 0 (4 words)
+ movh m1, [t0 + t5] ; pred row 1
+ punpcklqdq m0, m1 ; m0 = row 0 in the low half, row 1 in the high half
+ movh m2, [t1] ; residual row 0
+ movh m3, [t1 + t5] ; residual row 1 (indexed with the same t5 offset as pred)
+ punpcklqdq m2, m3 ; m2 = both residual rows, same layout as m0
+ paddw m0, m2 ; pred + residual, word-wise
+ CLIPW m0, m4, m5 ; NOTE(review): presumably clamps each word to [m4, m5] = [0, pw_pixel_max]; confirm against the CLIPW macro
+
+ ; store recon[] and recipred[]
+ movh [t2], m0 ; recon row 0
+ movh [t4], m0 ; recipred row 0
+%if ARCH_X86_64 == 0
+ add t4, t7 ; t7 is the undoubled strideipred in memory, so it is added twice per row
+ add t4, t7
+ movhps [t4], m0 ; recipred row 1, taken from the high half of m0
+ add t4, t7
+ add t4, t7
+%else
+ movhps [t4 + t7], m0 ; recipred row 1, taken from the high half of m0
+ lea t4, [t4 + t7 * 2] ; advance recipred by two rows
+%endif
+ movhps [t2 + t5], m0 ; recon row 1
+
+ ; store recqt[]
+ movh [t3], m0 ; the same clamped words are written to recqt, row 0
+ add t3, t6 ; next recqt row (t6 holds strideqt * 2)
+ movhps [t3], m0 ; recqt row 1
+ add t3, t6
+
+ lea t0, [t0 + t5 * 2] ; advance pred by two rows
+ lea t1, [t1 + t5 * 2] ; advance residual by two rows
+ lea t2, [t2 + t5 * 2] ; advance recon by two rows
+
+ dec t8d
+ jnz .loop
+
+%else ;HIGH_BIT_DEPTH
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
PROLOGUE 6,9,4
@@ -118,11 +186,79 @@
dec t8d
jnz .loop
+%endif ;HIGH_BIT_DEPTH
RET
INIT_XMM sse2
cglobal calcRecons8
+%if HIGH_BIT_DEPTH
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
+ PROLOGUE 6,9,6 ; m0-m5 are used below; NOTE(review): third argument is presumably the XMM register count per x86inc
+%else ; x86-32 build: t6/t7 name the r6m/r7m argument slots, while t6d and t8d both name r6d
+ DECLARE_REG_TMP 0,1,2,3,4,5
+ PROLOGUE 6,7,6
+ %define t6 r6m
+ %define t6d r6d
+ %define t7 r7m
+ %define t8d r6d
+%endif
+ ; High-bit-depth path: pred and residual are read as 16-bit words, two rows of 8 per loop pass.
+ mov t6d, r6m ; t6d = strideqt (argument 6 in the signature comment)
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt * 2: element count -> byte offset for the int16_t reconqt buffer
+ mov r6m, t6d ; write the doubled value back so "add t3, t6" reads it from memory; r6d is then reused as the row counter (t8d)
+%else
+ mov r5d, r5m ; 32-bit write, zero-extends stride into the full r5
+ mov r7d, r7m ; r7d = strideipred (argument 7)
+ add t6d, t6d ; strideqt * 2
+ add t7, t7 ; strideipred * 2 (samples are treated as 2-byte words here)
+%endif
+
+ pxor m4, m4 ; m4 = 0, handed to CLIPW as its second operand
+ mova m5, [pw_pixel_max] ; m5 = pw_pixel_max constant, handed to CLIPW as its third operand
+ add t5, t5 ; stride * 2, the row offset used for pred, residual and recon
+ mov t8d, 8/2 ; loop count: 8 rows, 2 rows per iteration
+.loop:
+ movu m0, [t0] ; pred row 0 (8 words, one full XMM register)
+ movu m1, [t0 + t5] ; pred row 1
+ movu m2, [t1] ; residual row 0
+ movu m3, [t1 + t5] ; residual row 1 (indexed with the same t5 offset as pred)
+ paddw m0, m2 ; row 0: pred + residual, word-wise
+ paddw m1, m3 ; row 1: pred + residual, word-wise
+ CLIPW m0, m4, m5 ; NOTE(review): presumably clamps each word to [m4, m5] = [0, pw_pixel_max]; confirm against the CLIPW macro
+ CLIPW m1, m4, m5
+
+ ; store recon[] and recipred[]
+ movu [t2], m0 ; recon row 0
+ movu [t2 + t5], m1 ; recon row 1
+ movu [t4], m0 ; recipred row 0
+%if ARCH_X86_64 == 0
+ add t4, t7 ; t7 is the undoubled strideipred in memory, so it is added twice per row
+ add t4, t7
+ movu [t4], m1 ; recipred row 1
+ add t4, t7
+ add t4, t7
+%else
+ movu [t4 + t7], m1 ; recipred row 1
+ lea t4, [t4 + t7 * 2] ; advance recipred by two rows
+%endif
+
+ ; store recqt[]
+ movu [t3], m0 ; the same clamped words are written to recqt, row 0
+ add t3, t6 ; next recqt row (t6 holds strideqt * 2)
+ movu [t3], m1 ; recqt row 1
+ add t3, t6
+
+ lea t0, [t0 + t5 * 2] ; advance pred by two rows
+ lea t1, [t1 + t5 * 2] ; advance residual by two rows
+ lea t2, [t2 + t5 * 2] ; advance recon by two rows
+
+ dec t8d
+ jnz .loop
+%else ;HIGH_BIT_DEPTH
+
%if ARCH_X86_64 == 1
DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8
PROLOGUE 6,9,5
@@ -185,6 +321,7 @@
dec t8d
jnz .loop
+%endif ;HIGH_BIT_DEPTH
RET
More information about the x265-devel
mailing list