[x265] [PATCH] asm: 10bpp code for calcrecon_16x16 and 32x32
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Wed Dec 11 09:52:47 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386751746 -19800
# Wed Dec 11 14:19:06 2013 +0530
# Node ID 2e141f382fa809330244989c2822412e62b6015d
# Parent 15d12e33cbf8e2766aeb2b79fed578323a66a93f
asm: 10bpp code for calcrecon_16x16 and 32x32
diff -r 15d12e33cbf8 -r 2e141f382fa8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 11 13:29:58 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 11 14:19:06 2013 +0530
@@ -679,6 +679,8 @@
p.calcrecon[BLOCK_4x4] = x265_calcRecons4_sse2;
p.calcrecon[BLOCK_8x8] = x265_calcRecons8_sse2;
+ p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
+ p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 15d12e33cbf8 -r 2e141f382fa8 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Wed Dec 11 13:29:58 2013 +0530
+++ b/source/common/x86/pixel-util.h Wed Dec 11 14:19:06 2013 +0530
@@ -26,6 +26,8 @@
void x265_calcRecons4_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_calcRecons8_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons16_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
+void x265_calcRecons32_sse2(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_calcRecons16_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
void x265_calcRecons32_sse4(pixel* pred, int16_t* residual, pixel* recon, int16_t* reconqt, pixel *reconipred, int stride, int strideqt, int strideipred);
diff -r 15d12e33cbf8 -r 2e141f382fa8 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Wed Dec 11 13:29:58 2013 +0530
+++ b/source/common/x86/pixel-util8.asm Wed Dec 11 14:19:06 2013 +0530
@@ -325,6 +325,97 @@
RET
+
+%if HIGH_BIT_DEPTH ; SSE2 variant, assembled only when HIGH_BIT_DEPTH is set (the SSE4 variant is in the %else branch)
+INIT_XMM sse2
+cglobal calcRecons16 ; 16x16 block: clip(pred + residual) per word, written to recon, reconqt and reconipred
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 ; x86-64: t0-t8 are all register aliases
+ PROLOGUE 6,9,6 ; x86inc PROLOGUE: 6 args loaded into registers, 9 GPRs, 6 XMM registers (m0-m5)
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5 ; x86-32: only t0-t5 are register aliases
+ PROLOGUE 6,7,6
+ %define t6 r6m ; t6 (strideqt) is read through r6m rather than kept in a register
+ %define t6d r6d ; r6 is first used as scratch to double strideqt ...
+ %define t7 r7m ; t7 (strideipred) is read through r7m and is never doubled on this path
+ %define t8d r6d ; ... and afterwards as the loop counter (same register as t6d)
+%endif
+; t0=pred t1=residual t2=recon t3=reconqt t4=reconipred t5=stride t6=strideqt t7=strideipred (argument order of the prototype in pixel-util.h)
+ mov t6d, r6m ; fetch strideqt
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt *= 2: reconqt elements are int16_t, so this is the row pitch in bytes
+ mov r6m, t6d ; write the doubled value back to r6m, because r6 is overwritten by the loop counter below
+%else
+ mov r5d, r5m ; 32-bit move of stride; writing r5d clears the upper half of the 64-bit register
+ mov r7d, r7m ; fetch strideipred into t7
+ add t6d, t6d ; strideqt *= 2 (elements -> bytes)
+ add t7, t7 ; strideipred *= 2 (elements -> bytes)
+%endif
+
+ pxor m4, m4 ; m4 = 0: lower clip bound
+ mova m5, [pw_pixel_max] ; m5 = upper clip bound; pw_pixel_max is defined outside this patch (presumably the max pixel value in every word)
+ add t5, t5 ; stride *= 2 (elements -> bytes); shared by pred, residual and recon
+ mov t8d, 16/2 ; 16 rows, two rows handled per iteration
+.loop:
+ movu m0, [t0] ; row 0: pred words 0-7
+ movu m1, [t0 + 16] ; row 0: pred words 8-15
+ movu m2, [t1] ; row 0: residual words 0-7
+ movu m3, [t1 + 16] ; row 0: residual words 8-15
+ paddw m0, m2 ; pred + residual
+ paddw m1, m3
+ CLIPW m0, m4, m5 ; CLIPW is defined elsewhere; presumably clamps each word to [m4, m5] - confirm against the macro
+ CLIPW m1, m4, m5
+
+ ; row 0: store the clipped result to recon[] and reconipred[]
+ movu [t2], m0
+ movu [t2 + 16], m1
+ movu [t4], m0
+ movu [t4 + 16], m1
+%if ARCH_X86_64 == 0
+ add t4, t7 ; r7m holds the undoubled strideipred, so it is added twice
+ add t4, t7 ; to advance reconipred by one row of 2-byte elements
+%endif
+
+ ; row 0: store the same result to reconqt[], then step reconqt to the next row
+ movu [t3], m0
+ movu [t3 + 16], m1
+ add t3, t6
+
+ movu m0, [t0 + t5] ; row 1: pred words 0-7
+ movu m1, [t0 + t5 + 16] ; row 1: pred words 8-15
+ movu m2, [t1 + t5] ; row 1: residual words 0-7
+ movu m3, [t1 + t5 + 16] ; row 1: residual words 8-15
+ paddw m0, m2
+ paddw m1, m3
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+
+ ; row 1: store the clipped result to recon[] and reconipred[]
+ movu [t2 + t5], m0
+ movu [t2 + t5 + 16], m1
+%if ARCH_X86_64 == 0
+ movu [t4], m0 ; t4 already points at row 1 on x86-32
+ movu [t4 + 16], m1
+ add t4, t7 ; two adds of the undoubled stride = one row
+ add t4, t7
+%else
+ movu [t4 + t7], m0 ; x86-64: row 1 is addressed as t4 + byte stride
+ movu [t4 + t7 + 16], m1
+ lea t4, [t4 + t7 * 2] ; reconipred += 2 rows
+%endif
+
+ ; row 1: store the same result to reconqt[], then step reconqt to the next row
+ movu [t3], m0
+ movu [t3 + 16], m1
+ add t3, t6
+
+ lea t0, [t0 + t5 * 2] ; pred += 2 rows
+ lea t1, [t1 + t5 * 2] ; residual += 2 rows
+ lea t2, [t2 + t5 * 2] ; recon += 2 rows
+
+ dec t8d ; one row pair done
+ jnz .loop
+%else ;HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal calcRecons16
%if ARCH_X86_64 == 1
@@ -377,9 +468,143 @@
dec t8d
jnz .loop
+%endif ;HIGH_BIT_DEPTH
RET
-
+%if HIGH_BIT_DEPTH ; SSE2 variant, assembled only when HIGH_BIT_DEPTH is set (the SSE4 variant is in the %else branch)
+INIT_XMM sse2
+cglobal calcRecons32 ; 32x32 block: clip(pred + residual) per word, written to recon, reconqt and reconipred
+%if ARCH_X86_64 == 1
+ DECLARE_REG_TMP 0,1,2,3,4,5,6,7,8 ; x86-64: t0-t8 are all register aliases
+ PROLOGUE 6,9,6 ; x86inc PROLOGUE: 6 args loaded into registers, 9 GPRs, 6 XMM registers (m0-m5)
+%else
+ DECLARE_REG_TMP 0,1,2,3,4,5 ; x86-32: only t0-t5 are register aliases
+ PROLOGUE 6,7,6
+ %define t6 r6m ; t6 (strideqt) is read through r6m rather than kept in a register
+ %define t6d r6d ; r6 is first used as scratch to double strideqt ...
+ %define t7 r7m ; t7 (strideipred) is read through r7m and is never doubled on this path
+ %define t8d r6d ; ... and afterwards as the loop counter (same register as t6d)
+%endif
+; t0=pred t1=residual t2=recon t3=reconqt t4=reconipred t5=stride t6=strideqt t7=strideipred (argument order of the prototype in pixel-util.h)
+ mov t6d, r6m ; fetch strideqt
+%if ARCH_X86_64 == 0
+ add t6d, t6d ; strideqt *= 2: reconqt elements are int16_t, so this is the row pitch in bytes
+ mov r6m, t6d ; write the doubled value back to r6m, because r6 is overwritten by the loop counter below
+%else
+ mov r5d, r5m ; 32-bit move of stride; writing r5d clears the upper half of the 64-bit register
+ mov r7d, r7m ; fetch strideipred into t7
+ add t6d, t6d ; strideqt *= 2 (elements -> bytes)
+ add t7, t7 ; strideipred *= 2 (elements -> bytes)
+%endif
+
+ pxor m4, m4 ; m4 = 0: lower clip bound
+ mova m5, [pw_pixel_max] ; m5 = upper clip bound; pw_pixel_max is defined outside this patch (presumably the max pixel value in every word)
+ add t5, t5 ; stride *= 2 (elements -> bytes); shared by pred, residual and recon
+ mov t8d, 32/2 ; 32 rows, two rows handled per iteration
+.loop:
+
+ movu m0, [t0] ; row 0, first half: pred words 0-7
+ movu m1, [t0 + 16] ; pred words 8-15
+ movu m2, [t1] ; residual words 0-7
+ movu m3, [t1 + 16] ; residual words 8-15
+ paddw m0, m2 ; pred + residual
+ paddw m1, m3
+ CLIPW m0, m4, m5 ; CLIPW is defined elsewhere; presumably clamps each word to [m4, m5] - confirm against the macro
+ CLIPW m1, m4, m5
+
+ ; row 0, words 0-15: store to recon[] and reconipred[]
+ movu [t2], m0
+ movu [t2 + 16], m1
+ movu [t4], m0
+ movu [t4 + 16], m1
+
+ ; row 0, words 0-15: store to reconqt[] (pointer is advanced after the second half)
+ movu [t3], m0
+ movu [t3 + 16], m1
+
+ movu m0, [t0 + 32] ; row 0, second half: pred words 16-23
+ movu m1, [t0 + 48] ; pred words 24-31
+ movu m2, [t1 + 32] ; residual words 16-23
+ movu m3, [t1 + 48] ; residual words 24-31
+ paddw m0, m2
+ paddw m1, m3
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+
+ ; row 0, words 16-31: store to recon[] and reconipred[]
+ movu [t2 + 32], m0
+ movu [t2 + 48], m1
+ movu [t4 + 32], m0
+ movu [t4 + 48], m1
+%if ARCH_X86_64 == 0
+ add t4, t7 ; r7m holds the undoubled strideipred, so it is added twice
+ add t4, t7 ; to advance reconipred by one row of 2-byte elements
+%endif
+
+ ; row 0, words 16-31: store to reconqt[], then step reconqt to the next row
+ movu [t3 + 32], m0
+ movu [t3 + 48], m1
+ add t3, t6
+
+ movu m0, [t0 + t5] ; row 1, first half: pred words 0-7
+ movu m1, [t0 + t5 + 16] ; pred words 8-15
+ movu m2, [t1 + t5] ; residual words 0-7
+ movu m3, [t1 + t5 + 16] ; residual words 8-15
+ paddw m0, m2
+ paddw m1, m3
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+
+ ; row 1, words 0-15: store to recon[] and reconipred[]
+ movu [t2 + t5], m0
+ movu [t2 + t5 + 16], m1
+%if ARCH_X86_64 == 0
+ movu [t4], m0 ; t4 already points at row 1 on x86-32
+ movu [t4 + 16], m1
+%else
+ movu [t4 + t7], m0 ; x86-64: row 1 is addressed as t4 + byte stride
+ movu [t4 + t7 + 16], m1
+%endif
+
+ ; row 1, words 0-15: store to reconqt[] (pointer is advanced after the second half)
+ movu [t3], m0
+ movu [t3 + 16], m1
+
+ movu m0, [t0 + t5 + 32] ; row 1, second half: pred words 16-23
+ movu m1, [t0 + t5 + 48] ; pred words 24-31
+ movu m2, [t1 + t5 + 32] ; residual words 16-23
+ movu m3, [t1 + t5 + 48] ; residual words 24-31
+ paddw m0, m2
+ paddw m1, m3
+ CLIPW m0, m4, m5
+ CLIPW m1, m4, m5
+
+ ; row 1, words 16-31: store to recon[] and reconipred[]
+ movu [t2 + t5 + 32], m0
+ movu [t2 + t5 + 48], m1
+%if ARCH_X86_64 == 0
+ movu [t4 + 32], m0
+ movu [t4 + 48], m1
+ add t4, t7 ; two adds of the undoubled stride = one row
+ add t4, t7
+%else
+ movu [t4 + t7 + 32], m0
+ movu [t4 + t7 + 48], m1
+ lea t4, [t4 + t7 * 2] ; reconipred += 2 rows
+%endif
+
+ ; row 1, words 16-31: store to reconqt[], then step reconqt to the next row
+ movu [t3 + 32], m0
+ movu [t3 + 48], m1
+ add t3, t6
+
+ lea t0, [t0 + t5 * 2] ; pred += 2 rows
+ lea t1, [t1 + t5 * 2] ; residual += 2 rows
+ lea t2, [t2 + t5 * 2] ; recon += 2 rows
+
+ dec t8d ; one row pair done
+ jnz .loop
+%else ;HIGH_BIT_DEPTH
INIT_XMM sse4
cglobal calcRecons32
%if ARCH_X86_64 == 1
@@ -446,6 +671,7 @@
dec t8d
jnz .loop
+%endif ;HIGH_BIT_DEPTH
RET
More information about the x265-devel
mailing list