[x265] [PATCH] asm: 10bpp code for calcresidual_16x16 and 32x32
Murugan Vairavel
murugan at multicorewareinc.com
Tue Dec 10 14:40:44 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386682816 -19800
# Tue Dec 10 19:10:16 2013 +0530
# Node ID 98cbf29964c94fdc10319c7919b78a2adba6cf61
# Parent 1e14d4cc6f85b76a14713db5ef6526e71d5016c4
asm: 10bpp code for calcresidual_16x16 and 32x32
diff -r 1e14d4cc6f85 -r 98cbf29964c9 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 10 18:21:47 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 19:10:16 2013 +0530
@@ -669,6 +669,8 @@
p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
+ p.calcresidual[BLOCK_16x16] = x265_getResidual16_sse2;
+ p.calcresidual[BLOCK_32x32] = x265_getResidual32_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 1e14d4cc6f85 -r 98cbf29964c9 source/common/x86/pixel-util.h
--- a/source/common/x86/pixel-util.h Tue Dec 10 18:21:47 2013 +0530
+++ b/source/common/x86/pixel-util.h Tue Dec 10 19:10:16 2013 +0530
@@ -31,7 +31,9 @@
void x265_getResidual4_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
void x265_getResidual8_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
+void x265_getResidual16_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
void x265_getResidual16_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
+void x265_getResidual32_sse2(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
void x265_getResidual32_sse4(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride);
void x265_transpose4_sse2(pixel *dest, pixel *src, intptr_t stride);
diff -r 1e14d4cc6f85 -r 98cbf29964c9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Dec 10 18:21:47 2013 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Dec 10 19:10:16 2013 +0530
@@ -434,6 +434,61 @@
%endif
RET
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal getResidual16, 4,5,6 ; void getResidual16(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) -- 10bpp path; r0=fenc, r1=pred, r2=residual, r3=stride (in elements)
+ add r3, r3 ; element stride -> byte stride (pixels are 16-bit here; int16_t residual has the same element size, so one stride serves all three buffers)
+ mov r4d, 16/4 ; loop counter: 4 iterations, 4 rows each = 16 rows
+.loop:
+ ; rows 0-1 of this iteration (each 16-pixel row = 32 bytes = two xmm loads)
+ movu m0, [r0] ; fenc row0, pixels 0-7
+ movu m1, [r0 + 16] ; fenc row0, pixels 8-15
+ movu m2, [r0 + r3] ; fenc row1, pixels 0-7
+ movu m3, [r0 + r3 + 16] ; fenc row1, pixels 8-15
+ movu m4, [r1] ; pred row0, pixels 0-7
+ movu m5, [r1 + 16] ; pred row0, pixels 8-15
+ psubw m0, m4 ; row0: residual = fenc - pred (packed 16-bit subtract)
+ psubw m1, m5
+ movu m4, [r1 + r3] ; pred row1, pixels 0-7
+ movu m5, [r1 + r3 + 16] ; pred row1, pixels 8-15
+ psubw m2, m4 ; row1: residual = fenc - pred
+ psubw m3, m5
+ lea r0, [r0 + r3 * 2] ; advance fenc by 2 rows
+ lea r1, [r1 + r3 * 2] ; advance pred by 2 rows
+
+ movu [r2], m0 ; store residual rows 0-1
+ movu [r2 + 16], m1
+ movu [r2 + r3], m2
+ movu [r2 + r3 + 16], m3
+ lea r2, [r2 + r3 * 2] ; advance residual by 2 rows
+
+ ; rows 2-3 (identical pattern to rows 0-1)
+ movu m0, [r0]
+ movu m1, [r0 + 16]
+ movu m2, [r0 + r3]
+ movu m3, [r0 + r3 + 16]
+ movu m4, [r1]
+ movu m5, [r1 + 16]
+ psubw m0, m4
+ psubw m1, m5
+ movu m4, [r1 + r3]
+ movu m5, [r1 + r3 + 16]
+ psubw m2, m4
+ psubw m3, m5
+
+ movu [r2], m0
+ movu [r2 + 16], m1
+ movu [r2 + r3], m2
+ movu [r2 + r3 + 16], m3
+
+ dec r4d
+
+ lea r0, [r0 + r3 * 2]
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 2]
+
+ jnz .loop ; fall through to the RET shared with the 8bpp (sse4) path after %endif
+%else
INIT_XMM sse4
cglobal getResidual16, 4,5,8
@@ -497,9 +552,63 @@
lea r2, [r2 + r3 * 4]
jnz .loop
+%endif
+
RET
-
+%if HIGH_BIT_DEPTH
+INIT_XMM sse2
+cglobal getResidual32, 4,5,6 ; void getResidual32(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride) -- 10bpp path; r0=fenc, r1=pred, r2=residual, r3=stride (in elements)
+ add r3, r3 ; element stride -> byte stride (16-bit pixels; int16_t residual has the same element size)
+ mov r4d, 32/2 ; loop counter: 16 iterations, 2 rows each = 32 rows
+.loop:
+ ; row 0 of this iteration (one 32-pixel row = 64 bytes = four xmm loads)
+ movu m0, [r0] ; fenc pixels 0-7
+ movu m1, [r0 + 16] ; fenc pixels 8-15
+ movu m2, [r0 + 32] ; fenc pixels 16-23
+ movu m3, [r0 + 48] ; fenc pixels 24-31
+ movu m4, [r1] ; pred pixels 0-7
+ movu m5, [r1 + 16] ; pred pixels 8-15
+ psubw m0, m4 ; residual = fenc - pred (packed 16-bit subtract)
+ psubw m1, m5
+ movu m4, [r1 + 32] ; pred pixels 16-23
+ movu m5, [r1 + 48] ; pred pixels 24-31
+ psubw m2, m4
+ psubw m3, m5
+
+ movu [r2], m0 ; store residual row 0
+ movu [r2 + 16], m1
+ movu [r2 + 32], m2
+ movu [r2 + 48], m3
+
+ ; row 1 (same pattern, one stride down)
+ movu m0, [r0 + r3]
+ movu m1, [r0 + r3 + 16]
+ movu m2, [r0 + r3 + 32]
+ movu m3, [r0 + r3 + 48]
+ movu m4, [r1 + r3]
+ movu m5, [r1 + r3 + 16]
+ psubw m0, m4
+ psubw m1, m5
+ movu m4, [r1 + r3 + 32]
+ movu m5, [r1 + r3 + 48]
+ psubw m2, m4
+ psubw m3, m5
+
+ movu [r2 + r3], m0 ; store residual row 1
+ movu [r2 + r3 + 16], m1
+ movu [r2 + r3 + 32], m2
+ movu [r2 + r3 + 48], m3
+
+ dec r4d
+
+ lea r0, [r0 + r3 * 2] ; advance all three pointers by 2 rows
+ lea r1, [r1 + r3 * 2]
+ lea r2, [r2 + r3 * 2]
+
+ jnz .loop ; fall through to the RET shared with the 8bpp (sse4) path after %endif
+
+%else
INIT_XMM sse4
cglobal getResidual32, 4,5,7
mov r4d, 32/2
@@ -556,6 +665,7 @@
lea r2, [r2 + r3 * 4]
jnz .loop
+%endif
RET
More information about the x265-devel
mailing list