[x265] [PATCH] asm: 10bpp code for calcresidual_4x4 and 8x8
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Tue Dec 10 13:52:14 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386679907 -19800
# Tue Dec 10 18:21:47 2013 +0530
# Node ID 1e14d4cc6f85b76a14713db5ef6526e71d5016c4
# Parent 682981f97057b0e66cc9fca638a9eb81938b3444
asm: 10bpp code for calcresidual_4x4 and 8x8
diff -r 682981f97057 -r 1e14d4cc6f85 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 10 16:46:51 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Tue Dec 10 18:21:47 2013 +0530
@@ -666,6 +666,9 @@
CHROMA_BLOCKCOPY(_sse2);
LUMA_BLOCKCOPY(_sse2);
+
+ p.calcresidual[BLOCK_4x4] = x265_getResidual4_sse2;
+ p.calcresidual[BLOCK_8x8] = x265_getResidual8_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 682981f97057 -r 1e14d4cc6f85 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Tue Dec 10 16:46:51 2013 +0530
+++ b/source/common/x86/pixel-util8.asm Tue Dec 10 18:21:47 2013 +0530
@@ -316,6 +316,37 @@
; void getResidual(pixel *fenc, pixel *pred, int16_t *residual, intptr_t stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal getResidual4, 4,4,4
+    add         r3, r3                  ; double the stride (presumably element stride -> byte stride for 16-bit samples)
+    ; r0 = fenc, r1 = pred, r2 = residual, r3 = stride (argument order of the prototype above)
+    ; rows 0-1: each 64-bit load is one row of four words; two rows share one XMM register
+    movh        m1, [r1]
+    movh        m3, [r1 + r3]
+    movh        m0, [r0]
+    movh        m2, [r0 + r3]
+    punpcklqdq  m1, m3                  ; m1 = pred row 0 (low) | pred row 1 (high)
+    punpcklqdq  m0, m2                  ; m0 = fenc row 0 (low) | fenc row 1 (high)
+    psubw       m0, m1                  ; word-wise fenc - pred
+
+    movlps      [r2], m0                ; low half  -> residual row 0
+    movhps      [r2 + r3], m0           ; high half -> residual row 1
+    lea         r1, [r1 + r3 * 2]       ; step all three pointers down two rows
+    lea         r0, [r0 + r3 * 2]
+    lea         r2, [r2 + r3 * 2]
+
+    ; rows 2-3: same sequence; no pointer advance afterwards since nothing follows
+    movh        m1, [r1]
+    movh        m3, [r1 + r3]
+    movh        m0, [r0]
+    movh        m2, [r0 + r3]
+    punpcklqdq  m1, m3                  ; m1 = pred row 2 | pred row 3
+    punpcklqdq  m0, m2                  ; m0 = fenc row 2 | fenc row 3
+    psubw       m0, m1                  ; word-wise fenc - pred
+
+    movlps      [r2], m0                ; low half  -> residual row 2
+    movhps      [r2 + r3], m0           ; high half -> residual row 3
+%else
cglobal getResidual4, 4,4,5
pxor m0, m0
@@ -347,11 +378,34 @@
psubw m1, m3
movlps [r2], m1
movhps [r2 + r3 * 2], m1
-
+%endif
RET
INIT_XMM sse2
+%if HIGH_BIT_DEPTH
+cglobal getResidual8, 4,4,4
+    add         r3, r3                  ; double the stride (presumably element stride -> byte stride for 16-bit samples)
+    ; r0 = fenc, r1 = pred, r2 = residual, r3 = stride; only m0-m3 are used, matching the 4 XMM registers declared to cglobal
+%assign x 0
+%rep 8/2
+    ; two rows per iteration; each 16-byte load is one full row of eight words
+    movu        m0, [r0]                ; fenc, first row of the pair
+    movu        m1, [r0 + r3]           ; fenc, second row of the pair
+    movu        m2, [r1]                ; pred, first row of the pair
+    movu        m3, [r1 + r3]           ; pred, second row of the pair
+    psubw       m0, m2                  ; word-wise fenc - pred
+    psubw       m1, m3
+    movu        [r2], m0
+    movu        [r2 + r3], m1
+%assign x x+1
+%if (x != 4)                            ; skip the pointer advance after the last of the 8/2 iterations
+    lea         r0, [r0 + r3 * 2]
+    lea         r1, [r1 + r3 * 2]
+    lea         r2, [r2 + r3 * 2]
+%endif
+%endrep
+%else
cglobal getResidual8, 4,4,5
pxor m0, m0
@@ -377,6 +431,7 @@
lea r2, [r2 + r3 * 4]
%endif
%endrep
+%endif
RET
More information about the x265-devel
mailing list