[x265] [PATCH] asm: 10bpp code for blockcopy_pp_2xN and 4xN
murugan at multicorewareinc.com
Mon Dec 9 12:52:20 CET 2013
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1386589680 -19800
# Mon Dec 09 17:18:00 2013 +0530
# Node ID 72495934c306f0c42297bd3f01778a8da85346a5
# Parent 76b0c50cd719281494a835907fd737b01310fe50
asm: 10bpp code for blockcopy_pp_2xN and 4xN
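These primitives are plain strided copies. At 10bpp (HIGH_BIT_DEPTH) a pixel
is stored in a 16-bit word, so a row of W pixels is 2*W bytes and the pixel
strides passed in r1/r3 must be doubled to byte strides; that is what the
"add r1, r1" / "add r3, r3" at the top of each HIGH_BIT_DEPTH path does, and
the loads widen accordingly (word to dword for the 2xN kernels, dword to
qword for the 4xN kernels). For reference, a minimal C++ sketch of the
intended behaviour (blockcopy_pp_ref is a hypothetical name, not the actual
C fallback in the primitives table):

    #include <cstdint>
    #include <cstring>

    #if HIGH_BIT_DEPTH
    typedef uint16_t pixel;   // 10bpp samples occupy 16-bit words
    #else
    typedef uint8_t pixel;
    #endif

    // W x H block copy; strides are given in pixels, as in the asm interface
    template<int W, int H>
    void blockcopy_pp_ref(pixel* dest, intptr_t deststride,
                          const pixel* src, intptr_t srcstride)
    {
        for (int y = 0; y < H; y++)
            std::memcpy(dest + y * deststride, src + y * srcstride,
                        W * sizeof(pixel));   // 2 * W bytes when HIGH_BIT_DEPTH
    }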
diff -r 76b0c50cd719 -r 72495934c306 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Dec 09 15:54:31 2013 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Dec 09 17:18:00 2013 +0530
@@ -663,6 +663,11 @@
CHROMA_PIXELSUB_PS(_sse2);
LUMA_PIXELSUB(_sse2);
+
+ p.chroma[X265_CSP_I420].copy_pp[LUMA_4x8] = x265_blockcopy_pp_2x4_sse2;
+ p.chroma[X265_CSP_I420].copy_pp[LUMA_4x16] = x265_blockcopy_pp_2x8_sse2;
+ p.chroma[X265_CSP_I420].copy_pp[LUMA_8x4] = x265_blockcopy_pp_4x2_sse2;
+ p.chroma[X265_CSP_I420].copy_pp[LUMA_8x8] = x265_blockcopy_pp_4x4_sse2;
}
if (cpuMask & X265_CPU_SSSE3)
{
diff -r 76b0c50cd719 -r 72495934c306 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Mon Dec 09 15:54:31 2013 +0530
+++ b/source/common/x86/blockcopy8.asm Mon Dec 09 17:18:00 2013 +0530
@@ -36,19 +36,33 @@
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x4, 4, 7, 0, dest, deststride, src, srcstride
-
-mov r4w, [r2]
-mov r5w, [r2 + r3]
-mov r6w, [r2 + 2 * r3]
-lea r3, [r3 + r3 * 2]
-mov r3w, [r2 + r3]
-
-mov [r0], r4w
-mov [r0 + r1], r5w
-mov [r0 + 2 * r1], r6w
-lea r1, [r1 + 2 * r1]
-mov [r0 + r1], r3w
-
+%if HIGH_BIT_DEPTH
+ add r1, r1
+ add r3, r3
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ mov r6d, [r2]
+ mov r3d, [r2 + r3]
+
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+ lea r0, [r0 + 2 * r1]
+ mov [r0], r6d
+ mov [r0 + r1], r3d
+%else
+ mov r4w, [r2]
+ mov r5w, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ mov r6w, [r2]
+ mov r3w, [r2 + r3]
+
+ mov [r0], r4w
+ mov [r0 + r1], r5w
+ lea r0, [r0 + 2 * r1]
+ mov [r0], r6w
+ mov [r0 + r1], r3w
+%endif
RET
;-----------------------------------------------------------------------------
@@ -56,154 +70,204 @@
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_2x8, 4, 7, 0, dest, deststride, src, srcstride
-
-mov r4w, [r2]
-mov r5w, [r2 + r3]
-mov r6w, [r2 + 2 * r3]
-
-mov [r0], r4w
-mov [r0 + r1], r5w
-mov [r0 + 2 * r1], r6w
-
-lea r0, [r0 + 2 * r1]
-lea r2, [r2 + 2 * r3]
-
-mov r4w, [r2 + r3]
-mov r5w, [r2 + 2 * r3]
-
-mov [r0 + r1], r4w
-mov [r0 + 2 * r1], r5w
-
-lea r0, [r0 + 2 * r1]
-lea r2, [r2 + 2 * r3]
-
-mov r4w, [r2 + r3]
-mov r5w, [r2 + 2 * r3]
-
-mov [r0 + r1], r4w
-mov [r0 + 2 * r1], r5w
-
-lea r0, [r0 + 2 * r1]
-lea r2, [r2 + 2 * r3]
-
-mov r4w, [r2 + r3]
-mov [r0 + r1], r4w
-RET
+%if HIGH_BIT_DEPTH
+ add r1, r1
+ add r3, r3
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ mov r6d, [r2]
+
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+ lea r0, [r0 + 2 * r1]
+ mov [r0], r6d
+ mov r4d, [r2 + r3]
+ mov [r0 + r1], r4d
+
+ lea r2, [r2 + r3 * 2]
+ lea r0, [r0 + 2 * r1]
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ mov r6d, [r2]
+ mov r3d, [r2 + r3]
+
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+ lea r0, [r0 + 2 * r1]
+ mov [r0], r6d
+ mov [r0 + r1], r3d
+%else
+ mov r4w, [r2]
+ mov r5w, [r2 + r3]
+ mov r6w, [r2 + 2 * r3]
+
+ mov [r0], r4w
+ mov [r0 + r1], r5w
+ mov [r0 + 2 * r1], r6w
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ mov r4w, [r2 + r3]
+ mov r5w, [r2 + 2 * r3]
+
+ mov [r0 + r1], r4w
+ mov [r0 + 2 * r1], r5w
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ mov r4w, [r2 + r3]
+ mov r5w, [r2 + 2 * r3]
+
+ mov [r0 + r1], r4w
+ mov [r0 + 2 * r1], r5w
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ mov r4w, [r2 + r3]
+ mov [r0 + r1], r4w
+%endif
+ RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_4x2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
-cglobal blockcopy_pp_4x2, 4, 6, 2, dest, deststride, src, srcstride
-
-mov r4d, [r2]
-mov r5d, [r2 + r3]
-
-mov [r0], r4d
-mov [r0 + r1], r5d
-
-RET
+cglobal blockcopy_pp_4x2, 4, 6, 2, dest, deststride, src, srcstride
+%if HIGH_BIT_DEPTH
+ add r1, r1
+ add r3, r3
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ movh [r0], m0
+ movh [r0 + r1], m1
+%else
+ mov r4d, [r2]
+ mov r5d, [r2 + r3]
+
+ mov [r0], r4d
+ mov [r0 + r1], r5d
+%endif
+ RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_4x4(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal blockcopy_pp_4x4, 4, 4, 4, dest, deststride, src, srcstride
-
-movd m0, [r2]
-movd m1, [r2 + r3]
-movd m2, [r2 + 2 * r3]
-lea r3, [r3 + r3 * 2]
-movd m3, [r2 + r3]
-
-movd [r0], m0
-movd [r0 + r1], m1
-movd [r0 + 2 * r1], m2
-lea r1, [r1 + 2 * r1]
-movd [r0 + r1], m3
-
-RET
-
-;-----------------------------------------------------------------------------
-; void blockcopy_pp_4x8(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
-;-----------------------------------------------------------------------------
-INIT_XMM sse2
-cglobal blockcopy_pp_4x8, 4, 6, 8, dest, deststride, src, srcstride
-
-movd m0, [r2]
-movd m1, [r2 + r3]
-movd m2, [r2 + 2 * r3]
-lea r4, [r2 + 2 * r3]
-movd m3, [r4 + r3]
-
-movd m4, [r4 + 2 * r3]
-lea r4, [r4 + 2 * r3]
-movd m5, [r4 + r3]
-movd m6, [r4 + 2 * r3]
-lea r4, [r4 + 2 * r3]
-movd m7, [r4 + r3]
-
-movd [r0], m0
-movd [r0 + r1], m1
-movd [r0 + 2 * r1], m2
-lea r5, [r0 + 2 * r1]
-movd [r5 + r1], m3
-
-movd [r5 + 2 * r1], m4
-lea r5, [r5 + 2 * r1]
-movd [r5 + r1], m5
-movd [r5 + 2 * r1], m6
-lea r5, [r5 + 2 * r1]
-movd [r5 + r1], m7
-
-RET
+%if HIGH_BIT_DEPTH
+ add r1, r1
+ add r3, r3
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movh m2, [r2]
+ movh m3, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movh [r0], m2
+ movh [r0 + r1], m3
+%else
+ movd m0, [r2]
+ movd m1, [r2 + r3]
+ movd m2, [r2 + 2 * r3]
+ lea r3, [r3 + r3 * 2]
+ movd m3, [r2 + r3]
+
+ movd [r0], m0
+ movd [r0 + r1], m1
+ movd [r0 + 2 * r1], m2
+ lea r1, [r1 + 2 * r1]
+ movd [r0 + r1], m3
+%endif
+ RET
;-----------------------------------------------------------------------------
; void blockcopy_pp_%1x%2(pixel *dest, intptr_t deststride, pixel *src, intptr_t srcstride)
;-----------------------------------------------------------------------------
%macro BLOCKCOPY_PP_W4_H8 2
INIT_XMM sse2
-cglobal blockcopy_pp_%1x%2, 4, 7, 8, dest, deststride, src, srcstride
-
-
-mov r4d, %2
-
+cglobal blockcopy_pp_%1x%2, 4, 5, 4, dest, deststride, src, srcstride
+ mov r4d, %2/8
+
+%if HIGH_BIT_DEPTH
+ add r1, r1
+ add r3, r3
.loop
- movd m0, [r2]
- movd m1, [r2 + r3]
- movd m2, [r2 + 2 * r3]
- lea r5, [r2 + 2 * r3]
- movd m3, [r5 + r3]
-
- movd m4, [r5 + 2 * r3]
- lea r5, [r5 + 2 * r3]
- movd m5, [r5 + r3]
- movd m6, [r5 + 2 * r3]
- lea r5, [r5 + 2 * r3]
- movd m7, [r5 + r3]
-
- movd [r0], m0
- movd [r0 + r1], m1
- movd [r0 + 2 * r1], m2
- lea r6, [r0 + 2 * r1]
- movd [r6 + r1], m3
-
- movd [r6 + 2 * r1], m4
- lea r6, [r6 + 2 * r1]
- movd [r6 + r1], m5
- movd [r6 + 2 * r1], m6
- lea r6, [r6 + 2 * r1]
- movd [r6 + r1], m7
-
- lea r0, [r0 + 8 * r1]
- lea r2, [r2 + 8 * r3]
-
- sub r4d, 8
- jnz .loop
-
-RET
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movh m2, [r2]
+ movh m3, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movh [r0], m2
+ movh [r0 + r1], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ movh m0, [r2]
+ movh m1, [r2 + r3]
+ lea r2, [r2 + r3 * 2]
+ movh m2, [r2]
+ movh m3, [r2 + r3]
+
+ movh [r0], m0
+ movh [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movh [r0], m2
+ movh [r0 + r1], m3
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+%else
+.loop
+ movd m0, [r2]
+ movd m1, [r2 + r3]
+ lea r2, [r2 + 2 * r3]
+ movd m2, [r2]
+ movd m3, [r2 + r3]
+
+ movd [r0], m0
+ movd [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movd [r0], m2
+ movd [r0 + r1], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+ movd m0, [r2]
+ movd m1, [r2 + r3]
+ lea r2, [r2 + 2 * r3]
+ movd m2, [r2]
+ movd m3, [r2 + r3]
+
+ movd [r0], m0
+ movd [r0 + r1], m1
+ lea r0, [r0 + 2 * r1]
+ movd [r0], m2
+ movd [r0 + r1], m3
+
+ lea r0, [r0 + 2 * r1]
+ lea r2, [r2 + 2 * r3]
+
+ dec r4d
+ jnz .loop
+%endif
+ RET
%endmacro
+BLOCKCOPY_PP_W4_H8 4, 8
BLOCKCOPY_PP_W4_H8 4, 16
;-----------------------------------------------------------------------------