[x265] [PATCH] blockcopy_pp_12x32: SSE2 asm code optimization
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Thu Feb 5 12:59:24 CET 2015
# HG changeset patch
# User Praveen Tiwari
# Date 1423137543 -19800
# Node ID b10384b8c8a9a60fe37f4e5f3506673dcf00c004
# Parent 499eddf5c1e4dfcb8447d65cb0b48d633b3660a5
blockcopy_pp_12x32: SSE2 asm code optimization
diff -r 499eddf5c1e4 -r b10384b8c8a9 source/common/x86/blockcopy8.asm
--- a/source/common/x86/blockcopy8.asm Thu Feb 05 16:48:36 2015 +0530
+++ b/source/common/x86/blockcopy8.asm Thu Feb 05 17:29:03 2015 +0530
@@ -584,7 +584,55 @@
BLOCKCOPY_PP_W12_H4 12, 16
-BLOCKCOPY_PP_W12_H4 12, 32
+;-----------------------------------------------------------------------------
+; void blockcopy_pp_12x32(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
+;-----------------------------------------------------------------------------
+INIT_XMM sse2 ; x86inc: from here on m0-m7 name XMM registers and the symbol gets an sse2 suffix
+cglobal blockcopy_pp_12x32, 4, 7, 8 ; x86inc: 4 arguments, 7 general-purpose regs (r0-r6), 8 vector regs (m0-m7)
+; Copies 32 rows of 12 bytes each: rows are read from [r2 + k*r3] and written to [r0 + k*r1]. Presumably r0=dst, r1=dstStride, r2=src, r3=srcStride, following the prototype above and x86inc argument order -- confirm. (12 bytes == 12 pixels only if pixel is one byte -- confirm for this build.)
+ lea r5, [3 * r1] ; r5 = 3 * r1: write-side offset of the 4th row in each 4-row group
+ lea r6, [3 * r3] ; r6 = 3 * r3: read-side offset of the 4th row in each 4-row group
+; Rows 0-3: load all four rows into m0-m7 before storing any of them. Each row is split into an 8-byte piece (movh, x86inc alias for the 8-byte XMM move) and a 4-byte piece at offset 8 (movd).
+ movh m0, [r2] ; row 0, bytes 0-7
+ movd m1, [r2 + 8] ; row 0, bytes 8-11
+ movh m2, [r2 + r3] ; row 1, bytes 0-7
+ movd m3, [r2 + r3 + 8] ; row 1, bytes 8-11
+ movh m4, [r2 + 2 * r3] ; row 2, bytes 0-7
+ movd m5, [r2 + 2 * r3 + 8] ; row 2, bytes 8-11
+ movh m6, [r2 + r6] ; row 3, bytes 0-7
+ movd m7, [r2 + r6 + 8] ; row 3, bytes 8-11
+; Store rows 0-3 with the same 8 + 4 byte split; only 12 bytes per row are written.
+ movh [r0], m0 ; row 0, bytes 0-7
+ movd [r0 + 8], m1 ; row 0, bytes 8-11
+ movh [r0 + r1], m2 ; row 1, bytes 0-7
+ movd [r0 + r1 + 8], m3 ; row 1, bytes 8-11
+ movh [r0 + 2 * r1], m4 ; row 2, bytes 0-7
+ movd [r0 + 2 * r1 + 8], m5 ; row 2, bytes 8-11
+ movh [r0 + r5], m6 ; row 3, bytes 0-7
+ movd [r0 + r5 + 8], m7 ; row 3, bytes 8-11
+; Rows 4-31: the same 4-row load/store group is expanded 7 more times at assembly time (4 + 7 * 4 = 32 rows), so there is no run-time loop counter or branch.
+ %rep 7
+ lea r2, [r2 + 4 * r3] ; advance the read pointer by 4 rows (lea leaves flags untouched)
+ movh m0, [r2] ; row 0 of this group, bytes 0-7
+ movd m1, [r2 + 8] ; row 0 of this group, bytes 8-11
+ movh m2, [r2 + r3] ; row 1, bytes 0-7
+ movd m3, [r2 + r3 + 8] ; row 1, bytes 8-11
+ movh m4, [r2 + 2 * r3] ; row 2, bytes 0-7
+ movd m5, [r2 + 2 * r3 + 8] ; row 2, bytes 8-11
+ movh m6, [r2 + r6] ; row 3, bytes 0-7
+ movd m7, [r2 + r6 + 8] ; row 3, bytes 8-11
+; Stores for this group follow; the write pointer is advanced only after the group's loads.
+ lea r0, [r0 + 4 * r1] ; advance the write pointer by 4 rows
+ movh [r0], m0 ; row 0 of this group, bytes 0-7
+ movd [r0 + 8], m1 ; row 0 of this group, bytes 8-11
+ movh [r0 + r1], m2 ; row 1, bytes 0-7
+ movd [r0 + r1 + 8], m3 ; row 1, bytes 8-11
+ movh [r0 + 2 * r1], m4 ; row 2, bytes 0-7
+ movd [r0 + 2 * r1 + 8], m5 ; row 2, bytes 8-11
+ movh [r0 + r5], m6 ; row 3, bytes 0-7
+ movd [r0 + r5 + 8], m7 ; row 3, bytes 8-11
+%endrep
+ RET ; x86inc RET macro: undoes whatever the cglobal prologue saved, then returns
;-----------------------------------------------------------------------------
; void blockcopy_pp_16x4(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
More information about the x265-devel
mailing list