[x265] Fwd: [PATCH] blockcopy_pp_12x32: SSE2 asm code optimization
Praveen Tiwari
praveen at multicorewareinc.com
Fri Feb 6 10:23:23 CET 2015
---------- Forwarded message ----------
From: chen <chenm003 at 163.com>
Date: Thu, Feb 5, 2015 at 5:55 PM
Subject: Re: [x265] [PATCH] blockcopy_pp_12x32: SSE2 asm code optimization
To: Development for x265 <x265-devel at videolan.org>
>>this code is right
>>but could you try use general register move (rN, rNd) in x64 mode?
I applied your idea of using general-purpose registers as the buffer in x64
mode to 4x8 (easy to test with), but surprisingly, using SIMD registers is
faster. Here are the code and the performance numbers:
copy_pp[ 4x8]  2.67x  139.98  374.18  [using general register move (rN, rNd)]
copy_pp[ 4x8]  3.34x  109.60  366.35  [SIMD registers as buffer]
Code [using general register move (rN, rNd)]:
;-----------------------------------------------------------------------------
; void blockcopy_pp_4x8(pixel* dst, intptr_t dstStride, const pixel* src, intptr_t srcStride)
;-----------------------------------------------------------------------------
; Experimental variant that stages the data in 32-bit general-purpose
; registers: copies 8 rows of 4 bytes each from src to dst.
;
; Register roles:
;   r0 = dst            r1 = dstStride      r4 = 3 * dstStride
;   r2 = src            r3 = srcStride      r5 = 3 * srcStride
;   r6d..r9d = the four rows currently in flight
;
; r7-r9 only exist in 64-bit builds, so the whole function is guarded by
; ARCH_X86_64; a 32-bit build cannot assemble it.
; NOTE(review): asking cglobal for 10 GPRs presumably makes the x86inc
; prologue/epilogue save and restore callee-saved registers, which may be part
; of the overhead measured against the SIMD variant - confirm.
%if ARCH_X86_64
INIT_XMM sse2
cglobal blockcopy_pp_4x8, 4, 10, 0
    lea     r4, [3 * r1]                ; r4 = 3 * dstStride
    lea     r5, [3 * r3]                ; r5 = 3 * srcStride

    ; rows 0-3: load all four rows, then store them
    mov     r6d, [r2]
    mov     r7d, [r2 + r3]
    mov     r8d, [r2 + 2 * r3]
    mov     r9d, [r2 + r5]

    mov     [r0], r6d
    mov     [r0 + r1], r7d
    mov     [r0 + 2 * r1], r8d
    mov     [r0 + r4], r9d

    ; rows 4-7: advance src by four rows and load
    lea     r2, [r2 + 4 * r3]
    mov     r6d, [r2]
    mov     r7d, [r2 + r3]
    mov     r8d, [r2 + 2 * r3]
    mov     r9d, [r2 + r5]

    ; advance dst by four rows and store
    lea     r0, [r0 + 4 * r1]
    mov     [r0], r6d
    mov     [r0 + r1], r7d
    mov     [r0 + 2 * r1], r8d
    mov     [r0 + r4], r9d
    RET
%endif ; ARCH_X86_64
Code [SIMD registers as buffer]:
INIT_XMM sse2
;-----------------------------------------------------------------------------
; blockcopy_pp_4x8, SIMD-register variant.
;
; Copies 8 rows of 4 bytes each, staging four rows at a time in m0-m3 via movd.
;
;   r0 = destination pointer        r1 = destination stride
;   r2 = source pointer             r3 = source stride
;   r4 = 3 * r1                     r5 = 3 * r3
;-----------------------------------------------------------------------------
cglobal blockcopy_pp_4x8, 4, 6, 4
        lea         r4, [r1 + 2 * r1]           ; offset of row 3 in dst
        lea         r5, [r3 + 2 * r3]           ; offset of row 3 in src

        ; ---- first half: rows 0..3 ----
        movd        m0, [r2]
        movd        m1, [r2 + r3]
        movd        m2, [r2 + r3 * 2]
        movd        m3, [r2 + r5]

        movd        [r0], m0
        movd        [r0 + r1], m1
        movd        [r0 + r1 * 2], m2
        movd        [r0 + r4], m3

        ; ---- second half: rows 4..7 ----
        lea         r2, [r2 + r3 * 4]           ; src += 4 rows
        movd        m0, [r2]
        movd        m1, [r2 + r3]
        movd        m2, [r2 + r3 * 2]
        movd        m3, [r2 + r5]

        lea         r0, [r0 + r1 * 4]           ; dst += 4 rows
        movd        [r0], m0
        movd        [r0 + r1], m1
        movd        [r0 + r1 * 2], m2
        movd        [r0 + r4], m3
        RET
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150206/8cae167f/attachment-0001.html>
More information about the x265-devel
mailing list