[x265] [PATCH] asm: improve pixel_add_ps_8xN with loop unroll, [8x8] from 278c to 245c
Min Chen
chenm003 at 163.com
Tue Mar 3 01:54:09 CET 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1425344040 28800
# Node ID 64214b2faa324d91a015190b8dc69716ebab41f8
# Parent 018e8bbaa854b1a4bd82b3a2e23f7775a77da5cc
asm: improve pixel_add_ps_8xN with loop unroll, [8x8] from 278c to 245c
---
source/common/x86/pixeladd8.asm | 81 ++++++++++++++++++++-------------------
1 files changed, 41 insertions(+), 40 deletions(-)
diff -r 018e8bbaa854 -r 64214b2faa32 source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Fri Feb 27 11:46:09 2015 +0530
+++ b/source/common/x86/pixeladd8.asm Mon Mar 02 16:54:00 2015 -0800
@@ -188,47 +188,48 @@
%macro PIXEL_ADD_PS_W8_H4 2
%if HIGH_BIT_DEPTH
INIT_XMM sse2
-cglobal pixel_add_ps_8x%2, 6, 7, 6, dest, destride, src0, scr1, srcStride0, srcStride1
- mova m5, [pw_pixel_max]
- pxor m4, m4
- mov r6d, %2/4
- add r4, r4
- add r5, r5
- add r1, r1
-.loop:
- movu m0, [r2]
- movu m2, [r2 + r4]
- movu m1, [r3]
- movu m3, [r3 + r5]
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
+cglobal pixel_add_ps_8x%2, 6,6,6
+ FIX_STRIDES r4, r5, r1 ; NOTE: instruction-decode performance is affected by register order
+ pxor m0, m0
+ mova m1, [pw_pixel_max]
- paddw m0, m1
- paddw m2, m3
- CLIPW2 m0, m2, m4, m5
+%assign x 0
+%rep %2/4
+ movu m2, [r2] ; row 0 of src0
+ movu m3, [r2 + r4] ; row 1 of src0
+ movu m4, [r3] ; row 0 of src1
+ movu m5, [r3 + r5] ; row 1 of src1
+ paddw m2, m4
+ paddw m3, m5
+ CLIPW m2, m0, m1
+ CLIPW m3, m0, m1
+ movu [r0], m2 ; row 0 of dst
+ movu [r0 + r1], m3 ; row 1 of dst
- movu [r0], m0
- movu [r0 + r1], m2
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ lea r0, [r0 + r1 * 2]
+ movu m2, [r2] ; row 2 of src0
+ movu m3, [r2 + r4] ; row 3 of src0
+ movu m4, [r3] ; row 2 of src1
+ movu m5, [r3 + r5] ; row 3 of src1
+ paddw m2, m4
+ paddw m3, m5
+ CLIPW m2, m0, m1
+ CLIPW m3, m0, m1
+ movu [r0], m2 ; row 2 of dst
+ movu [r0 + r1], m3 ; row 3 of dst
- movu m0, [r2]
- movu m2, [r2 + r4]
- movu m1, [r3]
- movu m3, [r3 + r5]
- dec r6d
- lea r0, [r0 + r1 * 2]
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
+ ; skip the pointer advance on the final unroll pass (values are dead afterwards)
+ %if x != (%2/4)-1
+ lea r2, [r2 + r4 * 2]
+ lea r3, [r3 + r5 * 2]
+ lea r0, [r0 + r1 * 2]
+ %endif
+%assign x x+1
+%endrep
+ RET
- paddw m0, m1
- paddw m2, m3
- CLIPW2 m0, m2, m4, m5
-
- movu [r0], m0
- movu [r0 + r1], m2
- lea r0, [r0 + r1 * 2]
-
- jnz .loop
- RET
%else
INIT_XMM sse4
cglobal pixel_add_ps_8x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
More information about the x265-devel
mailing list