[x265] [PATCH] asm: improve avx2 code for add_ps[32x32] (1428 -> 1312)
sumalatha at multicorewareinc.com
sumalatha at multicorewareinc.com
Tue Apr 7 11:09:08 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1428395385 -19800
# Tue Apr 07 13:59:45 2015 +0530
# Node ID f0de9cd5399a5b49293d328ff56476ddc16066e9
# Parent 095ed87526e5964ad45949ec81903704451d79f8
asm: improve avx2 code for add_ps[32x32] (1428 -> 1312)
diff -r 095ed87526e5 -r f0de9cd5399a source/common/x86/pixeladd8.asm
--- a/source/common/x86/pixeladd8.asm Tue Apr 07 09:42:31 2015 +0530
+++ b/source/common/x86/pixeladd8.asm Tue Apr 07 13:59:45 2015 +0530
@@ -570,10 +570,14 @@
jnz .loop
RET
+%if ARCH_X86_64
INIT_YMM avx2
-cglobal pixel_add_ps_32x%2, 6, 7, 8, dest, destride, src0, scr1, srcStride0, srcStride1
+cglobal pixel_add_ps_32x%2, 6, 10, 8, dest, destride, src0, scr1, srcStride0, srcStride1
mov r6d, %2/4
add r5, r5
+ lea r7, [r4 * 3]
+ lea r8, [r5 * 3]
+ lea r9, [r1 * 3]
.loop:
pmovzxbw m0, [r2] ; first half of row 0 of src0
pmovzxbw m1, [r2 + 16] ; second half of row 0 of src0
@@ -597,40 +601,37 @@
vpermq m0, m0, 11011000b
movu [r0 + r1], m0 ; row 1 of dst
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
- lea r0, [r0 + r1 * 2]
-
- pmovzxbw m0, [r2] ; first half of row 2 of src0
- pmovzxbw m1, [r2 + 16] ; second half of row 2 of src0
- movu m2, [r3] ; first half of row 2 of src1
- movu m3, [r3 + 32] ; second half of row 2 of src1
+ pmovzxbw m0, [r2 + r4 * 2] ; first half of row 2 of src0
+ pmovzxbw m1, [r2 + r4 * 2 + 16] ; second half of row 2 of src0
+ movu m2, [r3 + r5 * 2] ; first half of row 2 of src1
+ movu m3, [r3 + r5 * 2 + 32] ; second half of row 2 of src1
paddw m0, m2
paddw m1, m3
packuswb m0, m1
vpermq m0, m0, 11011000b
- movu [r0], m0 ; row 2 of dst
+ movu [r0 + r1 * 2], m0 ; row 2 of dst
- pmovzxbw m0, [r2 + r4] ; first half of row 3 of src0
- pmovzxbw m1, [r2 + r4 + 16] ; second half of row 3 of src0
- movu m2, [r3 + r5] ; first half of row 3 of src1
- movu m3, [r3 + r5 + 32] ; second half of row 3 of src1
+ pmovzxbw m0, [r2 + r7] ; first half of row 3 of src0
+ pmovzxbw m1, [r2 + r7 + 16] ; second half of row 3 of src0
+ movu m2, [r3 + r8] ; first half of row 3 of src1
+ movu m3, [r3 + r8 + 32] ; second half of row 3 of src1
paddw m0, m2
paddw m1, m3
packuswb m0, m1
vpermq m0, m0, 11011000b
- movu [r0 + r1], m0 ; row 3 of dst
+ movu [r0 + r9], m0 ; row 3 of dst
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
- lea r0, [r0 + r1 * 2]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+ lea r0, [r0 + r1 * 4]
dec r6d
jnz .loop
RET
%endif
+%endif
%endmacro
PIXEL_ADD_PS_W32_H2 32, 32
More information about the x265-devel mailing list