[x265] [PATCH] asm: improve sub_ps[16x16] (477 -> 461) and reduce code size
Sumalatha Polureddy
sumalatha at multicorewareinc.com
Tue Apr 14 08:31:40 CEST 2015
# HG changeset patch
# User Sumalatha Polureddy
# Date 1428922508 -19800
# Mon Apr 13 16:25:08 2015 +0530
# Node ID c08b05773bb99280ca3ab8a30fbe0c64dd8cecc0
# Parent abfbfdf724a0b224ba5e98c55e81cc9ed295c2f9
asm: improve sub_ps[16x16] (477 -> 461) and reduce code size
diff -r abfbfdf724a0 -r c08b05773bb9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm Mon Apr 13 14:13:19 2015 -0700
+++ b/source/common/x86/pixel-util8.asm Mon Apr 13 16:25:08 2015 +0530
@@ -4514,18 +4514,21 @@
;-----------------------------------------------------------------------------
; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
+%if ARCH_X86_64
INIT_YMM avx2
-cglobal pixel_sub_ps_16x16, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1
+cglobal pixel_sub_ps_16x16, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
add r1, r1
lea r6, [r1 * 3]
-
-%rep 4
+ mov r7d, 2
+
+ lea r9, [r4 * 3]
+ lea r8, [r5 * 3]
+
+.loop
pmovzxbw m0, [r2]
pmovzxbw m1, [r3]
pmovzxbw m2, [r2 + r4]
pmovzxbw m3, [r3 + r5]
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
psubw m0, m1
psubw m2, m3
@@ -4533,6 +4536,21 @@
movu [r0], m0
movu [r0 + r1], m2
+ pmovzxbw m0, [r2 + 2 * r4]
+ pmovzxbw m1, [r3 + 2 * r5]
+ pmovzxbw m2, [r2 + r9]
+ pmovzxbw m3, [r3 + r8]
+
+ psubw m0, m1
+ psubw m2, m3
+
+ movu [r0 + r1 * 2], m0
+ movu [r0 + r6], m2
+
+ lea r0, [r0 + r1 * 4]
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+
pmovzxbw m0, [r2]
pmovzxbw m1, [r3]
pmovzxbw m2, [r2 + r4]
@@ -4541,14 +4559,29 @@
psubw m0, m1
psubw m2, m3
+ movu [r0], m0
+ movu [r0 + r1], m2
+
+ pmovzxbw m0, [r2 + 2 * r4]
+ pmovzxbw m1, [r3 + 2 * r5]
+ pmovzxbw m2, [r2 + r9]
+ pmovzxbw m3, [r3 + r8]
+
+ psubw m0, m1
+ psubw m2, m3
+
movu [r0 + r1 * 2], m0
movu [r0 + r6], m2
lea r0, [r0 + r1 * 4]
- lea r2, [r2 + r4 * 2]
- lea r3, [r3 + r5 * 2]
-%endrep
+ lea r2, [r2 + r4 * 4]
+ lea r3, [r3 + r5 * 4]
+
+ dec r7d
+ jnz .loop
RET
+%endif
+
;-----------------------------------------------------------------------------
; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
;-----------------------------------------------------------------------------
@@ -5125,7 +5158,6 @@
dec r6d
jnz .loop
RET
-
;=============================================================================
; variance
;=============================================================================
More information about the x265-devel mailing list