[x265] [PATCH] asm: improve sub_ps[16x16] (477 -> 461) and reduce code size

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Tue Apr 14 08:31:40 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy
# Date 1428922508 -19800
#      Mon Apr 13 16:25:08 2015 +0530
# Node ID c08b05773bb99280ca3ab8a30fbe0c64dd8cecc0
# Parent  abfbfdf724a0b224ba5e98c55e81cc9ed295c2f9
asm: improve sub_ps[16x16] (477 -> 461) and reduce code size

diff -r abfbfdf724a0 -r c08b05773bb9 source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Mon Apr 13 14:13:19 2015 -0700
+++ b/source/common/x86/pixel-util8.asm	Mon Apr 13 16:25:08 2015 +0530
@@ -4514,18 +4514,21 @@
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_16x16(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
+%if ARCH_X86_64
 INIT_YMM avx2
-cglobal pixel_sub_ps_16x16, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1
+cglobal pixel_sub_ps_16x16, 6, 10, 4, dest, deststride, src0, src1, srcstride0, srcstride1
     add         r1,     r1
     lea         r6,     [r1 * 3]
-
-%rep 4
+    mov         r7d,    2
+
+    lea         r9,     [r4 * 3]
+    lea         r8,     [r5 * 3]
+
+.loop
     pmovzxbw    m0,     [r2]
     pmovzxbw    m1,     [r3]
     pmovzxbw    m2,     [r2 + r4]
     pmovzxbw    m3,     [r3 + r5]
-    lea         r2,     [r2 + r4 * 2]
-    lea         r3,     [r3 + r5 * 2]
 
     psubw       m0,     m1
     psubw       m2,     m3
@@ -4533,6 +4536,21 @@
     movu        [r0],            m0
     movu        [r0 + r1],       m2
 
+    pmovzxbw    m0,     [r2 + 2 * r4]
+    pmovzxbw    m1,     [r3 + 2 * r5]
+    pmovzxbw    m2,     [r2 + r9]
+    pmovzxbw    m3,     [r3 + r8]
+
+    psubw       m0,     m1
+    psubw       m2,     m3
+
+    movu        [r0 + r1 * 2],   m0
+    movu        [r0 + r6],       m2
+
+    lea         r0,     [r0 + r1 * 4]
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+
     pmovzxbw    m0,     [r2]
     pmovzxbw    m1,     [r3]
     pmovzxbw    m2,     [r2 + r4]
@@ -4541,14 +4559,29 @@
     psubw       m0,     m1
     psubw       m2,     m3
 
+    movu        [r0],            m0
+    movu        [r0 + r1],       m2
+
+    pmovzxbw    m0,     [r2 + 2 * r4]
+    pmovzxbw    m1,     [r3 + 2 * r5]
+    pmovzxbw    m2,     [r2 + r9]
+    pmovzxbw    m3,     [r3 + r8]
+
+    psubw       m0,     m1
+    psubw       m2,     m3
+
     movu        [r0 + r1 * 2],   m0
     movu        [r0 + r6],       m2
 
     lea         r0,     [r0 + r1 * 4]
-    lea         r2,     [r2 + r4 * 2]
-    lea         r3,     [r3 + r5 * 2]
-%endrep
+    lea         r2,     [r2 + r4 * 4]
+    lea         r3,     [r3 + r5 * 4]
+
+    dec         r7d
+    jnz         .loop
     RET
+%endif
+
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_32x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
@@ -5125,7 +5158,6 @@
     dec         r6d
     jnz         .loop
     RET
-
 ;=============================================================================
 ; variance
 ;=============================================================================


More information about the x265-devel mailing list