[x265] [PATCH] asm: avx2 sub_ps 8bpp - use loop for 32x32 and 64x64

rajesh at multicorewareinc.com rajesh at multicorewareinc.com
Fri Mar 20 05:44:18 CET 2015


# HG changeset patch
# User Rajesh Paulraj<rajesh at multicorewareinc.com>
# Date 1426744567 -19800
#      Thu Mar 19 11:26:07 2015 +0530
# Node ID 085a32290bdf170aaede30c6c28b799148bab911
# Parent  ee667a53aeb475c5d14a75f1d537111e5654ff79
asm: avx2 sub_ps 8bpp - use loop for 32x32 and 64x64

     Measured speedup (presumably relative to the C primitive): sub_ps[32x32](10.03x), sub_ps[64x64](10.36x)

diff -r ee667a53aeb4 -r 085a32290bdf source/common/x86/pixel-util8.asm
--- a/source/common/x86/pixel-util8.asm	Thu Mar 19 20:56:29 2015 -0500
+++ b/source/common/x86/pixel-util8.asm	Thu Mar 19 11:26:07 2015 +0530
@@ -4510,10 +4510,11 @@
 ; void pixel_sub_ps_32x32(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal pixel_sub_ps_32x32, 6, 6, 4, dest, deststride, src0, src1, srcstride0, srcstride1
-     add         r1,    r1
-
-%rep 4
+cglobal pixel_sub_ps_32x32, 6, 7, 4, dest, deststride, src0, src1, srcstride0, srcstride1
+     mov        r6d,    4
+     add        r1,     r1
+
+.loop:
     pmovzxbw    m0,     [r2]
     pmovzxbw    m1,     [r2 + 16]
     pmovzxbw    m2,     [r3]
@@ -4630,8 +4631,11 @@
     lea         r0,     [r0 + r1]
     lea         r2,     [r2 + r4 * 2]
     lea         r3,     [r3 + r5 * 2]
-%endrep
+
+    dec         r6d
+    jnz         .loop
     RET
+
 ;-----------------------------------------------------------------------------
 ; void pixel_sub_ps_64x%2(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
@@ -4855,10 +4859,11 @@
 ; void pixel_sub_ps_64x64(int16_t *dest, intptr_t destride, pixel *src0, pixel *src1, intptr_t srcstride0, intptr_t srcstride1);
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal pixel_sub_ps_64x64, 6, 6, 8, dest, deststride, src0, src1, srcstride0, srcstride1
-     add        r1,     r1
-
-%rep 16
+cglobal pixel_sub_ps_64x64, 6, 7, 8, dest, deststride, src0, src1, srcstride0, srcstride1
+     mov        r6d,    16
+     add        r1,     r1
+
+.loop:
     pmovzxbw    m0,     [r2]
     pmovzxbw    m1,     [r2 + 16]
     pmovzxbw    m2,     [r2 + 32]
@@ -4954,8 +4959,11 @@
     add         r0,     r1
     add         r2,     r4
     add         r3,     r5
-%endrep
+
+    dec         r6d
+    jnz         .loop
     RET
+
 ;=============================================================================
 ; variance
 ;=============================================================================


More information about the x265-devel mailing list