[x265] [PATCH] avx2: 'integral4v' asm code -> 7.48x faster than 'C' version

praveen at multicorewareinc.com praveen at multicorewareinc.com
Mon May 8 06:31:06 CEST 2017


# HG changeset patch
# User Praveen Tiwari <praveen at multicorewareinc.com>
# Date 1493905428 -19800
#      Thu May 04 19:13:48 2017 +0530
# Node ID 41611825c2f4661536500e1306db7d8c4bf7fd07
# Parent  48502979a4b21f6982dcdacbf7796bf5d9fb395c
avx2: 'integral4v' asm code -> 7.48x faster than 'C' version

   integral_init4v  7.48x    202.53          1515.14

diff -r 48502979a4b2 -r 41611825c2f4 source/common/x86/seaintegral.asm
--- a/source/common/x86/seaintegral.asm	Wed May 03 11:26:26 2017 +0530
+++ b/source/common/x86/seaintegral.asm	Thu May 04 19:13:48 2017 +0530
@@ -32,8 +32,19 @@
 ;void integral_init4v_c(uint32_t *sum4, intptr_t stride)
 ;-----------------------------------------------------------------------------
 INIT_YMM avx2
-cglobal integral4v, 2, 2, 0
- 
+cglobal integral4v, 2, 3, 2
+    mov     r2, r1                  ; r1 = stride, counted in uint32 elements
+    shl     r2, 4                   ; r2 = stride * 16 bytes = byte offset of sum4[x + 4 * stride]
+    ; 8 uint32 per pass; a stride that is not a multiple of 8 overshoots by up to 7 elements (TODO confirm callers pad).
+.loop:
+    movu    m0, [r0]                ; m0 = sum4[x .. x+7]
+    movu    m1, [r0 + r2]           ; m1 = sum4[x + 4*stride .. x + 4*stride + 7]
+    psubd   m1, m0                  ; m1 = sum4[x + 4*stride] - sum4[x], per 32-bit lane
+    movu    [r0], m1                ; result overwrites sum4[x .. x+7] in place
+    add     r0, 32                  ; advance by the 32 bytes just processed
+    sub     r1, 8                   ; elements remaining
+    ; sub already set the flags, so no cmp is needed; jg (not jnz) also ends the loop if the count steps past zero.
+    jg      .loop
     RET
 
 ;-----------------------------------------------------------------------------


More information about the x265-devel mailing list