[x265] [PATCH 4 of 6] avx2:'integral24v' asm code-> 7.30x faster than 'C' version
vignesh at multicorewareinc.com
vignesh at multicorewareinc.com
Tue May 9 06:46:21 CEST 2017
# HG changeset patch
# User Jayashri Murugan <jayashri at multicorewareinc.com>
# Date 1494222918 -19800
# Mon May 08 11:25:18 2017 +0530
# Node ID 12017e874c9adfd6dbba05a18c3244089d0a51b3
# Parent ecea0022176fa6c7e8fb8dc7be7b182bc19bab68
avx2:'integral24v' asm code-> 7.30x faster than 'C' version
integral_init24v 7.30x 207.58 1515.26 (columns: speedup, asm timing, C timing)
diff -r ecea0022176f -r 12017e874c9a source/common/x86/seaintegral.asm
--- a/source/common/x86/seaintegral.asm Mon May 08 11:02:46 2017 +0530
+++ b/source/common/x86/seaintegral.asm Mon May 08 11:25:18 2017 +0530
@@ -111,8 +111,22 @@
;void integral_init24v_c(uint32_t *sum24, intptr_t stride)
;-----------------------------------------------------------------------------
INIT_YMM avx2
-cglobal integral24v, 2, 2, 0
-
+cglobal integral24v, 2, 4, 2 ; 2 args (per the prototype above: r0 = sum24, r1 = stride), 4 GPRs, 2 vector regs
+ mov r2, r1 ; r2 = stride
+ mov r3, r1 ; r3 = stride
+ shl r2, 6 ; r2 = stride * 64
+ shl r3, 5 ; r3 = stride * 32
+ add r2, r3 ; r2 = stride * 96 = byte offset of 24 * stride uint32_t elements
+
+.loop ; each pass handles one 32-byte vector (8 dwords); r1 = elements still to process
+ movu m0, [r0] ; m0 = 8 dwords at the current position
+ movu m1, [r0 + r2] ; m1 = 8 dwords located 24 * stride elements further on
+ psubd m1, m0 ; m1 = m1 - m0 in each 32-bit lane
+ movu [r0], m1 ; write the differences back over the current position
+ add r0, 32 ; advance the pointer by 8 dwords
+ sub r1, 8 ; 8 fewer elements remain
+ cmp r1, 0 ; NOTE(review): looks redundant, the sub above already sets ZF on a zero result
+ jnz .loop ; repeat until the count is exactly 0; NOTE(review): assumes stride is a positive multiple of 8, otherwise the count never reaches 0 - confirm against callers
RET
;-----------------------------------------------------------------------------
More information about the x265-devel mailing list