[x265] [PATCH] asm: improve the old avx2 code for sad[64x64]

sumalatha at multicorewareinc.com sumalatha at multicorewareinc.com
Mon Apr 6 12:31:12 CEST 2015


# HG changeset patch
# User Sumalatha Polureddy
# Date 1428312216 -19800
#      Mon Apr 06 14:53:36 2015 +0530
# Node ID ee87bb231905d5d8462e763d7e25c7957f52af2a
# Parent  7fcb5dd81aa6b7dd20ce072e32b18e1ffa6bd5c1
asm: improve the old avx2 code for sad[64x64]

Testbench results (columns: speedup over the C reference, optimized asm time, C reference time):
old:
sad[64x64]  21.47x   1702.40         36545.14
new:
sad[64x64]  22.89x   1595.16         36506.87

diff -r 7fcb5dd81aa6 -r ee87bb231905 source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Mon Apr 06 11:47:55 2015 +0530
+++ b/source/common/x86/sad-a.asm	Mon Apr 06 14:53:36 2015 +0530
@@ -4346,10 +4346,12 @@
     RET
 
 INIT_YMM avx2
-cglobal pixel_sad_64x64, 4,5,6
+cglobal pixel_sad_64x64, 4,7,6
     xorps           m0, m0
     xorps           m5, m5
     mov             r4d, 8
+    lea             r5, [r1 * 3]
+    lea             r6, [r3 * 3]
 .loop
     movu           m1, [r0]               ; first 32 of row 0 of pix0
     movu           m2, [r2]               ; first 32 of row 0 of pix1
@@ -4371,31 +4373,28 @@
     paddd          m0, m1
     paddd          m5, m3
 
-    lea     r2,     [r2 + 2 * r3]
-    lea     r0,     [r0 + 2 * r1]
-
-    movu           m1, [r0]               ; first 32 of row 2 of pix0
-    movu           m2, [r2]               ; first 32 of row 2 of pix1
-    movu           m3, [r0 + 32]          ; second 32 of row 2 of pix0
-    movu           m4, [r2 + 32]          ; second 32 of row 2 of pix1
+    movu           m1, [r0 + 2 * r1]      ; first 32 of row 2 of pix0
+    movu           m2, [r2 + 2 * r3]      ; first 32 of row 2 of pix1
+    movu           m3, [r0 + 2 * r1 + 32] ; second 32 of row 2 of pix0
+    movu           m4, [r2 + 2 * r3 + 32] ; second 32 of row 2 of pix1
 
     psadbw         m1, m2
     psadbw         m3, m4
     paddd          m0, m1
     paddd          m5, m3
 
-    movu           m1, [r0 + r1]          ; first 32 of row 3 of pix0
-    movu           m2, [r2 + r3]          ; first 32 of row 3 of pix1
-    movu           m3, [r0 + 32 + r1]     ; second 32 of row 3 of pix0
-    movu           m4, [r2 + 32 + r3]     ; second 32 of row 3 of pix1
+    movu           m1, [r0 + r5]          ; first 32 of row 3 of pix0
+    movu           m2, [r2 + r6]          ; first 32 of row 3 of pix1
+    movu           m3, [r0 + 32 + r5]     ; second 32 of row 3 of pix0
+    movu           m4, [r2 + 32 + r6]     ; second 32 of row 3 of pix1
 
     psadbw         m1, m2
     psadbw         m3, m4
     paddd          m0, m1
     paddd          m5, m3
 
-    lea     r2,     [r2 + 2 * r3]
-    lea     r0,     [r0 + 2 * r1]
+    lea     r2,     [r2 + 4 * r3]
+    lea     r0,     [r0 + 4 * r1]
 
     movu           m1, [r0]               ; first 32 of row 4 of pix0
     movu           m2, [r2]               ; first 32 of row 4 of pix1
@@ -4417,31 +4416,28 @@
     paddd          m0, m1
     paddd          m5, m3
 
-    lea     r2,     [r2 + 2 * r3]
-    lea     r0,     [r0 + 2 * r1]
-
-    movu           m1, [r0]               ; first 32 of row 6 of pix0
-    movu           m2, [r2]               ; first 32 of row 6 of pix1
-    movu           m3, [r0 + 32]          ; second 32 of row 6 of pix0
-    movu           m4, [r2 + 32]          ; second 32 of row 6 of pix1
+    movu           m1, [r0 + 2 * r1]      ; first 32 of row 6 of pix0
+    movu           m2, [r2 + 2 * r3]      ; first 32 of row 6 of pix1
+    movu           m3, [r0 + 2 * r1 + 32] ; second 32 of row 6 of pix0
+    movu           m4, [r2 + 2 * r3 + 32] ; second 32 of row 6 of pix1
 
     psadbw         m1, m2
     psadbw         m3, m4
     paddd          m0, m1
     paddd          m5, m3
 
-    movu           m1, [r0 + r1]          ; first 32 of row 7 of pix0
-    movu           m2, [r2 + r3]          ; first 32 of row 7 of pix1
-    movu           m3, [r0 + 32 + r1]     ; second 32 of row 7 of pix0
-    movu           m4, [r2 + 32 + r3]     ; second 32 of row 7 of pix1
+    movu           m1, [r0 + r5]          ; first 32 of row 7 of pix0
+    movu           m2, [r2 + r6]          ; first 32 of row 7 of pix1
+    movu           m3, [r0 + 32 + r5]     ; second 32 of row 7 of pix0
+    movu           m4, [r2 + 32 + r6]     ; second 32 of row 7 of pix1
 
     psadbw         m1, m2
     psadbw         m3, m4
     paddd          m0, m1
     paddd          m5, m3
 
-    lea     r2,     [r2 + 2 * r3]
-    lea     r0,     [r0 + 2 * r1]
+    lea     r2,     [r2 + 4 * r3]
+    lea     r0,     [r0 + 4 * r1]
 
     dec         r4d
     jnz         .loop


More information about the x265-devel mailing list