[x265] [PATCH] asm: optimised pixel_sad_xN_24x32 assembly code

yuvaraj at multicorewareinc.com yuvaraj at multicorewareinc.com
Fri Nov 8 13:30:29 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1383913778 -19800
#      Fri Nov 08 17:59:38 2013 +0530
# Node ID 282201736bff80020b82be211ae8a3ebfb59a6f1
# Parent  94cba84de8dd754322cf62df71abd4af24b0b47d
asm: optimised pixel_sad_xN_24x32 assembly code by folding row offsets (FENC_STRIDE, stride, and their *2 scaled forms) into the load addressing modes, so the pointer-advancing lea groups execute once per two rows instead of once per row

diff -r 94cba84de8dd -r 282201736bff source/common/x86/sad-a.asm
--- a/source/common/x86/sad-a.asm	Fri Nov 08 02:01:52 2013 -0600
+++ b/source/common/x86/sad-a.asm	Fri Nov 08 17:59:38 2013 +0530
@@ -1774,91 +1774,86 @@
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m2,  m5
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r4]
-    lea     r2,  [r2 + r4]
-    lea     r3,  [r3 + r4]
-    mova    m3,  [r0]
-    mova    m4,  [r0 + 16]
-    movu    m5,  [r1]
-    movu    m6,  [r1 + 16]
+
+    mova    m3,  [r0 + FENC_STRIDE]
+    mova    m4,  [r0 + 16 + FENC_STRIDE]
+    movu    m5,  [r1 + r4]
+    movu    m6,  [r1 + 16 + r4]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m0,  m5
-    movu    m5,  [r2]
-    movu    m6,  [r2 + 16]
+    movu    m5,  [r2 + r4]
+    movu    m6,  [r2 + 16 + r4]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m1,  m5
-    movu    m5,  [r3]
-    movu    m6,  [r3 + 16]
+    movu    m5,  [r3 + r4]
+    movu    m6,  [r3 + 16 + r4]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m2,  m5
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r4]
-    lea     r2,  [r2 + r4]
-    lea     r3,  [r3 + r4]
-    mova    m3,  [r0]
-    mova    m4,  [r0 + 16]
-    movu    m5,  [r1]
-    movu    m6,  [r1 + 16]
+
+    mova    m3,  [r0 + FENC_STRIDE * 2]
+    mova    m4,  [r0 + 16 + FENC_STRIDE * 2]
+    movu    m5,  [r1 + r4 * 2]
+    movu    m6,  [r1 + 16 + r4 * 2]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m0,  m5
-    movu    m5,  [r2]
-    movu    m6,  [r2 + 16]
+    movu    m5,  [r2 + r4 * 2]
+    movu    m6,  [r2 + 16 + r4 * 2]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m1,  m5
-    movu    m5,  [r3]
-    movu    m6,  [r3 + 16]
+    movu    m5,  [r3 + r4 * 2]
+    movu    m6,  [r3 + 16 + r4 * 2]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m2,  m5
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r4]
-    lea     r2,  [r2 + r4]
-    lea     r3,  [r3 + r4]
-    mova    m3,  [r0]
-    mova    m4,  [r0 + 16]
-    movu    m5,  [r1]
-    movu    m6,  [r1 + 16]
+    lea     r0,  [r0 + FENC_STRIDE * 2]
+    lea     r1,  [r1 + r4 * 2]
+    lea     r2,  [r2 + r4 * 2]
+    lea     r3,  [r3 + r4 * 2]
+
+    mova    m3,  [r0 + FENC_STRIDE]
+    mova    m4,  [r0 + 16 + FENC_STRIDE]
+    movu    m5,  [r1 + r4]
+    movu    m6,  [r1 + 16 + r4]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m0,  m5
-    movu    m5,  [r2]
-    movu    m6,  [r2 + 16]
+    movu    m5,  [r2 + r4]
+    movu    m6,  [r2 + 16 + r4]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m1,  m5
-    movu    m5,  [r3]
-    movu    m6,  [r3 + 16]
+    movu    m5,  [r3 + r4]
+    movu    m6,  [r3 + 16 + r4]
     psadbw  m5,  m3
     psadbw  m6,  m4
     pshufd  m6,  m6, 84
     paddd   m5,  m6
     paddd   m2,  m5
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r4]
-    lea     r2,  [r2 + r4]
-    lea     r3,  [r3 + r4]
+    lea     r0,  [r0 + FENC_STRIDE * 2]
+    lea     r1,  [r1 + r4 * 2]
+    lea     r2,  [r2 + r4 * 2]
+    lea     r3,  [r3 + r4 * 2]
 %endmacro
 
 %macro SAD_X4_24x4 0
@@ -1892,116 +1887,108 @@
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m3,  m6
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r5]
-    lea     r2,  [r2 + r5]
-    lea     r3,  [r3 + r5]
-    lea     r4,  [r4 + r5]
-    mova    m4,  [r0]
-    mova    m5,  [r0 + 16]
-    movu    m6,  [r1]
-    movu    m7,  [r1 + 16]
+
+    mova    m4,  [r0 + FENC_STRIDE]
+    mova    m5,  [r0 + 16 + FENC_STRIDE]
+    movu    m6,  [r1 + r5]
+    movu    m7,  [r1 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m0,  m6
-    movu    m6,  [r2]
-    movu    m7,  [r2 + 16]
+    movu    m6,  [r2 + r5]
+    movu    m7,  [r2 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m1,  m6
-    movu    m6,  [r3]
-    movu    m7,  [r3 + 16]
+    movu    m6,  [r3 + r5]
+    movu    m7,  [r3 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m2,  m6
-    movu    m6,  [r4]
-    movu    m7,  [r4 + 16]
+    movu    m6,  [r4 + r5]
+    movu    m7,  [r4 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m3,  m6
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r5]
-    lea     r2,  [r2 + r5]
-    lea     r3,  [r3 + r5]
-    lea     r4,  [r4 + r5]
-    mova    m4,  [r0]
-    mova    m5,  [r0 + 16]
-    movu    m6,  [r1]
-    movu    m7,  [r1 + 16]
+
+    mova    m4,  [r0 + FENC_STRIDE * 2]
+    mova    m5,  [r0 + 16 + FENC_STRIDE * 2]
+    movu    m6,  [r1 + r5 * 2]
+    movu    m7,  [r1 + 16 + r5 * 2]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m0,  m6
-    movu    m6,  [r2]
-    movu    m7,  [r2 + 16]
+    movu    m6,  [r2 + r5 * 2]
+    movu    m7,  [r2 + 16 + r5 * 2]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m1,  m6
-    movu    m6,  [r3]
-    movu    m7,  [r3 + 16]
+    movu    m6,  [r3 + r5 * 2]
+    movu    m7,  [r3 + 16 + r5 * 2]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m2,  m6
-    movu    m6,  [r4]
-    movu    m7,  [r4 + 16]
+    movu    m6,  [r4 + r5 * 2]
+    movu    m7,  [r4 + 16 + r5 * 2]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m3,  m6
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r5]
-    lea     r2,  [r2 + r5]
-    lea     r3,  [r3 + r5]
-    lea     r4,  [r4 + r5]
-    mova    m4,  [r0]
-    mova    m5,  [r0 + 16]
-    movu    m6,  [r1]
-    movu    m7,  [r1 + 16]
+    lea     r0,  [r0 + FENC_STRIDE * 2]
+    lea     r1,  [r1 + r5 * 2]
+    lea     r2,  [r2 + r5 * 2]
+    lea     r3,  [r3 + r5 * 2]
+    lea     r4,  [r4 + r5 * 2]
+    mova    m4,  [r0 + FENC_STRIDE]
+    mova    m5,  [r0 + 16 + FENC_STRIDE]
+    movu    m6,  [r1 + r5]
+    movu    m7,  [r1 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m0,  m6
-    movu    m6,  [r2]
-    movu    m7,  [r2 + 16]
+    movu    m6,  [r2 + r5]
+    movu    m7,  [r2 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m1,  m6
-    movu    m6,  [r3]
-    movu    m7,  [r3 + 16]
+    movu    m6,  [r3 + r5]
+    movu    m7,  [r3 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m2,  m6
-    movu    m6,  [r4]
-    movu    m7,  [r4 + 16]
+    movu    m6,  [r4 + r5]
+    movu    m7,  [r4 + 16 + r5]
     psadbw  m6,  m4
     psadbw  m7,  m5
     pshufd  m7,  m7, 84
     paddd   m6,  m7
     paddd   m3,  m6
-    lea     r0,  [r0 + FENC_STRIDE]
-    lea     r1,  [r1 + r5]
-    lea     r2,  [r2 + r5]
-    lea     r3,  [r3 + r5]
-    lea     r4,  [r4 + r5]
+    lea     r0,  [r0 + FENC_STRIDE * 2]
+    lea     r1,  [r1 + r5 * 2]
+    lea     r2,  [r2 + r5 * 2]
+    lea     r3,  [r3 + r5 * 2]
+    lea     r4,  [r4 + r5 * 2]
 %endmacro
 
 %macro SAD_X3_32x4 0


More information about the x265-devel mailing list