[x265] [PATCH] asm: improve on intra_dc32

Min Chen chenm003 at 163.com
Sat Mar 7 02:07:18 CET 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1425690429 28800
# Node ID 63d132c844b9d299081b40e7589275b78fe71093
# Parent  043c2418864b0a3ada6f597e6def6ead73d90b5f
asm: improve on intra_dc32
---
 source/common/x86/intrapred8.asm |   71 ++++++++++++--------------------------
 1 files changed, 22 insertions(+), 49 deletions(-)

diff -r 043c2418864b -r 63d132c844b9 source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm	Fri Mar 06 13:15:55 2015 -0600
+++ b/source/common/x86/intrapred8.asm	Fri Mar 06 17:07:09 2015 -0800
@@ -524,15 +524,21 @@
     pshuflw         m1,            m1, 0x00       ; m1 = byte [dc_val ...]
     pshufd          m1,            m1, 0x00
 
+    lea             r2,            [r0 + r1 * 2]
 %assign x 0
-%rep 16
+%rep 8
     ; store DC 16x16
     movu            [r0],               m1
+    movu            [r0 + 16],          m1
     movu            [r0 + r1],          m1
-    movu            [r0 + 16],          m1
     movu            [r0 + r1 + 16],     m1
-%if x < 16
-    lea             r0,            [r0 + 2 * r1]
+    movu            [r2],               m1
+    movu            [r2 + 16],          m1
+    movu            [r2 + r1],          m1
+    movu            [r2 + r1 + 16],     m1
+%if x < 8
+    lea             r0,            [r0 + 4 * r1]
+    lea             r2,            [r2 + 4 * r1]
 %endif
 %assign x x+1
 %endrep
@@ -996,14 +1002,13 @@
 ; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
 ;---------------------------------------------------------------------------------------------
 INIT_XMM sse4
-cglobal intra_pred_dc32, 3, 5, 5
-    lea             r3, [r2 + 65]
+cglobal intra_pred_dc32, 3,3,5
     inc             r2
     pxor            m0,            m0
     movu            m1,            [r2]
     movu            m2,            [r2 + 16]
-    movu            m3,            [r3]
-    movu            m4,            [r3 + 16]
+    movu            m3,            [r2 + 64]
+    movu            m4,            [r2 + 64 + 16]
     psadbw          m1,            m0
     psadbw          m2,            m0
     psadbw          m3,            m0
@@ -1014,54 +1019,22 @@
     pshufd          m2,            m1, 2
     paddw           m1,            m2
 
-    movd            r4d,           m1
-    add             r4d,           32
-    shr             r4d,           6     ; sum = sum / 64
-    movd            m1,            r4d
-    pshufb          m1,            m0    ; m1 = byte [dc_val ...]
-
-%rep 2
+    paddw           m1,            [pw_32]      ; sum = (sum + 32) / 64
+    psrlw           m1,            6
+    pshufb          m1,            m0           ; m1 = byte [dc_val ...]
+
+    ; store DC 32x32 (each iteration writes two 32-byte rows)
+%assign x 0
+%rep 16
     ; store DC 16x16
     movu            [r0],          m1
+    movu            [r0 + 16],     m1
     movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
     movu            [r0 + r1 + 16],m1
+  %if (x < 16)
     lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
-    movu            [r0],          m1
-    movu            [r0 + r1],     m1
-    movu            [r0 + 16],     m1
-    movu            [r0 + r1 + 16],m1
-    lea             r0,            [r0 + 2 * r1]
+  %endif
+%assign x x+1
 %endrep
 
     RET



More information about the x265-devel mailing list