[x265] [PATCH 2 of 3] asm: fix intra_pred_dc_sse2 in Main12

Min Chen chenm003 at 163.com
Wed Jul 15 03:05:59 CEST 2015


# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1436916586 25200
# Node ID 39d3e3941f865084c5b559f97f6178d26c7751c2
# Parent  4ecdacda40d51cf6ce8ceabad7cbdeb0081a6962
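asm: fix intra_pred_dc_sse2 in Main12

Replace the arithmetic right shifts (psraw/psrad) in the DC prediction and
edge-filter code with logical shifts (psrlw/psrld), and replace the
movhlps/pshuflw/pmaddwd horizontal-sum sequences with the HADDUW, HADDUWD
and HADDD macros followed by dword rounding (paddd) and shift (psrld).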
---
 source/common/x86/intrapred16.asm |   57 +++++++++++++++++--------------------
 1 files changed, 26 insertions(+), 31 deletions(-)

diff -r 4ecdacda40d5 -r 39d3e3941f86 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm	Tue Jul 14 16:29:43 2015 -0700
+++ b/source/common/x86/intrapred16.asm	Tue Jul 14 16:29:46 2015 -0700
@@ -142,7 +142,7 @@
     test        r4d,            r4d
 
     paddw       m0,             [pw_4]
-    psraw       m0,             3
+    psrlw       m0,             3
 
     ; store DC 4x4
     movh        [r0],           m0
@@ -161,7 +161,7 @@
     ; filter top
     movh        m1,             [r2 + 2]
     paddw       m1,             m0
-    psraw       m1,             2
+    psrlw       m1,             2
     movh        [r0],           m1             ; overwrite top-left pixel, we will update it later
 
     ; filter top-left
@@ -176,7 +176,7 @@
     ; filter left
     movu        m1,             [r2 + 20]
     paddw       m1,             m0
-    psraw       m1,             2
+    psrlw       m1,             2
     movd        r3d,            m1
     mov         [r0 + r1 * 2],  r3w
     shr         r3d,            16
@@ -202,7 +202,7 @@
     pmaddwd         m0,            [pw_1]
 
     paddw           m0,            [pw_8]
-    psraw           m0,            4              ; sum = sum / 16
+    psrlw           m0,            4              ; sum = sum / 16
     pshuflw         m0,            m0, 0
     pshufd          m0,            m0, 0          ; m0 = word [dc_val ...]
 
@@ -235,7 +235,7 @@
     ; filter top
     movu            m0,            [r2 + 2]
     paddw           m0,            m1
-    psraw           m0,            2
+    psrlw           m0,            2
     movu            [r0],          m0
 
     ; filter top-left
@@ -250,7 +250,7 @@
     ; filter left
     movu            m0,            [r2 + 36]
     paddw           m0,            m1
-    psraw           m0,            2
+    psrlw           m0,            2
     movh            r3,            m0
     mov             [r0 + r1 * 2], r3w
     shr             r3,            16
@@ -284,14 +284,10 @@
     paddw           m0,                  m1
     paddw           m2,                  m3
     paddw           m0,                  m2
-    movhlps         m1,                  m0
-    paddw           m0,                  m1
-    pshuflw         m1,                  m0, 0x6E
-    paddw           m0,                  m1
-    pmaddwd         m0,                  [pw_1]
-
-    paddw           m0,                  [pw_16]
-    psraw           m0,                  5
+    HADDUW          m0,                  m1
+    paddd           m0,                  [pd_16]
+    psrld           m0,                  5
+
     movd            r5d,                 m0
     pshuflw         m0,                  m0, 0 ; m0 = word [dc_val ...]
     pshufd          m0,                  m0, 0
@@ -347,11 +343,11 @@
     ; filter top
     movu            m2,                  [r2 + 2]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
     movu            [r0],                m2
     movu            m3,                  [r2 + 18]
     paddw           m3,                  m1
-    psraw           m3,                  2
+    psrlw           m3,                  2
     movu            [r0 + 16],           m3
 
     ; filter top-left
@@ -366,7 +362,7 @@
     ; filter left
     movu            m2,                  [r3 + 2]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
 
     movq            r2,                  m2
     pshufd          m2,                  m2, 0xEE
@@ -388,7 +384,7 @@
 
     movu            m3,                  [r3 + 18]
     paddw           m3,                  m1
-    psraw           m3,                  2
+    psrlw           m3,                  2
 
     movq            r3,                  m3
     pshufd          m3,                  m3, 0xEE
@@ -423,20 +419,19 @@
     paddw           m0,                  m1
     paddw           m2,                  m3
     paddw           m0,                  m2
+    HADDUWD         m0,                  m1
+
     movu            m1,                  [r2]
-    movu            m3,                  [r2 + 16]
-    movu            m4,                  [r2 + 32]
-    movu            m5,                  [r2 + 48]
+    movu            m2,                  [r2 + 16]
+    movu            m3,                  [r2 + 32]
+    movu            m4,                  [r2 + 48]
+    paddw           m1,                  m2
+    paddw           m3,                  m4
     paddw           m1,                  m3
-    paddw           m4,                  m5
-    paddw           m1,                  m4
-    paddw           m0,                  m1
-    movhlps         m1,                  m0
-    paddw           m0,                  m1
-    pshuflw         m1,                  m0, 0x6E
-    paddw           m0,                  m1
-    pmaddwd         m0,                  [pw_1]
-
+    HADDUWD         m1,                  m2
+
+    paddd           m0,                  m1
+    HADDD           m0,                  m1
     paddd           m0,                  [pd_32]     ; sum = sum + 32
     psrld           m0,                  6           ; sum = sum / 64
     pshuflw         m0,                  m0, 0
@@ -487,7 +482,7 @@
     phaddw          xm0,                 xm0
     pmaddwd         xm0,                 [pw_1]
     paddd           xm0,                 [pd_16]
-    psrad           xm0,                 5
+    psrld           xm0,                 5
     movd            r5d,                 xm0
     vpbroadcastw    m0,                  xm0
 
@@ -527,7 +522,7 @@
     ; filter top
     movu            m2,                  [r2 + 2]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
     movu            [r0],                m2
 
     ; filter top-left
@@ -542,7 +537,7 @@
     ; filter left
     movu            m2,                  [r2 + 68]
     paddw           m2,                  m1
-    psraw           m2,                  2
+    psrlw           m2,                  2
     vextracti128    xm3,                 m2, 1
 
     movq            r3,                  xm2



More information about the x265-devel mailing list