[x265] [PATCH 2 of 3] asm: fix intra_pred_dc_sse2 in Main12
Min Chen
chenm003 at 163.com
Wed Jul 15 03:05:59 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1436916586 25200
# Node ID 39d3e3941f865084c5b559f97f6178d26c7751c2
# Parent 4ecdacda40d51cf6ce8ceabad7cbdeb0081a6962
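asm: fix intra_pred_dc_sse2 in Main12

Replace the arithmetic right shifts (psraw/psrad) in the DC intra prediction
kernels with logical shifts (psrlw/psrld), and accumulate the larger DC sums
as dwords via the HADDUW/HADDUWD/HADDD macros instead of word-wide adds.
Presumably the higher sample bit depth of Main12 lets these sums exceed the
signed 16-bit range, where an arithmetic shift would treat them as negative.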
---
source/common/x86/intrapred16.asm | 57 +++++++++++++++++--------------------
1 files changed, 26 insertions(+), 31 deletions(-)
diff -r 4ecdacda40d5 -r 39d3e3941f86 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Jul 14 16:29:43 2015 -0700
+++ b/source/common/x86/intrapred16.asm Tue Jul 14 16:29:46 2015 -0700
@@ -142,7 +142,7 @@
test r4d, r4d
paddw m0, [pw_4]
- psraw m0, 3
+ psrlw m0, 3
; store DC 4x4
movh [r0], m0
@@ -161,7 +161,7 @@
; filter top
movh m1, [r2 + 2]
paddw m1, m0
- psraw m1, 2
+ psrlw m1, 2
movh [r0], m1 ; overwrite top-left pixel, we will update it later
; filter top-left
@@ -176,7 +176,7 @@
; filter left
movu m1, [r2 + 20]
paddw m1, m0
- psraw m1, 2
+ psrlw m1, 2
movd r3d, m1
mov [r0 + r1 * 2], r3w
shr r3d, 16
@@ -202,7 +202,7 @@
pmaddwd m0, [pw_1]
paddw m0, [pw_8]
- psraw m0, 4 ; sum = sum / 16
+ psrlw m0, 4 ; sum = sum / 16
pshuflw m0, m0, 0
pshufd m0, m0, 0 ; m0 = word [dc_val ...]
@@ -235,7 +235,7 @@
; filter top
movu m0, [r2 + 2]
paddw m0, m1
- psraw m0, 2
+ psrlw m0, 2
movu [r0], m0
; filter top-left
@@ -250,7 +250,7 @@
; filter left
movu m0, [r2 + 36]
paddw m0, m1
- psraw m0, 2
+ psrlw m0, 2
movh r3, m0
mov [r0 + r1 * 2], r3w
shr r3, 16
@@ -284,14 +284,10 @@
paddw m0, m1
paddw m2, m3
paddw m0, m2
- movhlps m1, m0
- paddw m0, m1
- pshuflw m1, m0, 0x6E
- paddw m0, m1
- pmaddwd m0, [pw_1]
-
- paddw m0, [pw_16]
- psraw m0, 5
+ HADDUW m0, m1
+ paddd m0, [pd_16]
+ psrld m0, 5
+
movd r5d, m0
pshuflw m0, m0, 0 ; m0 = word [dc_val ...]
pshufd m0, m0, 0
@@ -347,11 +343,11 @@
; filter top
movu m2, [r2 + 2]
paddw m2, m1
- psraw m2, 2
+ psrlw m2, 2
movu [r0], m2
movu m3, [r2 + 18]
paddw m3, m1
- psraw m3, 2
+ psrlw m3, 2
movu [r0 + 16], m3
; filter top-left
@@ -366,7 +362,7 @@
; filter left
movu m2, [r3 + 2]
paddw m2, m1
- psraw m2, 2
+ psrlw m2, 2
movq r2, m2
pshufd m2, m2, 0xEE
@@ -388,7 +384,7 @@
movu m3, [r3 + 18]
paddw m3, m1
- psraw m3, 2
+ psrlw m3, 2
movq r3, m3
pshufd m3, m3, 0xEE
@@ -423,20 +419,19 @@
paddw m0, m1
paddw m2, m3
paddw m0, m2
+ HADDUWD m0, m1
+
movu m1, [r2]
- movu m3, [r2 + 16]
- movu m4, [r2 + 32]
- movu m5, [r2 + 48]
+ movu m2, [r2 + 16]
+ movu m3, [r2 + 32]
+ movu m4, [r2 + 48]
+ paddw m1, m2
+ paddw m3, m4
paddw m1, m3
- paddw m4, m5
- paddw m1, m4
- paddw m0, m1
- movhlps m1, m0
- paddw m0, m1
- pshuflw m1, m0, 0x6E
- paddw m0, m1
- pmaddwd m0, [pw_1]
-
+ HADDUWD m1, m2
+
+ paddd m0, m1
+ HADDD m0, m1
paddd m0, [pd_32] ; sum = sum + 32
psrld m0, 6 ; sum = sum / 64
pshuflw m0, m0, 0
@@ -487,7 +482,7 @@
phaddw xm0, xm0
pmaddwd xm0, [pw_1]
paddd xm0, [pd_16]
- psrad xm0, 5
+ psrld xm0, 5
movd r5d, xm0
vpbroadcastw m0, xm0
@@ -527,7 +522,7 @@
; filter top
movu m2, [r2 + 2]
paddw m2, m1
- psraw m2, 2
+ psrlw m2, 2
movu [r0], m2
; filter top-left
@@ -542,7 +537,7 @@
; filter left
movu m2, [r2 + 68]
paddw m2, m1
- psraw m2, 2
+ psrlw m2, 2
vextracti128 xm3, m2, 1
movq r3, xm2
More information about the x265-devel mailing list