[x265] [PATCH] asm : saturation bug fix for luma_vss asm routine

nabajit at multicorewareinc.com nabajit at multicorewareinc.com
Wed Jan 29 07:32:26 CET 2014


# HG changeset patch
# User Nabajit Deka
# Date 1390977132 -19800
#      Wed Jan 29 12:02:12 2014 +0530
# Node ID a03f9fbd6af6d793af9054c85ee7d281fe447af8
# Parent  8552e8cc1a3c60ddcab85e7421229c9a86d4785f
asm: fix saturation bug in the luma_vss asm routine (mask each result with pd_ffff and pack with packusdw instead of packssdw).

diff -r 8552e8cc1a3c -r a03f9fbd6af6 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Tue Jan 28 08:49:01 2014 -0600
+++ b/source/common/x86/ipfilter8.asm	Wed Jan 29 12:02:12 2014 +0530
@@ -130,6 +130,7 @@
 
 cextern pw_512
 cextern pw_2000
+cextern pd_ffff
 
 %macro FILTER_H4_w2_2 3
     movh        %2, [srcq - 1]
@@ -5033,7 +5034,7 @@
 ;-----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_LUMA_SS 2
 INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 8 ,0-1
 
     add        r1d, r1d
     add        r3d, r3d
@@ -5047,6 +5048,7 @@
 %else
     lea        r6, [tab_LumaCoeffV + r4]
 %endif
+    mova       m7, [pd_ffff]
 
     mov        byte [rsp], %2/4
 .loopH
@@ -5097,6 +5099,7 @@
     pmaddwd    m4, [r6 + 3 * 16]
     paddd      m0, m4                          ;m0=[0+1+2+3+4+5+6+7]  Row1 end
     psrad      m0, 6
+    pand       m0, m7
 
     movq       m4, [r0 + 2 * r1]
     punpcklwd  m5, m4                          ;m5=[7 8]
@@ -5105,8 +5108,9 @@
     pmaddwd    m5, [r6 + 3 * 16]
     paddd      m1, m5                          ;m1=[1+2+3+4+5+6+7+8]  Row2 end
     psrad      m1, 6
-
-    packssdw   m0, m1
+    pand       m1, m7
+
+    packusdw   m0, m1
 
     movlps     [r2], m0
     movhps     [r2 + r3], m0
@@ -5117,14 +5121,16 @@
     pmaddwd    m4, [r6 + 3 * 16]
     paddd      m2, m4                          ;m2=[2+3+4+5+6+7+8+9]  Row3 end
     psrad      m2, 6
+    pand       m2, m7
 
     movq       m4, [r0 + 2 * r1]
     punpcklwd  m5, m4                          ;m5=[9 10]
     pmaddwd    m5, [r6 + 3 * 16]
     paddd      m3, m5                          ;m3=[3+4+5+6+7+8+9+10]  Row4 end
     psrad      m3, 6
-
-    packssdw   m2, m3
+    pand       m3, m7
+
+    packusdw   m2, m3
 
     movlps     [r2 + 2 * r3], m2
     lea        r5, [r3 + 2 * r3]


More information about the x265-devel mailing list