[x265] [PATCH] asm : saturation bug fix for luma_vss asm routine
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Wed Jan 29 07:32:26 CET 2014
# HG changeset patch
# User Nabajit Deka
# Date 1390977132 -19800
# Wed Jan 29 12:02:12 2014 +0530
# Node ID a03f9fbd6af6d793af9054c85ee7d281fe447af8
# Parent 8552e8cc1a3c60ddcab85e7421229c9a86d4785f
asm: fix saturation bug in the luma_vss asm routine (mask each 32-bit result with pd_ffff and pack with packusdw instead of packssdw).
diff -r 8552e8cc1a3c -r a03f9fbd6af6 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Tue Jan 28 08:49:01 2014 -0600
+++ b/source/common/x86/ipfilter8.asm Wed Jan 29 12:02:12 2014 +0530
@@ -130,6 +130,7 @@
cextern pw_512
cextern pw_2000
+cextern pd_ffff
%macro FILTER_H4_w2_2 3
movh %2, [srcq - 1]
@@ -5033,7 +5034,7 @@
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_LUMA_SS 2
INIT_XMM sse2
-cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
+cglobal interp_8tap_vert_ss_%1x%2, 5, 7, 8 ,0-1
add r1d, r1d
add r3d, r3d
@@ -5047,6 +5048,7 @@
%else
lea r6, [tab_LumaCoeffV + r4]
%endif
+ mova m7, [pd_ffff]
mov byte [rsp], %2/4
.loopH
@@ -5097,6 +5099,7 @@
pmaddwd m4, [r6 + 3 * 16]
paddd m0, m4 ;m0=[0+1+2+3+4+5+6+7] Row1 end
psrad m0, 6
+ pand m0, m7
movq m4, [r0 + 2 * r1]
punpcklwd m5, m4 ;m5=[7 8]
@@ -5105,8 +5108,9 @@
pmaddwd m5, [r6 + 3 * 16]
paddd m1, m5 ;m1=[1+2+3+4+5+6+7+8] Row2 end
psrad m1, 6
-
- packssdw m0, m1
+ pand m1, m7
+
+ packusdw m0, m1
movlps [r2], m0
movhps [r2 + r3], m0
@@ -5117,14 +5121,16 @@
pmaddwd m4, [r6 + 3 * 16]
paddd m2, m4 ;m2=[2+3+4+5+6+7+8+9] Row3 end
psrad m2, 6
+ pand m2, m7
movq m4, [r0 + 2 * r1]
punpcklwd m5, m4 ;m5=[9 10]
pmaddwd m5, [r6 + 3 * 16]
paddd m3, m5 ;m3=[3+4+5+6+7+8+9+10] Row4 end
psrad m3, 6
-
- packssdw m2, m3
+ pand m3, m7
+
+ packusdw m2, m3
movlps [r2 + 2 * r3], m2
lea r5, [r3 + 2 * r3]
More information about the x265-devel mailing list