[x265] [PATCH] asm : saturation bug fix for chroma_vss asm routine
nabajit at multicorewareinc.com
nabajit at multicorewareinc.com
Wed Jan 29 08:28:01 CET 2014
# HG changeset patch
# User Nabajit Deka
# Date 1390980467 -19800
# Wed Jan 29 12:57:47 2014 +0530
# Node ID ba8c31037a655ae55e53cee753677f78d56df397
# Parent a03f9fbd6af6d793af9054c85ee7d281fe447af8
asm : saturation bug fix for chroma_vss asm routine.
diff -r a03f9fbd6af6 -r ba8c31037a65 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm Wed Jan 29 12:02:12 2014 +0530
+++ b/source/common/x86/ipfilter8.asm Wed Jan 29 12:57:47 2014 +0530
@@ -4753,7 +4753,7 @@
;-----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS 2
INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-1
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
add r1d, r1d
add r3d, r3d
@@ -4766,24 +4766,29 @@
%else
lea r6, [tab_ChromaCoeffV + r4]
%endif
+ mova m6, [pd_ffff]
mov byte [rsp], %2/4
-
.loopH
mov r4d, (%1/4)
.loopW
PROCESS_CHROMA_SP_W4_4R
psrad m0, 6
+ pand m0, m6
psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-
- packssdw m0, m1
- packssdw m2, m3
-
+ pand m1, m6
+
+ packusdw m0, m1
movlps [r2], m0
movhps [r2 + r3], m0
+
+ psrad m2, 6
+ pand m2, m6
+ psrad m3, 6
+ pand m3, m6
+
+ packusdw m2, m3
movlps [r2 + 2 * r3], m2
lea r5, [r3 + 2 * r3]
movhps [r2 + r5], m2
@@ -4824,7 +4829,7 @@
;---------------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS_W2_4R 2
INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 5
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6
add r1d, r1d
add r3d, r3d
@@ -4837,21 +4842,24 @@
%else
lea r6, [tab_ChromaCoeffV + r4]
%endif
+ mova m5, [pd_ffff]
mov r4d, (%2/4)
-
.loopH
PROCESS_CHROMA_SP_W2_4R
psrad m0, 6
- psrad m2, 6
-
- packssdw m0, m0
- packssdw m2, m2
+ pand m0, m5
+ packusdw m0, m0
movd [r2], m0
pshufd m0, m0, 1
movd [r2 + r3], m0
+
+ psrad m2, 6
+ pand m2, m5
+ packusdw m2, m2
+
lea r2, [r2 + 2 * r3]
movd [r2], m2
pshufd m2, m2, 1
@@ -4872,7 +4880,7 @@
; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;---------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
+cglobal interp_4tap_vert_ss_4x2, 5, 6, 5
add r1d, r1d
add r3d, r3d
@@ -4885,6 +4893,7 @@
%else
lea r5, [tab_ChromaCoeffV + r4]
%endif
+ mova m4, [pd_ffff]
movq m0, [r0]
movq m1, [r0 + r1]
@@ -4901,15 +4910,16 @@
pmaddwd m2, [r5 + 1 * 16]
paddd m0, m2 ;m0=[0+1+2+3] Row1 done
psrad m0, 6
+ pand m0, m4
movq m2, [r0 + 2 * r1]
punpcklwd m3, m2 ;m5=[3 4]
pmaddwd m3, [r5 + 1 * 16]
paddd m1, m3 ;m1=[1+2+3+4] Row2 done
psrad m1, 6
-
- packssdw m0, m1
-
+ pand m1, m4
+
+ packusdw m0, m1
movlps [r2], m0
movhps [r2 + r3], m0
@@ -4919,7 +4929,7 @@
; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
;-------------------------------------------------------------------------------------------------------------------
INIT_XMM sse2
-cglobal interp_4tap_vert_ss_6x8, 5, 7, 6
+cglobal interp_4tap_vert_ss_6x8, 5, 7, 7
add r1d, r1d
add r3d, r3d
@@ -4932,6 +4942,7 @@
%else
lea r6, [tab_ChromaCoeffV + r4]
%endif
+ mova m6, [pd_ffff]
mov r4d, 8/4
@@ -4939,15 +4950,20 @@
PROCESS_CHROMA_SP_W4_4R
psrad m0, 6
+ pand m0, m6
psrad m1, 6
- psrad m2, 6
- psrad m3, 6
-
- packssdw m0, m1
- packssdw m2, m3
-
+ pand m1, m6
+
+ packusdw m0, m1
movlps [r2], m0
movhps [r2 + r3], m0
+
+ psrad m2, 6
+ pand m2, m6
+ psrad m3, 6
+ pand m3, m6
+
+ packusdw m2, m3
movlps [r2 + 2 * r3], m2
lea r5, [r3 + 2 * r3]
movhps [r2 + r5], m2
@@ -4959,14 +4975,17 @@
PROCESS_CHROMA_SP_W2_4R
psrad m0, 6
- psrad m2, 6
-
- packssdw m0, m0
- packssdw m2, m2
+ pand m0, m6
+ packusdw m0, m0
movd [r2], m0
pshufd m0, m0, 1
movd [r2 + r3], m0
+
+ psrad m2, 6
+ pand m2, m6
+ packusdw m2, m2
+
lea r2, [r2 + 2 * r3]
movd [r2], m2
pshufd m2, m2, 1
@@ -4985,7 +5004,7 @@
;----------------------------------------------------------------------------------------------------------------
%macro FILTER_VER_CHROMA_SS_W8_H2 2
INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 8
add r1d, r1d
add r3d, r3d
@@ -4998,20 +5017,26 @@
%else
lea r5, [tab_ChromaCoeffV + r4]
%endif
+ mova m7, [pd_ffff]
mov r4d, %2/2
.loopH
PROCESS_CHROMA_SP_W8_2R
psrad m0, 6
+ pand m0, m7
psrad m1, 6
+ pand m1, m7
+
+ packusdw m0, m1
+ movu [r2], m0
+
psrad m2, 6
+ pand m2, m7
psrad m3, 6
-
- packssdw m0, m1
- packssdw m2, m3
-
- movu [r2], m0
+ pand m3, m7
+
+ packusdw m2, m3
movu [r2 + r3], m2
lea r2, [r2 + 2 * r3]
More information about the x265-devel mailing list