[x265] [PATCH] asm : saturation bug fix for chroma_vss asm routine

Nabajit Deka <nabajit at multicorewareinc.com>
Wed Jan 29 08:28:01 CET 2014


# HG changeset patch
# User Nabajit Deka
# Date 1390980467 -19800
#      Wed Jan 29 12:57:47 2014 +0530
# Node ID ba8c31037a655ae55e53cee753677f78d56df397
# Parent  a03f9fbd6af6d793af9054c85ee7d281fe447af8
asm: fix saturation bug in the chroma_vss asm routines (mask results to 16 bits and pack with packusdw instead of packssdw).

diff -r a03f9fbd6af6 -r ba8c31037a65 source/common/x86/ipfilter8.asm
--- a/source/common/x86/ipfilter8.asm	Wed Jan 29 12:02:12 2014 +0530
+++ b/source/common/x86/ipfilter8.asm	Wed Jan 29 12:57:47 2014 +0530
@@ -4753,7 +4753,7 @@
 ;-----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_CHROMA_SS 2
 INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6 ,0-1
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 7 ,0-1
 
     add       r1d, r1d
     add       r3d, r3d
@@ -4766,24 +4766,29 @@
 %else
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
+    mova      m6, [pd_ffff]
 
     mov       byte [rsp], %2/4
-
 .loopH
     mov       r4d, (%1/4)
 .loopW
     PROCESS_CHROMA_SP_W4_4R
 
     psrad     m0, 6
+    pand      m0, m6
     psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-
-    packssdw  m0, m1
-    packssdw  m2, m3
-
+    pand      m1, m6
+
+    packusdw  m0, m1
     movlps    [r2], m0
     movhps    [r2 + r3], m0
+
+    psrad     m2, 6
+    pand      m2, m6
+    psrad     m3, 6
+    pand      m3, m6
+
+    packusdw  m2, m3
     movlps    [r2 + 2 * r3], m2
     lea       r5, [r3 + 2 * r3]
     movhps    [r2 + r5], m2
@@ -4824,7 +4829,7 @@
 ;---------------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_CHROMA_SS_W2_4R 2
 INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 5
+cglobal interp_4tap_vert_ss_%1x%2, 5, 7, 6
 
     add       r1d, r1d
     add       r3d, r3d
@@ -4837,21 +4842,24 @@
 %else
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
+    mova      m5, [pd_ffff]
 
     mov       r4d, (%2/4)
-
 .loopH
     PROCESS_CHROMA_SP_W2_4R
 
     psrad     m0, 6
-    psrad     m2, 6
-
-    packssdw  m0, m0
-    packssdw  m2, m2
+    pand      m0, m5
+    packusdw  m0, m0
 
     movd      [r2], m0
     pshufd    m0, m0, 1
     movd      [r2 + r3], m0
+
+    psrad     m2, 6
+    pand      m2, m5
+    packusdw  m2, m2
+
     lea       r2, [r2 + 2 * r3]
     movd      [r2], m2
     pshufd    m2, m2, 1
@@ -4872,7 +4880,7 @@
 ; void interp_4tap_vert_ss_4x2(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;---------------------------------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal interp_4tap_vert_ss_4x2, 5, 6, 4
+cglobal interp_4tap_vert_ss_4x2, 5, 6, 5
 
     add        r1d, r1d
     add        r3d, r3d
@@ -4885,6 +4893,7 @@
 %else
     lea        r5, [tab_ChromaCoeffV + r4]
 %endif
+    mova       m4, [pd_ffff]
 
     movq       m0, [r0]
     movq       m1, [r0 + r1]
@@ -4901,15 +4910,16 @@
     pmaddwd    m2, [r5 + 1 * 16]
     paddd      m0, m2                          ;m0=[0+1+2+3]  Row1 done
     psrad      m0, 6
+    pand       m0, m4
 
     movq       m2, [r0 + 2 * r1]
     punpcklwd  m3, m2                          ;m5=[3 4]
     pmaddwd    m3, [r5 + 1 * 16]
     paddd      m1, m3                          ;m1=[1+2+3+4]  Row2 done
     psrad      m1, 6
-
-    packssdw   m0, m1
-
+    pand       m1, m4
+
+    packusdw   m0, m1
     movlps     [r2], m0
     movhps     [r2 + r3], m0
 
@@ -4919,7 +4929,7 @@
 ; void interp_4tap_vertical_ss_6x8(int16_t *src, intptr_t srcStride, int16_t *dst, intptr_t dstStride, int coeffIdx)
 ;-------------------------------------------------------------------------------------------------------------------
 INIT_XMM sse2
-cglobal interp_4tap_vert_ss_6x8, 5, 7, 6
+cglobal interp_4tap_vert_ss_6x8, 5, 7, 7
 
     add       r1d, r1d
     add       r3d, r3d
@@ -4932,6 +4942,7 @@
 %else
     lea       r6, [tab_ChromaCoeffV + r4]
 %endif
+    mova      m6, [pd_ffff]
 
     mov       r4d, 8/4
 
@@ -4939,15 +4950,20 @@
     PROCESS_CHROMA_SP_W4_4R
 
     psrad     m0, 6
+    pand      m0, m6
     psrad     m1, 6
-    psrad     m2, 6
-    psrad     m3, 6
-
-    packssdw  m0, m1
-    packssdw  m2, m3
-
+    pand      m1, m6
+
+    packusdw  m0, m1
     movlps    [r2], m0
     movhps    [r2 + r3], m0
+
+    psrad     m2, 6
+    pand      m2, m6
+    psrad     m3, 6
+    pand      m3, m6
+
+    packusdw  m2, m3
     movlps    [r2 + 2 * r3], m2
     lea       r5, [r3 + 2 * r3]
     movhps    [r2 + r5], m2
@@ -4959,14 +4975,17 @@
     PROCESS_CHROMA_SP_W2_4R
 
     psrad     m0, 6
-    psrad     m2, 6
-
-    packssdw  m0, m0
-    packssdw  m2, m2
+    pand      m0, m6
+    packusdw  m0, m0
 
     movd      [r2], m0
     pshufd    m0, m0, 1
     movd      [r2 + r3], m0
+
+    psrad     m2, 6
+    pand      m2, m6
+    packusdw  m2, m2
+
     lea       r2, [r2 + 2 * r3]
     movd      [r2], m2
     pshufd    m2, m2, 1
@@ -4985,7 +5004,7 @@
 ;----------------------------------------------------------------------------------------------------------------
 %macro FILTER_VER_CHROMA_SS_W8_H2 2
 INIT_XMM sse2
-cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 7
+cglobal interp_4tap_vert_ss_%1x%2, 5, 6, 8
 
     add       r1d, r1d
     add       r3d, r3d
@@ -4998,20 +5017,26 @@
 %else
     lea       r5, [tab_ChromaCoeffV + r4]
 %endif
+    mova      m7, [pd_ffff]
 
     mov       r4d, %2/2
 .loopH
     PROCESS_CHROMA_SP_W8_2R
 
     psrad     m0, 6
+    pand      m0, m7
     psrad     m1, 6
+    pand      m1, m7
+
+    packusdw  m0, m1
+    movu      [r2], m0
+
     psrad     m2, 6
+    pand      m2, m7
     psrad     m3, 6
-
-    packssdw  m0, m1
-    packssdw  m2, m3
-
-    movu      [r2], m0
+    pand      m3, m7
+
+    packusdw  m2, m3
     movu      [r2 + r3], m2
 
     lea       r2, [r2 + 2 * r3]


More information about the x265-devel mailing list