[x265] [PATCH] asm: fix the alignment issues occurred in sse_ss

yuvaraj at multicorewareinc.com
Wed Nov 27 06:41:10 CET 2013


# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1385530851 -19800
#      Wed Nov 27 11:10:51 2013 +0530
# Node ID ef02ad58dd763924012710f1ff9091ab399c93b1
# Parent  b09b6fa7e89a6971c7dfa57c1e539f1836f9fcf9
asm: fix the alignment issues occurred in sse_ss
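
For reference, a minimal C++ sketch (not part of the patch) of what pixel_ssd_ss computes and how the rewritten kernel arrives at it: rows are loaded with movu, which places no alignment requirement on the int16 buffers, the two blocks are subtracted word-wise (psubw), and pmaddwd squares each difference and sums adjacent pairs into 32-bit lanes that are accumulated and reduced horizontally at the end. The function and parameter names below are illustrative only, not the x265 API, and the SIMD version assumes the width is a multiple of 8 and that each per-sample difference fits in 16 bits, which is what the psubw/pmaddwd sequence requires.

#include <emmintrin.h>  // SSE2: _mm_loadu_si128, _mm_sub_epi16, _mm_madd_epi16, _mm_add_epi32
#include <stdint.h>

// Scalar reference: sum of squared differences over a width x height block
// of int16 samples, strides given in elements (hypothetical helper, not x265 API).
static int ssd_ss_ref(const int16_t* a, intptr_t strideA,
                      const int16_t* b, intptr_t strideB,
                      int width, int height)
{
    int sum = 0;
    for (int y = 0; y < height; y++, a += strideA, b += strideB)
        for (int x = 0; x < width; x++)
        {
            int d = a[x] - b[x];
            sum += d * d;
        }
    return sum;
}

// SSE2 sketch mirroring the new inner loop (movu / psubw / pmaddwd / paddd);
// assumes width is a multiple of 8 and each difference fits in int16.
static int ssd_ss_sse2_sketch(const int16_t* a, intptr_t strideA,
                              const int16_t* b, intptr_t strideB,
                              int width, int height)
{
    __m128i acc = _mm_setzero_si128();
    for (int y = 0; y < height; y++, a += strideA, b += strideB)
        for (int x = 0; x < width; x += 8)
        {
            __m128i va = _mm_loadu_si128((const __m128i*)(a + x)); // movu: no 16-byte alignment needed
            __m128i vb = _mm_loadu_si128((const __m128i*)(b + x));
            __m128i d  = _mm_sub_epi16(va, vb);                    // psubw
            acc = _mm_add_epi32(acc, _mm_madd_epi16(d, d));        // pmaddwd + paddd
        }
    // Horizontal reduction of the four 32-bit partial sums (phaddd + movd in the asm).
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, acc);
    return tmp[0] + tmp[1] + tmp[2] + tmp[3];
}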

diff -r b09b6fa7e89a -r ef02ad58dd76 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Nov 26 12:24:24 2013 -0600
+++ b/source/common/x86/asm-primitives.cpp	Wed Nov 27 11:10:51 2013 +0530
@@ -496,7 +496,6 @@
         p.sad[LUMA_12x16] = x265_pixel_sad_12x16_sse2;
 
         ASSGN_SSE(sse2);
-        ASSGN_SSE_SS(sse2);
         INIT2(sad, _sse2);
         INIT2(sad_x3, _sse2);
         INIT2(sad_x4, _sse2);
diff -r b09b6fa7e89a -r ef02ad58dd76 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm	Tue Nov 26 12:24:24 2013 -0600
+++ b/source/common/x86/pixel-a.asm	Wed Nov 27 11:10:51 2013 +0530
@@ -202,7 +202,7 @@
 %macro SSD_SS 2
 cglobal pixel_ssd_ss_%1x%2, 4,7,6
     FIX_STRIDES r1, r3
-%if mmsize == %1*4
+%if mmsize == %1*4 || mmsize == %1*2
     %define offset0_1 r1*2
     %define offset0_2 r1*4
     %define offset0_3 r5
@@ -213,20 +213,13 @@
     lea     r6, [4*r3]
     lea     r5, [r5 + 2*r1]
     lea     r6, [r6 + 2*r3]
-%elif mmsize == %1*2
-    %define offset0_1 8
+%elif mmsize == %1
+    %define offset0_1 16
     %define offset0_2 r1*2
-    %define offset0_3 r1*2+8
-    %define offset1_1 8
+    %define offset0_3 r1*2+16
+    %define offset1_1 16
     %define offset1_2 r3*2
-    %define offset1_3 r3*2+8
-%elif mmsize == %1
-    %define offset0_1 8
-    %define offset0_2 16
-    %define offset0_3 24
-    %define offset1_1 8
-    %define offset1_2 16
-    %define offset1_3 24
+    %define offset1_3 r3*2+16
 %endif
 %if %1 == 4
     %assign %%n %2/(mmsize/%1)
@@ -238,64 +231,61 @@
 %endif
     pxor    m0, m0
 .loop
-    pmovsxwd  m1, [r0]
-    pmovsxwd  m2, [r2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + offset0_1]
-    pmovsxwd  m2, [r2 + offset1_1]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + offset0_2]
-    pmovsxwd  m2, [r2 + offset1_2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + offset0_3]
-    pmovsxwd  m2, [r2 + offset1_3]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-%if %1 > 4
-    %assign %%m 4/(%1/8)
-    lea       r0, [r0+r1*%%m]
-    lea       r2, [r2+r3*%%m]
-    pmovsxwd  m1, [r0]
-    pmovsxwd  m2, [r2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + offset0_1]
-    pmovsxwd  m2, [r2 + offset1_1]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + offset0_2]
-    pmovsxwd  m2, [r2 + offset1_2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + offset0_3]
-    pmovsxwd  m2, [r2 + offset1_3]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-%endif
 %if %1 == 4
+    movh    m1, [r0]
+    movh    m2, [r2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movh    m1, [r0 + offset0_1]
+    movh    m2, [r2 + offset1_1]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movh    m1, [r0 + offset0_2]
+    movh    m2, [r2 + offset1_2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movh    m1, [r0 + offset0_3]
+    movh    m2, [r2 + offset1_3]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+%else
+    movu    m1, [r0]
+    movu    m2, [r2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + offset0_1]
+    movu    m2, [r2 + offset1_1]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + offset0_2]
+    movu    m2, [r2 + offset1_2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + offset0_3]
+    movu    m2, [r2 + offset1_3]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+%endif
     lea       r0, [r0+r1*(%2/%%n)*2]
     lea       r2, [r2+r3*(%2/%%n)*2]
-%else
-    lea       r0, [r0+r1*(%2/%%n)]
-    lea       r2, [r2+r3*(%2/%%n)]
-%endif
 %if %%n > 1
     dec    r4d
     jg .loop
 %endif
+%if %1 == 4
     phaddd    m0, m0
+%else
     phaddd    m0, m0
+    phaddd    m0, m0
+%endif
     movd     eax, m0
     RET
 %endmacro
@@ -321,38 +311,32 @@
     mov    r4d, 8
     pxor    m0, m0
 .loop
-    pmovsxwd  m1, [r0]
-    pmovsxwd  m2, [r2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 8]
-    pmovsxwd  m2, [r2 + 8]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 16]
-    pmovsxwd  m2, [r2 + 16]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
+    movu    m1, [r0]
+    movu    m2, [r2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 16]
+    movu    m2, [r2 + 16]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    pslldq  m1, 8
+    psrldq  m1, 8
+    paddd   m0, m1
     lea       r0, [r0 + 2*r1]
     lea       r2, [r2 + 2*r3]
-    pmovsxwd  m1, [r0]
-    pmovsxwd  m2, [r2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 8]
-    pmovsxwd  m2, [r2 + 8]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 16]
-    pmovsxwd  m2, [r2 + 16]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
+    movu    m1, [r0]
+    movu    m2, [r2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 16]
+    movu    m2, [r2 + 16]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    pslldq  m1, 8
+    psrldq  m1, 8
+    paddd   m0, m1
     lea       r0, [r0 + 2*r1]
     lea       r2, [r2 + 2*r3]
     dec      r4d
@@ -369,88 +353,48 @@
     mov    r4d, %1/2
     pxor    m0, m0
 .loop
-    pmovsxwd  m1, [r0]
-    pmovsxwd  m2, [r2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 8]
-    pmovsxwd  m2, [r2 + 8]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 16]
-    pmovsxwd  m2, [r2 + 16]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 24]
-    pmovsxwd  m2, [r2 + 24]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 32]
-    pmovsxwd  m2, [r2 + 32]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 40]
-    pmovsxwd  m2, [r2 + 40]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 48]
-    pmovsxwd  m2, [r2 + 48]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 56]
-    pmovsxwd  m2, [r2 + 56]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
+    movu    m1, [r0]
+    movu    m2, [r2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 16]
+    movu    m2, [r2 + 16]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 32]
+    movu    m2, [r2 + 32]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 48]
+    movu    m2, [r2 + 48]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
     lea       r0, [r0 + 2*r1]
     lea       r2, [r2 + 2*r3]
-    pmovsxwd  m1, [r0]
-    pmovsxwd  m2, [r2]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 8]
-    pmovsxwd  m2, [r2 + 8]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 16]
-    pmovsxwd  m2, [r2 + 16]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 24]
-    pmovsxwd  m2, [r2 + 24]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 32]
-    pmovsxwd  m2, [r2 + 32]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 40]
-    pmovsxwd  m2, [r2 + 40]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 48]
-    pmovsxwd  m2, [r2 + 48]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
-    pmovsxwd  m1, [r0 + 56]
-    pmovsxwd  m2, [r2 + 56]
-    psubd     m1, m2
-    pmulld    m1, m1
-    paddd     m0, m1
+    movu    m1, [r0]
+    movu    m2, [r2]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 16]
+    movu    m2, [r2 + 16]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 32]
+    movu    m2, [r2 + 32]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
+    movu    m1, [r0 + 48]
+    movu    m2, [r2 + 48]
+    psubw   m1, m2
+    pmaddwd m1, m1
+    paddd   m0, m1
     lea       r0, [r0 + 2*r1]
     lea       r2, [r2 + 2*r3]
     dec      r4d

