[x265] [PATCH 1 of 2] asm: fix sse_pp[16x32] sse2 asm for 12-bit

Ramya Sriraman <ramya at multicorewareinc.com>
Thu Sep 3 10:22:52 CEST 2015


# HG changeset patch
# User Ramya Sriraman <ramya at multicorewareinc.com>
# Date 1440588985 -19800
#      Wed Aug 26 17:06:25 2015 +0530
# Node ID 83dc8aea6ba7c10e0d78ec7dc34b3d8f7d114563
# Parent  d8091487bc9749e702c468786b0cd9e663478a91
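asm: fix sse_pp[16x32] sse2 asm for 12-bit

For BIT_DEPTH == 12 with 16-byte (SSE2) registers, the final horizontal add in
ssd-a.asm is now done in 64 bits: the 32-bit partial sums in m0 are zero-extended
to qwords (punpckldq/punpckhdq against a zeroed register), added with paddq, and
the result is read with movq instead of HADDD followed by movd eax. With 12-bit
pixels a 16x32 block holds 512 squared differences of up to 4095^2 each, so the
total (up to 8585740800) no longer fits in 32 bits.

The I422 chroma sse_pp assignments for 4x8, 8x16 and 16x32 are moved above the
"#if X265_DEPTH <= 10" block so they no longer depend on that check; the 32x64
assignment stays inside it.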

diff -r d8091487bc97 -r 83dc8aea6ba7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Tue Aug 25 16:39:12 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp	Wed Aug 26 17:06:25 2015 +0530
@@ -998,13 +998,13 @@
         p.cu[BLOCK_4x4].intra_pred[32] = PFX(intra_pred_ang4_32_sse2);
         p.cu[BLOCK_4x4].intra_pred[33] = PFX(intra_pred_ang4_33_sse2);
 
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
+        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
+
 #if X265_DEPTH <= 10
         p.cu[BLOCK_4x4].sse_ss = PFX(pixel_ssd_ss_4x4_mmx2);
         ALL_LUMA_CU(sse_ss, pixel_ssd_ss, sse2);
-
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_4x8_mmx2);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_8x16_sse2);
-        p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_sse2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_sse2);
 #endif
 
diff -r d8091487bc97 -r 83dc8aea6ba7 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Tue Aug 25 16:39:12 2015 -0700
+++ b/source/common/x86/ssd-a.asm	Wed Aug 26 17:06:25 2015 +0530
@@ -105,8 +105,20 @@
     dec    r4d
     jg .loop
 %endif
+
+%if BIT_DEPTH == 12 && mmsize == 16
+    movu        m5, m0
+    pxor        m6, m6
+    punpckldq   m0, m6
+    punpckhdq   m5, m6
+    paddq       m0, m5
+    movhlps     m5, m0
+    paddq       m0, m5
+    movq        r6, xm0
+%else 
     HADDD   m0, m5
-    movd   eax, xm0
+    movd    eax,xm0
+%endif
 %ifidn movu,movq ; detect MMX
     EMMS
 %endif


More information about the x265-devel mailing list