[x265] [PATCH] asm: Fix sse_pp [16x32] & [32x64] main12 AVX2

Ramya Sriraman ramya at multicorewareinc.com
Mon Oct 19 13:25:06 CEST 2015


# HG changeset patch
# User Ramya Sriraman <ramya at multicorewareinc.com>
# Date 1445250664 -19800
#      Mon Oct 19 16:01:04 2015 +0530
# Node ID 80864f2e4ff7b968cca457aac41e73c3fd203ead
# Parent  04575a459a160162391fcf1a12e8e6f2e81e95b4
asm: Fix sse_pp [16x32] & [32x64] main12 AVX2

diff -r 04575a459a16 -r 80864f2e4ff7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Oct 19 16:01:04 2015 +0530
@@ -1540,15 +1540,10 @@
         p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
         p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
-
-#if X265_DEPTH <= 10
-
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
-        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
+        p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
         p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
-#endif
-
         p.quant = PFX(quant_avx2);
         p.nquant = PFX(nquant_avx2);
         p.dequant_normal  = PFX(dequant_normal_avx2);
diff -r 04575a459a16 -r 80864f2e4ff7 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/ssd-a.asm	Mon Oct 19 16:01:04 2015 +0530
@@ -107,16 +107,28 @@
     dec    r4d
     jg .loop
 %endif
-
-%if BIT_DEPTH == 12 && mmsize == 16
-    movu        m5, m0
-    pxor        m6, m6
-    punpckldq   m0, m6
-    punpckhdq   m5, m6
-    paddq       m0, m5
-    movhlps     m5, m0
-    paddq       m0, m5
-    movq        r6, xm0
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
+%if  mmsize == 16
+    movu            m5, m0
+    pxor            m6, m6
+    punpckldq       m0, m6
+    punpckhdq       m5, m6
+    paddq           m0, m5
+    movhlps         m5, m0
+    paddq           m0, m5
+    movq            r6, xm0
+%elif mmsize == 32
+    movu            m1, m0
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    vextracti128    xm2, m0, 1
+    paddq           xm2, xm0
+    movhlps         xm1, xm2
+    paddq           xm2, xm1
+    movq            rax, xm2
+%endif
 %else 
     HADDD   m0, m5
     movd    eax,xm0


More information about the x265-devel mailing list