[x265] [PATCH] asm: Enable and fix ssd_s for main10 and main12, sse2 and avx2

Ramya Sriraman <ramya at multicorewareinc.com>
Mon Oct 26 11:18:20 CET 2015


# HG changeset patch
# User Ramya Sriraman <ramya at multicorewareinc.com>
# Date 1445851853 -19800
#      Mon Oct 26 15:00:53 2015 +0530
# Node ID 34dee101f2e2956b156f9596ca91517a16d2460b
# Parent  6563218ce342c30bfd4f9bc172a1dab510e6e55b
asm: Enable and fix ssd_s for main10 and main12, sse2 and avx2

diff -r 6563218ce342 -r 34dee101f2e2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Mon Oct 26 12:13:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Mon Oct 26 15:00:53 2015 +0530
@@ -956,7 +956,7 @@
         ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
         ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
-       // ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+        ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
         ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
         ALL_LUMA_TU_S(transpose, transpose, sse2);
 
@@ -1534,8 +1534,8 @@
         p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
         p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
 
-        //p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
-        //p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
+        p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
+        p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
 
         p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
         p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
diff -r 6563218ce342 -r 34dee101f2e2 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm	Mon Oct 26 12:13:53 2015 +0530
+++ b/source/common/x86/ssd-a.asm	Mon Oct 26 15:00:53 2015 +0530
@@ -3105,9 +3105,20 @@
     dec     r2d
     jnz    .loop
 
+%if BIT_DEPTH >= 10
+    movu            m1, m0
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    movhlps         m1, m0
+    paddq           m0, m1
+    movq            rax, xm0
+%else
     ; calculate sum and return
     HADDD   m0, m1
     movd    eax, m0
+%endif
     RET
 
 INIT_YMM avx2
@@ -3179,8 +3190,20 @@
 
     dec     r2d
     jnz    .loop
-
+%if BIT_DEPTH >= 10
+    movu            m1, m0
+    pxor            m2, m2
+    punpckldq       m0, m2
+    punpckhdq       m1, m2
+    paddq           m0, m1
+    vextracti128    xm2, m0, 1
+    paddq           xm2, xm0
+    movhlps         xm1, xm2
+    paddq           xm2, xm1
+    movq            rax, xm2
+%else
     ; calculate sum and return
     HADDD   m0, m1
     movd    eax, xm0
+%endif
     RET


More information about the x265-devel mailing list