[x265] [PATCH] asm: Enable and fix ssd_s for main10 and main12, sse2 and avx2
ramya at multicorewareinc.com
ramya at multicorewareinc.com
Mon Oct 26 11:18:20 CET 2015
# HG changeset patch
# User Ramya Sriraman <ramya at multicorewareinc.com>
# Date 1445851853 -19800
# Mon Oct 26 15:00:53 2015 +0530
# Node ID 34dee101f2e2956b156f9596ca91517a16d2460b
# Parent 6563218ce342c30bfd4f9bc172a1dab510e6e55b
asm: Enable and fix ssd_s for main10 and main12, sse2 and avx2
diff -r 6563218ce342 -r 34dee101f2e2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Oct 26 12:13:53 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Oct 26 15:00:53 2015 +0530
@@ -956,7 +956,7 @@
ALL_LUMA_TU_S(cpy1Dto2D_shl, cpy1Dto2D_shl_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shr, cpy2Dto1D_shr_, sse2);
ALL_LUMA_TU_S(cpy2Dto1D_shl, cpy2Dto1D_shl_, sse2);
- // ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
+ ALL_LUMA_TU_S(ssd_s, pixel_ssd_s_, sse2);
ALL_LUMA_TU_S(calcresidual, getResidual, sse2);
ALL_LUMA_TU_S(transpose, transpose, sse2);
@@ -1534,8 +1534,8 @@
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_avx2);
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_avx2);
- //p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
- //p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
+ p.cu[BLOCK_16x16].ssd_s = PFX(pixel_ssd_s_16_avx2);
+ p.cu[BLOCK_32x32].ssd_s = PFX(pixel_ssd_s_32_avx2);
p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
diff -r 6563218ce342 -r 34dee101f2e2 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Mon Oct 26 12:13:53 2015 +0530
+++ b/source/common/x86/ssd-a.asm Mon Oct 26 15:00:53 2015 +0530
@@ -3105,9 +3105,20 @@
dec r2d
jnz .loop
+%if BIT_DEPTH >= 10
+ movu m1, m0
+ pxor m2, m2
+ punpckldq m0, m2
+ punpckhdq m1, m2
+ paddq m0, m1
+ movhlps m1, m0
+ paddq m0, m1
+ movq rax, xm0
+%else
; calculate sum and return
HADDD m0, m1
movd eax, m0
+%endif
RET
INIT_YMM avx2
@@ -3179,8 +3190,20 @@
dec r2d
jnz .loop
-
+%if BIT_DEPTH >= 10
+ movu m1, m0
+ pxor m2, m2
+ punpckldq m0, m2
+ punpckhdq m1, m2
+ paddq m0, m1
+ vextracti128 xm2, m0, 1
+ paddq xm2, xm0
+ movhlps xm1, xm2
+ paddq xm2, xm1
+ movq rax, xm2
+%else
; calculate sum and return
HADDD m0, m1
movd eax, xm0
+%endif
RET
More information about the x265-devel mailing list