[x265] [PATCH] asm: Fix sse_pp [16x32] & [32x64] main12 AVX2
ramya at multicorewareinc.com
ramya at multicorewareinc.com
Mon Oct 19 13:25:06 CEST 2015
# HG changeset patch
# User Ramya Sriraman <ramya at multicorewareinc.com>
# Date 1445250664 -19800
# Mon Oct 19 16:01:04 2015 +0530
# Node ID 80864f2e4ff7b968cca457aac41e73c3fd203ead
# Parent 04575a459a160162391fcf1a12e8e6f2e81e95b4
asm: Fix sse_pp [16x32] & [32x64] main12 AVX2
diff -r 04575a459a16 -r 80864f2e4ff7 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Oct 19 16:01:04 2015 +0530
@@ -1540,15 +1540,10 @@
p.cu[BLOCK_16x16].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_16x16_avx2);
p.cu[BLOCK_32x32].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_32x32_avx2);
p.cu[BLOCK_64x64].sse_ss = (pixel_sse_ss_t)PFX(pixel_ssd_64x64_avx2);
-
-#if X265_DEPTH <= 10
-
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_ssd_16x16_avx2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_ssd_32x32_avx2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = (pixel_sse_t)PFX(pixel_ssd_16x16_avx2);
+ p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_32x32_avx2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_16x32_avx2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = (pixel_sse_t)PFX(pixel_ssd_ss_32x64_avx2);
-#endif
-
p.quant = PFX(quant_avx2);
p.nquant = PFX(nquant_avx2);
p.dequant_normal = PFX(dequant_normal_avx2);
diff -r 04575a459a16 -r 80864f2e4ff7 source/common/x86/ssd-a.asm
--- a/source/common/x86/ssd-a.asm Wed Sep 30 11:22:16 2015 +0530
+++ b/source/common/x86/ssd-a.asm Mon Oct 19 16:01:04 2015 +0530
@@ -107,16 +107,28 @@
dec r4d
jg .loop
%endif
-
-%if BIT_DEPTH == 12 && mmsize == 16
- movu m5, m0
- pxor m6, m6
- punpckldq m0, m6
- punpckhdq m5, m6
- paddq m0, m5
- movhlps m5, m0
- paddq m0, m5
- movq r6, xm0
+%if BIT_DEPTH == 12 && %1 >= 16 && %2 >=16
+%if mmsize == 16
+ movu m5, m0
+ pxor m6, m6
+ punpckldq m0, m6
+ punpckhdq m5, m6
+ paddq m0, m5
+ movhlps m5, m0
+ paddq m0, m5
+ movq r6, xm0
+%elif mmsize == 32
+ movu m1, m0
+ pxor m2, m2
+ punpckldq m0, m2
+ punpckhdq m1, m2
+ paddq m0, m1
+ vextracti128 xm2, m0, 1
+ paddq xm2, xm0
+ movhlps xm1, xm2
+ paddq xm2, xm1
+ movq rax, xm2
+%endif
%else
HADDD m0, m5
movd eax,xm0
More information about the x265-devel mailing list