[x265] [PATCH] denoiseDct asm code: remove faulty code; a new SSE version is needed
praveen at multicorewareinc.com
praveen at multicorewareinc.com
Wed Sep 17 13:32:26 CEST 2014
# HG changeset patch
# User Praveen Tiwari
# Date 1410952504 -19800
# Node ID 530c1824c585870c07ba13623cb92b21637a8514
# Parent a2dcc12bd36f41a99c346870cc4c23c1e313665b
denoiseDct asm code: nit faulty code, need a new SSE version
diff -r a2dcc12bd36f -r 530c1824c585 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Sep 17 16:33:52 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Sep 17 16:45:04 2014 +0530
@@ -1565,7 +1565,6 @@
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
- //p.denoiseDct = x265_denoise_dct_sse2;
p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
p.copy_shl[BLOCK_8x8] = x265_copy_shl_8_sse2;
p.copy_shl[BLOCK_16x16] = x265_copy_shl_16_sse2;
@@ -1605,7 +1604,6 @@
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
p.count_nonzero = x265_count_nonzero_ssse3;
- //p.denoiseDct = x265_denoise_dct_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -1709,7 +1707,6 @@
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_avx;
p.ssim_end_4 = x265_pixel_ssim_end4_avx;
- //p.denoiseDct = x265_denoise_dct_avx;
}
if (cpuMask & X265_CPU_XOP)
{
diff -r a2dcc12bd36f -r 530c1824c585 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Sep 17 16:33:52 2014 +0530
+++ b/source/common/x86/dct8.asm Wed Sep 17 16:45:04 2014 +0530
@@ -1054,102 +1054,6 @@
RET
-; TODO: split into two version after coeff_t changed
-%if 1 ;HIGH_BIT_DEPTH
-;-----------------------------------------------------------------------------
-; void denoise_dct( int32_t *dct, uint32_t *sum, uint32_t *offset, int size )
-;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,6
- pxor m5, m5
- movsxdifnidn r3, r3d
-.loop:
- mova m2, [r0+r3*4-2*mmsize]
- mova m3, [r0+r3*4-1*mmsize]
- ABSD m0, m2
- ABSD m1, m3
- paddd m4, m0, [r1+r3*4-2*mmsize]
- psubd m0, [r2+r3*4-2*mmsize]
- mova [r1+r3*4-2*mmsize], m4
- paddd m4, m1, [r1+r3*4-1*mmsize]
- psubd m1, [r2+r3*4-1*mmsize]
- mova [r1+r3*4-1*mmsize], m4
- pcmpgtd m4, m0, m5
- pand m0, m4
- pcmpgtd m4, m1, m5
- pand m1, m4
- PSIGND m0, m2
- PSIGND m1, m3
- mova [r0+r3*4-2*mmsize], m0
- mova [r0+r3*4-1*mmsize], m1
- sub r3d, mmsize/2
- jg .loop
- RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx
-DENOISE_DCT
-%endif
-INIT_XMM sse2
-DENOISE_DCT
-INIT_XMM ssse3
-DENOISE_DCT
-INIT_XMM avx
-DENOISE_DCT
-INIT_YMM avx2
-DENOISE_DCT
-
-%else ; !HIGH_BIT_DEPTH
-
-;-----------------------------------------------------------------------------
-; void denoise_dct( int16_t *dct, uint32_t *sum, uint16_t *offset, int size )
-;-----------------------------------------------------------------------------
-%macro DENOISE_DCT 0
-cglobal denoise_dct, 4,4,7
- pxor m6, m6
- movsxdifnidn r3, r3d
-.loop:
- mova m2, [r0+r3*2-2*mmsize]
- mova m3, [r0+r3*2-1*mmsize]
- ABSW m0, m2, sign
- ABSW m1, m3, sign
- psubusw m4, m0, [r2+r3*2-2*mmsize]
- psubusw m5, m1, [r2+r3*2-1*mmsize]
- PSIGNW m4, m2
- PSIGNW m5, m3
- mova [r0+r3*2-2*mmsize], m4
- mova [r0+r3*2-1*mmsize], m5
- punpcklwd m2, m0, m6
- punpcklwd m3, m1, m6
- punpckhwd m0, m6
- punpckhwd m1, m6
- paddd m2, [r1+r3*4-4*mmsize]
- paddd m0, [r1+r3*4-3*mmsize]
- paddd m3, [r1+r3*4-2*mmsize]
- paddd m1, [r1+r3*4-1*mmsize]
- mova [r1+r3*4-4*mmsize], m2
- mova [r1+r3*4-3*mmsize], m0
- mova [r1+r3*4-2*mmsize], m3
- mova [r1+r3*4-1*mmsize], m1
- sub r3, mmsize
- jg .loop
-%if (mmsize == 8)
- EMMS
-%endif
- RET
-%endmacro
-
-%if ARCH_X86_64 == 0
-INIT_MMX mmx
-DENOISE_DCT
-%endif
-INIT_XMM sse2
-DENOISE_DCT
-INIT_XMM ssse3
-DENOISE_DCT
-INIT_XMM avx
-DENOISE_DCT
INIT_YMM avx2
cglobal denoise_dct, 4,4,4
@@ -1172,7 +1076,6 @@
jg .loop
RET
-%endif ; !HIGH_BIT_DEPTH
%macro DCT16_PASS_1_E 2
vpbroadcastq m7, [r7 + %1]
More information about the x265-devel mailing list