[x265] [PATCH 229 of 307] [x265-avx512]x86: AVX512 denoise DCT
mythreyi at multicorewareinc.com
mythreyi at multicorewareinc.com
Sat Apr 7 04:33:47 CEST 2018
# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1512036841 -19800
# Thu Nov 30 15:44:01 2017 +0530
# Node ID f86b11b8c629b0e4bf8342d42a0e9c475d7c3a7d
# Parent e77ef4964dd04de6a8b84378f7a46219f34bf1b5
[x265-avx512]x86: AVX512 denoise DCT
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 30 15:44:01 2017 +0530
@@ -2888,6 +2888,7 @@
p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
p.quant = PFX(quant_avx512);
p.nquant = PFX(nquant_avx512);
+ p.denoiseDct = PFX(denoise_dct_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
@@ -5068,6 +5069,7 @@
p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
p.quant = PFX(quant_avx512);
p.nquant = PFX(nquant_avx512);
+ p.denoiseDct = PFX(denoise_dct_avx512);
}
#endif
}
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/dct8.asm Thu Nov 30 15:44:01 2017 +0530
@@ -2357,6 +2357,67 @@
dec r3d
jnz .loop
RET
+%if ARCH_X86_64 == 1
+;-----------------------------------------------------------------------------
+; void denoise_dct(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size)
+;
+; For every coefficient i in [0, size):
+;     sum[i] += |dct[i]|                (16-bit magnitude widened to 32 bits)
+;     dct[i]  = sign(dct[i]) * m        where m = |dct[i]| - offset[i],
+;                                       saturated at 0 (unsigned subtract)
+;
+; In:    r0 = dct, r1 = sum, r2 = offset, r3d = size
+;        (the four arguments in declaration order, loaded by cglobal)
+; Out:   dct[] and sum[] updated in place; no return value
+;
+; size == 16 is handled by a single 256-bit pass (.coeff16). Every other
+; size runs the 512-bit loop size/32 times, 32 coefficients per iteration.
+; NOTE(review): the loop is do-while and the count is size >> 5, so this
+; assumes size is either 16 or a non-zero multiple of 32 -- confirm callers.
+; NOTE(review): pabsw leaves -32768 as 0x8000, which pmovsxwd then widens
+; to a negative dword; confirm coefficients never reach -32768.
+; NOTE(review): pcmpgtw/psignw are applied to 256-bit halves with m16+
+; names, presumably because they have no 512-bit vector-destination form
+; and x86inc maps m16-m31 onto the low physical registers -- confirm
+; against x86inc before renumbering any register here.
+;-----------------------------------------------------------------------------
+INIT_ZMM avx512
+cglobal denoise_dct, 4, 4, 22
+    pxor            m16, m16                ; m16 = 0, reference for the "> 0" compares
+    cmp             r3d, 16                 ; 16 coefficients fit in one ymm of words
+    je              .coeff16
+    shr             r3d, 5                  ; r3d = number of 32-coefficient iterations
+
+.loop:
+    movu            m21, [r0]               ; m21 = 32 signed coefficients
+    pabsw           m17, m21                ; m17 = |coeff|
+
+    ; sum[0..15] += |coeff[0..15]|
+    movu            m2, [r1]
+    pmovsxwd        m4, ym17
+    paddd           m2, m4
+    movu            [r1], m2
+    vextracti64x4   ym4, m17, 1             ; upper 16 magnitudes
+
+    ; sum[16..31] += |coeff[16..31]|
+    movu            m2, [r1 + mmsize]
+    pmovsxwd        m3, ym4
+    paddd           m2, m3
+    movu            [r1 + mmsize], m2
+
+    movu            m3, [r2]                ; m3 = 32 offsets
+    psubusw         m17, m3                 ; m17 = max(|coeff| - offset, 0), unsigned
+
+    ; build the (signed) "> 0" mask one 256-bit half at a time
+    vextracti64x4   ym20, m17, 1
+    pcmpgtw         ym18, ym17, ym16
+    pcmpgtw         ym19, ym20, ym16
+    vinserti64x4    m18, m18, ym19, 1
+
+    pand            m17, m18                ; keep only lanes that compared > 0
+
+    ; re-apply the original signs, again per 256-bit half; the upper half
+    ; is extracted first because it is re-inserted after the ymm psignw
+    vextracti64x4   ym19, m17, 1
+    vextracti64x4   ym20, m21, 1
+    psignw          ym17, ym21
+    psignw          ym19, ym20
+    vinserti64x4    m17, m17, ym19, 1
+
+    movu            [r0], m17               ; store denoised coefficients
+    add             r0, mmsize              ; 32 x int16
+    add             r1, mmsize * 2          ; 32 x uint32
+    add             r2, mmsize              ; 32 x uint16
+    dec             r3d
+    jnz             .loop
+    RET
+
+.coeff16:
+    ; single pass over exactly 16 coefficients using 256-bit word vectors
+    movu            ym19, [r0]              ; ym19 = 16 signed coefficients
+    pabsw           ym17, ym19              ; ym17 = |coeff|
+    movu            m2, [r1]                ; 16 x uint32 running sums
+    pmovsxwd        m18, ym17
+    paddd           m2, m18
+    movu            [r1], m2
+    movu            ym3, [r2]               ; 16 offsets
+    psubusw         ym17, ym3               ; max(|coeff| - offset, 0), unsigned
+    pcmpgtw         ym18, ym17, ym16        ; mask of lanes that are > 0 (signed)
+    pand            ym17, ym18
+    psignw          ym17, ym19              ; restore the original signs
+    movu            [r0], ym17
+    RET
+%endif ; ARCH_X86_64 == 1
%if ARCH_X86_64 == 1
%macro DCT8_PASS_1 4
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/dct8.h Thu Nov 30 15:44:01 2017 +0530
@@ -42,7 +42,7 @@
void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
-
+void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
More information about the x265-devel
mailing list