[x265] [PATCH 229 of 307] [x265-avx512]x86: AVX512 denoise DCT

mythreyi at multicorewareinc.com mythreyi at multicorewareinc.com
Sat Apr 7 04:33:47 CEST 2018


# HG changeset patch
# User Gopi Satykrishna Akisetty <gopi.satykrishna at multicorewareinc.com>
# Date 1512036841 -19800
#      Thu Nov 30 15:44:01 2017 +0530
# Node ID f86b11b8c629b0e4bf8342d42a0e9c475d7c3a7d
# Parent  e77ef4964dd04de6a8b84378f7a46219f34bf1b5
[x265-avx512]x86: AVX512 denoise DCT

diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Nov 30 15:44:01 2017 +0530
@@ -2888,6 +2888,7 @@
         p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
         p.quant = PFX(quant_avx512);
         p.nquant = PFX(nquant_avx512);
+        p.denoiseDct = PFX(denoise_dct_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].filter_hps = PFX(interp_4tap_horiz_ps_32x32_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].filter_hps = PFX(interp_4tap_horiz_ps_32x16_avx512);
         p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].filter_hps = PFX(interp_4tap_horiz_ps_32x24_avx512);
@@ -5068,6 +5069,7 @@
         p.cu[BLOCK_32x32].idct = PFX(idct32_avx512);
         p.quant = PFX(quant_avx512);
         p.nquant = PFX(nquant_avx512);
+        p.denoiseDct = PFX(denoise_dct_avx512);
     }
 #endif
 }
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/dct8.asm	Thu Nov 30 15:44:01 2017 +0530
@@ -2357,6 +2357,67 @@
     dec      r3d
     jnz .loop
     RET
+%if ARCH_X86_64 == 1                    ; 22 vector registers are requested below
+INIT_ZMM avx512
+cglobal denoise_dct, 4, 4, 22           ; void denoise_dct(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size): r0 = dct, r1 = sum, r2 = offset, r3d = size
+    pxor     m16,  m16                  ; m16 = 0, right-hand side of the "> 0" compares
+    cmp      r3d,   16                  ; non-destructive test: size == 16 takes the single-pass path
+    je       .coeff16
+    ; each .loop pass consumes 32 coefficients, so passes = size / 32 (assumes size is a non-zero multiple of 32 here - TODO confirm against callers)
+    shr      r3d,    5
+    jmp      .loop
+
+.coeff16:
+    movu          ym19,  [r0]           ; 16 coefficients (words)
+    pabsw         ym17, ym19            ; |coef|
+    movu            m2, [r1]            ; 16 running sums (dwords)
+    pmovsxwd       m18, ym17            ; widen |coef| from words to dwords
+    paddd           m2,  m18            ; sum[i] += |coef[i]|
+    movu          [r1],   m2
+    movu           ym3, [r2]            ; 16 offsets (words)
+    psubusw       ym17, ym3             ; |coef| - offset, unsigned-saturated at 0
+    pcmpgtw       ym18, ym17, ym16      ; mask of words that are > 0 (signed compare)
+    pand          ym17, ym18            ; keep only the strictly positive results
+    psignw        ym17, ym19            ; give each result the sign of its original coefficient
+    movu          [r0], ym17            ; write the denoised coefficients back in place
+    RET
+
+.loop:
+    movu          m21, [r0]             ; 32 coefficients (words); kept for the final sign restore
+    pabsw         m17, m21              ; |coef|
+    movu           m2, [r1]             ; sums for coefficients 0..15 (dwords)
+    pmovsxwd       m4, ym17             ; widen low 16 |coef| words to dwords
+    paddd          m2,  m4
+    movu         [r1],  m2
+    vextracti64x4 ym4, m17, 1           ; high 16 |coef| words
+
+    movu           m2, [r1 + mmsize]    ; sums for coefficients 16..31 (dwords)
+    pmovsxwd       m3, ym4
+    paddd          m2, m3
+    movu           [r1 + mmsize], m2
+    movu           m3, [r2]             ; 32 offsets (words)
+    psubusw       m17, m3               ; |coef| - offset, unsigned-saturated at 0
+    ; NOTE(review): compare and sign are done per 256-bit half; presumably because pcmpgtw/psignw are only available to this code in ymm form - confirm
+    vextracti64x4 ym20,  m17,    1      ; high half of the thresholded values
+    pcmpgtw       ym18, ym17, ym16      ; low-half mask of words > 0
+    pcmpgtw       ym19, ym20, ym16      ; high-half mask of words > 0
+    vinserti64x4   m18,  m18, ym19, 1   ; recombine into one 512-bit mask
+    ; apply the mask, then restore the original signs half by half
+    pand           m17,  m18
+    vextracti64x4 ym19,  m17, 1         ; high half of the masked values
+    vextracti64x4 ym20,  m21, 1         ; high half of the original coefficients
+    psignw        ym17, ym21            ; low half: apply the original signs
+    psignw        ym19, ym20            ; high half: apply the original signs
+    vinserti64x4   m17,  m17, ym19, 1   ; recombine into one 512-bit result
+
+    movu          [r0],  m17            ; write the denoised coefficients back in place
+    add             r0,  mmsize         ; 32 words consumed from dct
+    add             r1,  mmsize * 2     ; 32 dwords consumed from sum
+    add             r2,  mmsize         ; 32 words consumed from offset
+    dec             r3d
+    jnz             .loop
+    RET
+%endif ; ARCH_X86_64 == 1
 
 %if ARCH_X86_64 == 1
 %macro DCT8_PASS_1 4
diff -r e77ef4964dd0 -r f86b11b8c629 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Thu Nov 30 17:06:16 2017 +0530
+++ b/source/common/x86/dct8.h	Thu Nov 30 15:44:01 2017 +0530
@@ -42,7 +42,7 @@
 void PFX(idst4_avx2)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(denoise_dct_sse4)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
 void PFX(denoise_dct_avx2)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
-
+void PFX(denoise_dct_avx512)(int16_t* dct, uint32_t* sum, const uint16_t* offset, int size);
 void PFX(dct8_avx512)(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void PFX(idct8_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);
 void PFX(idct16_avx512)(const int16_t* src, int16_t* dst, intptr_t dstStride);


More information about the x265-devel mailing list