[x265] [PATCH] denoiseDct: SSE version of asm code

praveen at multicorewareinc.com praveen at multicorewareinc.com
Wed Sep 17 13:33:16 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1410953432 -19800
# Node ID e919c3dde6bd9a3b74177e48a14e8b151556caee
# Parent  de0b737ed7165b4739128ee430f259ea0f8a5e81
denoiseDct: SSE version of asm code

diff -r de0b737ed716 -r e919c3dde6bd source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 17 16:52:15 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Sep 17 17:00:32 2014 +0530
@@ -1689,6 +1689,7 @@
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
         p.copy_shr = x265_copy_shr_sse4;
+        p.denoiseDct = x265_denoise_dct_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r de0b737ed716 -r e919c3dde6bd source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Sep 17 16:52:15 2014 +0530
+++ b/source/common/x86/dct8.asm	Wed Sep 17 17:00:32 2014 +0530
@@ -1054,6 +1054,32 @@
     RET
 
 
+;-----------------------------------------------------------------------------
+; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
+; per coeff: sum[i] += |dct[i]|; dct[i] = sign(dct[i]) * max(|dct[i]| - offset[i], 0)
+; 4 coeffs per pass, body runs at least once; assumes size is a positive multiple of 4 (TODO confirm)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4
+cglobal denoise_dct, 4, 4, 5
+    pxor     m4, m4                 ; m4 = 0, lower bound for the clamp below
+    shr      r3d, 2                 ; r3d = size / 4 = number of 4-coefficient groups
+.loop:
+    mova     m0, [r0]               ; m0 = 4 signed coefficients (kept for their signs)
+    pabsd    m1, m0                 ; m1 = |coef|
+    mova     m2, [r1]
+    paddd    m2, m1                 ; sum[i] += |coef|
+    mova     [r1], m2
+    pmovzxwd m3, [r2]               ; load 4 uint16 offsets, zero-extended to dwords
+    psubd    m1, m3                 ; m1 = |coef| - offset
+    pmaxsd   m1, m4                 ; negative differences become zero
+    psignd   m1, m0                 ; re-apply the original coefficient sign
+    mova     [r0], m1
+    add      r0, 16                 ; 4 x int32
+    add      r1, 16                 ; 4 x uint32
+    add      r2, 8                  ; 4 x uint16
+    dec      r3d
+    jg .loop
+    RET
 
 INIT_YMM avx2
 cglobal denoise_dct, 4,4,4
diff -r de0b737ed716 -r e919c3dde6bd source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Sep 17 16:52:15 2014 +0530
+++ b/source/common/x86/dct8.h	Wed Sep 17 17:00:32 2014 +0530
@@ -33,6 +33,7 @@
 void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 
+void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
 void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
 
 #endif // ifndef X265_DCT8_H


More information about the x265-devel mailing list