[x265] [PATCH] denoise_dct asm code: SSE version

praveen at multicorewareinc.com praveen at multicorewareinc.com
Thu Sep 18 11:41:52 CEST 2014


# HG changeset patch
# User Praveen Tiwari
# Date 1411033286 -19800
# Node ID c4b689f6050231e99b9663b7504cd7fff90bdafb
# Parent  54ad38a84a6900a7c674e6d1738fd31271129139
denoise_dct asm code: SSE version

diff -r 54ad38a84a69 -r c4b689f60502 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Sep 17 16:52:15 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp	Thu Sep 18 15:11:26 2014 +0530
@@ -1689,6 +1689,7 @@
 
         p.dct[DCT_8x8] = x265_dct8_sse4;
         p.copy_shr = x265_copy_shr_sse4;
+        p.denoiseDct = x265_denoise_dct_sse4;
     }
     if (cpuMask & X265_CPU_AVX)
     {
diff -r 54ad38a84a69 -r c4b689f60502 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Sep 17 16:52:15 2014 +0530
+++ b/source/common/x86/dct8.asm	Thu Sep 18 15:11:26 2014 +0530
@@ -4,6 +4,7 @@
 ;* Authors: Nabajit Deka <nabajit at multicorewareinc.com>
 ;*          Min Chen <chenm003 at 163.com> <min.chen at multicorewareinc.com>
 ;*          Li Cao <li at multicorewareinc.com>
+;*          Praveen Kumar Tiwari <Praveen at multicorewareinc.com>
 ;*
 ;* This program is free software; you can redistribute it and/or modify
 ;* it under the terms of the GNU General Public License as published by
@@ -1054,6 +1055,31 @@
     RET
 
 
+;-----------------------------------------------------------------------------
+; void denoise_dct(int32_t *dct, uint32_t *sum, uint16_t *offset, int size)
+;-----------------------------------------------------------------------------
+INIT_XMM sse4                  ; 128-bit XMM build; dispatched as x265_denoise_dct_sse4
+cglobal denoise_dct, 4, 4, 6   ; 4 args (r0=dct, r1=sum, r2=offset, r3=size), 4 GPRs, 6 XMM regs (m0-m5)
+    pxor     m5,  m5           ; m5 = 0, reference value for the "> 0" test below
+    shr      r3d, 2            ; r3d = size / 4: each iteration consumes 4 coefficients
+.loop:
+    mova     m0, [r0]          ; m0 = 4 x int32 coefficients (aligned load)
+    pabsd    m1, m0            ; m1 = |dct|
+    mova     m2, [r1]          ; m2 = 4 x uint32 running sums (aligned load)
+    paddd    m2, m1            ; sum += |dct|
+    mova     [r1], m2          ; store the updated sums
+    pmovzxwd m3, [r2]          ; m3 = 4 x uint16 offsets, zero-extended to 32 bits
+    psubd    m1, m3            ; m1 = |dct| - offset
+    pcmpgtd  m4, m1, m5        ; m4 = all-ones in lanes where (|dct| - offset) > 0
+    pand     m1, m4            ; force lanes with a non-positive difference to 0
+    psignd   m1, m0            ; give each lane the sign of its original coefficient
+    mova     [r0], m1          ; write the denoised coefficients back in place
+    add      r0, 16            ; dct    += 4 elements (4 bytes each)
+    add      r1, 16            ; sum    += 4 elements (4 bytes each)
+    add      r2, 8             ; offset += 4 elements (2 bytes each)
+    dec      r3d               ; one fewer group of 4 remaining
+    jnz .loop                  ; NOTE(review): assumes size is a nonzero multiple of 4 - confirm with callers
+    RET
 
 INIT_YMM avx2
 cglobal denoise_dct, 4,4,4
diff -r 54ad38a84a69 -r c4b689f60502 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Sep 17 16:52:15 2014 +0530
+++ b/source/common/x86/dct8.h	Thu Sep 18 15:11:26 2014 +0530
@@ -33,6 +33,7 @@
 void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 void x265_dct32_avx2(int16_t *src, int32_t *dst, intptr_t stride);
 
+void x265_denoise_dct_sse4(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
 void x265_denoise_dct_avx2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
 
 #endif // ifndef X265_DCT8_H


More information about the x265-devel mailing list