[x265] [PATCH] asm: dst4 sse2 8bpp and 10bpp

dtyx265 at gmail.com dtyx265 at gmail.com
Wed Jun 10 16:55:20 CEST 2015


# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1433948100 25200
# Node ID c9debeec039e01c501884ab10dc9e32f55092b73
# Parent  6245476add8f0562e3ccb657f572ff94fe96adf0
asm: dst4 sse2 8bpp and 10bpp

This replaces the C code with SSE2 assembly.

64-bit

dst4x4		1.43x 	 1575.01  	 2249.96

32-bit

dst4x4		2.10x 	 1452.65  	 3052.47

10bpp

dst4x4		1.40x 	 1567.49  	 2192.50

diff -r 6245476add8f -r c9debeec039e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp	Wed Jun 10 07:55:00 2015 -0700
@@ -930,6 +930,7 @@
         p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
 
         p.idst4x4 = x265_idst4_sse2;
+        p.dst4x4 = x265_dst4_sse2;
 
         LUMA_VSS_FILTERS(sse2);
 
@@ -2049,6 +2050,7 @@
         p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
 #endif
         p.idst4x4 = x265_idst4_sse2;
+        p.dst4x4 = x265_dst4_sse2;
 
         p.planecopy_sp = x265_downShift_16_sse2;
         ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/dct8.asm	Wed Jun 10 07:55:00 2015 -0700
@@ -582,6 +582,146 @@
 ;------------------------------------------------------
 ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
 ;------------------------------------------------------
+INIT_XMM sse2
+%if ARCH_X86_64
+cglobal dst4, 3, 4, 8+4                      ; 3 args (r0=src, r1=dst, r2=srcStride), 4 GPRs, 8 work + 4 coefficient XMM regs
+  %define       coef0   m8
+  %define       coef1   m9
+  %define       coef2   m10
+  %define       coef3   m11
+%else ; ARCH_X86_64 = 0
+cglobal dst4, 3, 4, 8                        ; only 8 XMM regs declared here; coef0-3 are memory operands addressed through r3
+  %define       coef0   [r3 + 0 * 16]
+  %define       coef1   [r3 + 1 * 16]
+  %define       coef2   [r3 + 2 * 16]
+  %define       coef3   [r3 + 3 * 16]
+%endif ; ARCH_X86_64
+
+%if BIT_DEPTH == 8
+  %define       DST_SHIFT 1
+  mova          m5, [pd_1]                   ; m5 = first-pass rounding term (pd_1 defined elsewhere; presumably dwords of 1 << (DST_SHIFT - 1))
+%elif BIT_DEPTH == 10
+  %define       DST_SHIFT 3
+  mova          m5, [pd_4]                   ; m5 = first-pass rounding term (pd_4 defined elsewhere; presumably dwords of 1 << (DST_SHIFT - 1))
+%endif
+    add         r2d, r2d                     ; double srcStride (presumably int16_t elements -> bytes)
+    lea         r3, [tab_dst4]               ; r3 = base of the coefficient table (defined outside this block)
+%if ARCH_X86_64
+    mova        coef0, [r3 + 0 * 16]         ; 64-bit only: cache the four 16-byte coefficient rows in m8-m11
+    mova        coef1, [r3 + 1 * 16]
+    mova        coef2, [r3 + 2 * 16]
+    mova        coef3, [r3 + 3 * 16]
+%endif
+    movh        m0, [r0 + 0 * r2]            ; load: source row 0 -> low half of m0
+    movhps      m0, [r0 + 1 * r2]            ; source row 1 -> high half of m0
+    lea         r0, [r0 + 2 * r2]            ; advance src by two rows
+    movh        m1, [r0]                     ; source row 2 -> low half of m1
+    movhps      m1, [r0 + r2]                ; source row 3 -> high half of m1
+    pmaddwd     m2, m0, coef0                ; DST1 (first pass): multiply by coef0, adjacent products summed into dwords
+    pmaddwd     m3, m1, coef0
+    pshufd      m6, m2, q2301                ; swap neighbouring dwords so the add below completes each row's sum
+    pshufd      m7, m3, q2301
+    paddd       m2, m6                       ; each dword pair now holds one row's full sum (duplicated)
+    paddd       m3, m7
+    pshufd      m2, m2, q3120                ; gather one copy of each row's sum into the low qword
+    pshufd      m3, m3, q3120
+    punpcklqdq  m2, m3                       ; m2 = coef0 sums for rows 0..3
+    paddd       m2, m5                       ; add rounding term
+    psrad       m2, DST_SHIFT                ; arithmetic shift right by the first-pass shift
+    pmaddwd     m3, m0, coef1                ; same reduction with coef1 (rows 0,1 in m3, rows 2,3 in m4)
+    pmaddwd     m4, m1, coef1
+    pshufd      m6, m4, q2301
+    pshufd      m7, m3, q2301
+    paddd       m4, m6
+    paddd       m3, m7
+    pshufd      m4, m4, q3120
+    pshufd      m3, m3, q3120
+    punpcklqdq  m3, m4                       ; m3 = coef1 sums for rows 0..3
+    paddd       m3, m5
+    psrad       m3, DST_SHIFT
+    packssdw    m2, m3                       ; m2 = T70: coef0 results (low 4 words) and coef1 results (high 4 words), saturated to int16
+    pmaddwd     m3, m0, coef2                ; same reduction with coef2
+    pmaddwd     m4, m1, coef2
+    pshufd      m6, m4, q2301
+    pshufd      m7, m3, q2301
+    paddd       m4, m6
+    paddd       m3, m7
+    pshufd      m4, m4, q3120
+    pshufd      m3, m3, q3120
+    punpcklqdq  m3, m4                       ; m3 = coef2 sums for rows 0..3
+    paddd       m3, m5
+    psrad       m3, DST_SHIFT
+    pmaddwd     m0, coef3                    ; same reduction with coef3; source rows in m0/m1 are overwritten (last use)
+    pmaddwd     m1, coef3
+    pshufd      m6, m0, q2301
+    pshufd      m7, m1, q2301
+    paddd       m0, m6
+    paddd       m1, m7
+    pshufd      m0, m0, q3120
+    pshufd      m1, m1, q3120
+    punpcklqdq  m0, m1                       ; m0 = coef3 sums for rows 0..3
+    paddd       m0, m5
+    psrad       m0, DST_SHIFT
+    packssdw    m3, m0                       ; m3 = T71: coef2 results (low 4 words) and coef3 results (high 4 words), saturated to int16
+    mova        m5, [pd_128]                 ; m5 = second-pass rounding term (pd_128 defined elsewhere; presumably dwords of 1 << 7 for the shift by 8)
+
+    pmaddwd     m0, m2, coef0                ; DST2 (second pass): same reduction applied to the first-pass results in m2/m3
+    pmaddwd     m1, m3, coef0
+    pshufd      m6, m0, q2301
+    pshufd      m7, m1, q2301
+    paddd       m0, m6
+    paddd       m1, m7
+    pshufd      m0, m0, q3120
+    pshufd      m1, m1, q3120
+    punpcklqdq  m0, m1                       ; m0 = four coef0 sums
+    paddd       m0, m5
+    psrad       m0, 8                        ; second-pass shift is 8 for both bit depths handled here
+
+    pmaddwd     m4, m2, coef1
+    pmaddwd     m1, m3, coef1
+    pshufd      m6, m4, q2301
+    pshufd      m7, m1, q2301
+    paddd       m4, m6
+    paddd       m1, m7
+    pshufd      m4, m4, q3120
+    pshufd      m1, m1, q3120
+    punpcklqdq  m4, m1                       ; m4 = four coef1 sums
+    paddd       m4, m5
+    psrad       m4, 8
+    packssdw    m0, m4                       ; saturate to int16: coef0 results then coef1 results
+    movu        [r1 + 0 * 16], m0            ; store first 16 bytes of dst (unaligned store)
+
+    pmaddwd     m0, m2, coef2
+    pmaddwd     m1, m3, coef2
+    pshufd      m6, m0, q2301
+    pshufd      m7, m1, q2301
+    paddd       m0, m6
+    paddd       m1, m7
+    pshufd      m0, m0, q3120
+    pshufd      m1, m1, q3120
+    punpcklqdq  m0, m1                       ; m0 = four coef2 sums
+    paddd       m0, m5
+    psrad       m0, 8
+
+    pmaddwd     m2, coef3                    ; m2/m3 are overwritten here (last use of the first-pass results)
+    pmaddwd     m3, coef3
+    pshufd      m6, m2, q2301
+    pshufd      m7, m3, q2301
+    paddd       m2, m6
+    paddd       m3, m7
+    pshufd      m2, m2, q3120
+    pshufd      m3, m3, q3120
+    punpcklqdq  m2, m3                       ; m2 = four coef3 sums
+    paddd       m2, m5
+    psrad       m2, 8
+    packssdw    m0, m2                       ; saturate to int16: coef2 results then coef3 results
+    movu        [r1 + 1 * 16], m0            ; store second 16 bytes of dst (unaligned store)
+
+    RET
+
+;------------------------------------------------------
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
+;------------------------------------------------------
 INIT_XMM ssse3
 %if ARCH_X86_64
 cglobal dst4, 3, 4, 8+2
diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.h
--- a/source/common/x86/dct8.h	Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/dct8.h	Wed Jun 10 07:55:00 2015 -0700
@@ -25,6 +25,7 @@
 #define X265_DCT8_H
 void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
 void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);


More information about the x265-devel mailing list