[x265] [PATCH] asm: dst4 sse2 8bpp and 10bpp
dtyx265 at gmail.com
Wed Jun 10 16:55:20 CEST 2015
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1433948100 25200
# Node ID c9debeec039e01c501884ab10dc9e32f55092b73
# Parent 6245476add8f0562e3ccb657f572ff94fe96adf0
asm: dst4 sse2 8bpp and 10bpp
This replaces the C code.
64-bit
dst4x4 1.43x 1575.01 2249.96
32-bit
dst4x4 2.10x 1452.65 3052.47
10bpp
dst4x4 1.40x 1567.49 2192.50
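
For reference, below is a minimal C sketch of the 4x4 forward DST that this SSE2 code replaces. It assumes the standard HEVC 4x4 DST-VII coefficient matrix and uses the same shift/rounding constants as the asm (first pass: DST_SHIFT of 1 for 8bpp or 3 for 10bpp with pd_1/pd_4 rounding; second pass: shift 8 with pd_128 rounding). The names dst4_pass and dst4_sketch are illustrative, not the exact x265 C primitive:

#include <stdint.h>
#include <string.h>

/* Standard HEVC 4x4 DST-VII matrix (rows are basis vectors). */
static const int16_t t4dst[4][4] =
{
    { 29,  55,  74,  84 },
    { 74,  74,   0, -74 },
    { 84, -29, -74,  55 },
    { 55, -84,  74, -29 },
};

/* One pass: transform each row of 'in' and store the result transposed,
 * so two passes give T * S * T'.  rnd = 1 << (shift - 1), which matches
 * the pd_1 / pd_4 / pd_128 constants used in the asm below. */
static void dst4_pass(const int16_t* in, int16_t* out, int shift)
{
    const int rnd = 1 << (shift - 1);

    for (int i = 0; i < 4; i++)          /* input row        */
        for (int j = 0; j < 4; j++)      /* DST basis vector */
        {
            int sum = 0;
            for (int k = 0; k < 4; k++)
                sum += t4dst[j][k] * in[i * 4 + k];
            out[j * 4 + i] = (int16_t)((sum + rnd) >> shift);
        }
}

/* dstShift is 1 for 8bpp and 3 for 10bpp, as in the DST_SHIFT define below. */
static void dst4_sketch(const int16_t* src, int16_t* dst, intptr_t srcStride, int dstShift)
{
    int16_t block[4 * 4], tmp[4 * 4];

    for (int i = 0; i < 4; i++)
        memcpy(&block[i * 4], &src[i * srcStride], 4 * sizeof(int16_t));

    dst4_pass(block, tmp, dstShift);     /* first pass, pd_1 / pd_4 rounding */
    dst4_pass(tmp, dst, 8);              /* second pass, pd_128 rounding     */
}
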
diff -r 6245476add8f -r c9debeec039e source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Jun 10 07:55:00 2015 -0700
@@ -930,6 +930,7 @@
p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
p.idst4x4 = x265_idst4_sse2;
+ p.dst4x4 = x265_dst4_sse2;
LUMA_VSS_FILTERS(sse2);
@@ -2049,6 +2050,7 @@
p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
#endif
p.idst4x4 = x265_idst4_sse2;
+ p.dst4x4 = x265_dst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/dct8.asm Wed Jun 10 07:55:00 2015 -0700
@@ -582,6 +582,146 @@
;------------------------------------------------------
;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
+INIT_XMM sse2
+%if ARCH_X86_64
+cglobal dst4, 3, 4, 8+4
+ %define coef0 m8
+ %define coef1 m9
+ %define coef2 m10
+ %define coef3 m11
+%else ; ARCH_X86_64 = 0
+cglobal dst4, 3, 4, 8
+ %define coef0 [r3 + 0 * 16]
+ %define coef1 [r3 + 1 * 16]
+ %define coef2 [r3 + 2 * 16]
+ %define coef3 [r3 + 3 * 16]
+%endif ; ARCH_X86_64
+
+%if BIT_DEPTH == 8
+ %define DST_SHIFT 1
+ mova m5, [pd_1]
+%elif BIT_DEPTH == 10
+ %define DST_SHIFT 3
+ mova m5, [pd_4]
+%endif
+ add r2d, r2d
+ lea r3, [tab_dst4]
+%if ARCH_X86_64
+ mova coef0, [r3 + 0 * 16]
+ mova coef1, [r3 + 1 * 16]
+ mova coef2, [r3 + 2 * 16]
+ mova coef3, [r3 + 3 * 16]
+%endif
+ movh m0, [r0 + 0 * r2] ; load
+ movhps m0, [r0 + 1 * r2]
+ lea r0, [r0 + 2 * r2]
+ movh m1, [r0]
+ movhps m1, [r0 + r2]
+ pmaddwd m2, m0, coef0 ; DST1
+ pmaddwd m3, m1, coef0
+ pshufd m6, m2, q2301
+ pshufd m7, m3, q2301
+ paddd m2, m6
+ paddd m3, m7
+ pshufd m2, m2, q3120
+ pshufd m3, m3, q3120
+ punpcklqdq m2, m3
+ paddd m2, m5
+ psrad m2, DST_SHIFT
+ pmaddwd m3, m0, coef1
+ pmaddwd m4, m1, coef1
+ pshufd m6, m4, q2301
+ pshufd m7, m3, q2301
+ paddd m4, m6
+ paddd m3, m7
+ pshufd m4, m4, q3120
+ pshufd m3, m3, q3120
+ punpcklqdq m3, m4
+ paddd m3, m5
+ psrad m3, DST_SHIFT
+ packssdw m2, m3 ; m2 = T70
+ pmaddwd m3, m0, coef2
+ pmaddwd m4, m1, coef2
+ pshufd m6, m4, q2301
+ pshufd m7, m3, q2301
+ paddd m4, m6
+ paddd m3, m7
+ pshufd m4, m4, q3120
+ pshufd m3, m3, q3120
+ punpcklqdq m3, m4
+ paddd m3, m5
+ psrad m3, DST_SHIFT
+ pmaddwd m0, coef3
+ pmaddwd m1, coef3
+ pshufd m6, m0, q2301
+ pshufd m7, m1, q2301
+ paddd m0, m6
+ paddd m1, m7
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ punpcklqdq m0, m1
+ paddd m0, m5
+ psrad m0, DST_SHIFT
+ packssdw m3, m0 ; m3 = T71
+ mova m5, [pd_128]
+
+ pmaddwd m0, m2, coef0 ; DST2
+ pmaddwd m1, m3, coef0
+ pshufd m6, m0, q2301
+ pshufd m7, m1, q2301
+ paddd m0, m6
+ paddd m1, m7
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ punpcklqdq m0, m1
+ paddd m0, m5
+ psrad m0, 8
+
+ pmaddwd m4, m2, coef1
+ pmaddwd m1, m3, coef1
+ pshufd m6, m4, q2301
+ pshufd m7, m1, q2301
+ paddd m4, m6
+ paddd m1, m7
+ pshufd m4, m4, q3120
+ pshufd m1, m1, q3120
+ punpcklqdq m4, m1
+ paddd m4, m5
+ psrad m4, 8
+ packssdw m0, m4
+ movu [r1 + 0 * 16], m0
+
+ pmaddwd m0, m2, coef2
+ pmaddwd m1, m3, coef2
+ pshufd m6, m0, q2301
+ pshufd m7, m1, q2301
+ paddd m0, m6
+ paddd m1, m7
+ pshufd m0, m0, q3120
+ pshufd m1, m1, q3120
+ punpcklqdq m0, m1
+ paddd m0, m5
+ psrad m0, 8
+
+ pmaddwd m2, coef3
+ pmaddwd m3, coef3
+ pshufd m6, m2, q2301
+ pshufd m7, m3, q2301
+ paddd m2, m6
+ paddd m3, m7
+ pshufd m2, m2, q3120
+ pshufd m3, m3, q3120
+ punpcklqdq m2, m3
+ paddd m2, m5
+ psrad m2, 8
+ packssdw m0, m2
+ movu [r1 + 1 * 16], m0
+
+ RET
+
+;------------------------------------------------------
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
+;------------------------------------------------------
INIT_XMM ssse3
%if ARCH_X86_64
cglobal dst4, 3, 4, 8+2
diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Wed Jun 10 11:54:27 2015 +0530
+++ b/source/common/x86/dct8.h Wed Jun 10 07:55:00 2015 -0700
@@ -25,6 +25,7 @@
#define X265_DCT8_H
void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);