[x265] [PATCH] asm: dst4 sse2 8bpp and 10bpp
chen
chenm003 at 163.com
Wed Jun 10 17:22:54 CEST 2015
right
At 2015-06-10 22:55:20,dtyx265 at gmail.com wrote:
># HG changeset patch
># User David T Yuen <dtyx265 at gmail.com>
># Date 1433948100 25200
># Node ID c9debeec039e01c501884ab10dc9e32f55092b73
># Parent 6245476add8f0562e3ccb657f572ff94fe96adf0
>asm: dst4 sse2 8bpp and 10bpp
>
>This replaces C code.
>
>64-bit
>
>dst4x4 1.43x 1575.01 2249.96
>
>32-bit
>
>dst4x4 2.10x 1452.65 3052.47
>
>10bpp
>
>dst4x4 1.40x 1567.49 2192.50
>
>diff -r 6245476add8f -r c9debeec039e source/common/x86/asm-primitives.cpp
>--- a/source/common/x86/asm-primitives.cpp Wed Jun 10 11:54:27 2015 +0530
>+++ b/source/common/x86/asm-primitives.cpp Wed Jun 10 07:55:00 2015 -0700
>@@ -930,6 +930,7 @@
> p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
>
> p.idst4x4 = x265_idst4_sse2;
>+ p.dst4x4 = x265_dst4_sse2;
>
> LUMA_VSS_FILTERS(sse2);
>
>@@ -2049,6 +2050,7 @@
> p.cu[BLOCK_8x8].idct = x265_idct8_sse2;
> #endif
> p.idst4x4 = x265_idst4_sse2;
>+ p.dst4x4 = x265_dst4_sse2;
>
> p.planecopy_sp = x265_downShift_16_sse2;
> ALL_CHROMA_420_PU(p2s, filterPixelToShort, sse2);
>diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.asm
>--- a/source/common/x86/dct8.asm Wed Jun 10 11:54:27 2015 +0530
>+++ b/source/common/x86/dct8.asm Wed Jun 10 07:55:00 2015 -0700
>@@ -582,6 +582,146 @@
> ;------------------------------------------------------
> ;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
> ;------------------------------------------------------
>+INIT_XMM sse2
>+%if ARCH_X86_64
>+cglobal dst4, 3, 4, 8+4
>+ %define coef0 m8
>+ %define coef1 m9
>+ %define coef2 m10
>+ %define coef3 m11
>+%else ; ARCH_X86_64 = 0
>+cglobal dst4, 3, 4, 8
>+ %define coef0 [r3 + 0 * 16]
>+ %define coef1 [r3 + 1 * 16]
>+ %define coef2 [r3 + 2 * 16]
>+ %define coef3 [r3 + 3 * 16]
>+%endif ; ARCH_X86_64
>+
>+%if BIT_DEPTH == 8
>+ %define DST_SHIFT 1
>+ mova m5, [pd_1]
>+%elif BIT_DEPTH == 10
>+ %define DST_SHIFT 3
>+ mova m5, [pd_4]
>+%endif
>+ add r2d, r2d
>+ lea r3, [tab_dst4]
>+%if ARCH_X86_64
>+ mova coef0, [r3 + 0 * 16]
>+ mova coef1, [r3 + 1 * 16]
>+ mova coef2, [r3 + 2 * 16]
>+ mova coef3, [r3 + 3 * 16]
>+%endif
>+ movh m0, [r0 + 0 * r2] ; load
>+ movhps m0, [r0 + 1 * r2]
>+ lea r0, [r0 + 2 * r2]
>+ movh m1, [r0]
>+ movhps m1, [r0 + r2]
>+ pmaddwd m2, m0, coef0 ; DST1
>+ pmaddwd m3, m1, coef0
>+ pshufd m6, m2, q2301
>+ pshufd m7, m3, q2301
>+ paddd m2, m6
>+ paddd m3, m7
>+ pshufd m2, m2, q3120
>+ pshufd m3, m3, q3120
>+ punpcklqdq m2, m3
>+ paddd m2, m5
>+ psrad m2, DST_SHIFT
>+ pmaddwd m3, m0, coef1
>+ pmaddwd m4, m1, coef1
>+ pshufd m6, m4, q2301
>+ pshufd m7, m3, q2301
>+ paddd m4, m6
>+ paddd m3, m7
>+ pshufd m4, m4, q3120
>+ pshufd m3, m3, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m5
>+ psrad m3, DST_SHIFT
>+ packssdw m2, m3 ; m2 = T70
>+ pmaddwd m3, m0, coef2
>+ pmaddwd m4, m1, coef2
>+ pshufd m6, m4, q2301
>+ pshufd m7, m3, q2301
>+ paddd m4, m6
>+ paddd m3, m7
>+ pshufd m4, m4, q3120
>+ pshufd m3, m3, q3120
>+ punpcklqdq m3, m4
>+ paddd m3, m5
>+ psrad m3, DST_SHIFT
>+ pmaddwd m0, coef3
>+ pmaddwd m1, coef3
>+ pshufd m6, m0, q2301
>+ pshufd m7, m1, q2301
>+ paddd m0, m6
>+ paddd m1, m7
>+ pshufd m0, m0, q3120
>+ pshufd m1, m1, q3120
>+ punpcklqdq m0, m1
>+ paddd m0, m5
>+ psrad m0, DST_SHIFT
>+ packssdw m3, m0 ; m3 = T71
>+ mova m5, [pd_128]
>+
>+ pmaddwd m0, m2, coef0 ; DST2
>+ pmaddwd m1, m3, coef0
>+ pshufd m6, m0, q2301
>+ pshufd m7, m1, q2301
>+ paddd m0, m6
>+ paddd m1, m7
>+ pshufd m0, m0, q3120
>+ pshufd m1, m1, q3120
>+ punpcklqdq m0, m1
>+ paddd m0, m5
>+ psrad m0, 8
>+
>+ pmaddwd m4, m2, coef1
>+ pmaddwd m1, m3, coef1
>+ pshufd m6, m4, q2301
>+ pshufd m7, m1, q2301
>+ paddd m4, m6
>+ paddd m1, m7
>+ pshufd m4, m4, q3120
>+ pshufd m1, m1, q3120
>+ punpcklqdq m4, m1
>+ paddd m4, m5
>+ psrad m4, 8
>+ packssdw m0, m4
>+ movu [r1 + 0 * 16], m0
>+
>+ pmaddwd m0, m2, coef2
>+ pmaddwd m1, m3, coef2
>+ pshufd m6, m0, q2301
>+ pshufd m7, m1, q2301
>+ paddd m0, m6
>+ paddd m1, m7
>+ pshufd m0, m0, q3120
>+ pshufd m1, m1, q3120
>+ punpcklqdq m0, m1
>+ paddd m0, m5
>+ psrad m0, 8
>+
>+ pmaddwd m2, coef3
>+ pmaddwd m3, coef3
>+ pshufd m6, m2, q2301
>+ pshufd m7, m3, q2301
>+ paddd m2, m6
>+ paddd m3, m7
>+ pshufd m2, m2, q3120
>+ pshufd m3, m3, q3120
>+ punpcklqdq m2, m3
>+ paddd m2, m5
>+ psrad m2, 8
>+ packssdw m0, m2
>+ movu [r1 + 1 * 16], m0
>+
>+ RET
>+
>+;------------------------------------------------------
>+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
>+;------------------------------------------------------
> INIT_XMM ssse3
> %if ARCH_X86_64
> cglobal dst4, 3, 4, 8+2
>diff -r 6245476add8f -r c9debeec039e source/common/x86/dct8.h
>--- a/source/common/x86/dct8.h Wed Jun 10 11:54:27 2015 +0530
>+++ b/source/common/x86/dct8.h Wed Jun 10 07:55:00 2015 -0700
>@@ -25,6 +25,7 @@
> #define X265_DCT8_H
> void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
>+void x265_dst4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
> void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
> void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
>_______________________________________________
>x265-devel mailing list
>x265-devel at videolan.org
>https://mailman.videolan.org/listinfo/x265-devel
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20150610/c1d62155/attachment-0001.html>
More information about the x265-devel
mailing list