[x265] [PATCH] asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1)
Dnyaneshwar Gorade
dnyaneshwar at multicorewareinc.com
Wed Aug 27 07:08:35 CEST 2014
Sorry, please ignore this patch; I forgot one more small modification.
On Wed, Aug 27, 2014 at 10:27 AM, <dnyaneshwar at multicorewareinc.com> wrote:
> # HG changeset patch
> # User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
> # Date 1409115349 -19800
> # Wed Aug 27 10:25:49 2014 +0530
> # Node ID f49ed93e3daff100903e5fd7aa1bd874b9e79caf
> # Parent 32891b95f6693a39afbdf7929e12e3e0c6e990d1
> asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2)
> instructions with pshufb(latency 1)
>
> diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/asm-primitives.cpp
> --- a/source/common/x86/asm-primitives.cpp Tue Aug 26 15:03:38 2014
> -0500
> +++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 10:25:49 2014
> +0530
> @@ -1375,7 +1375,7 @@
> p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
> p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
>
> - p.dct[DCT_4x4] = x265_dct4_sse2;
> + p.dct[DCT_4x4] = x265_dct4_ssse3;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
>
> @@ -1545,7 +1545,7 @@
> p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
> p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
> p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
> - p.dct[DCT_4x4] = x265_dct4_sse2;
> + p.dct[DCT_4x4] = x265_dct4_ssse3;
> p.idct[IDCT_4x4] = x265_idct4_sse2;
> p.idct[IDST_4x4] = x265_idst4_sse2;
> p.planecopy_sp = x265_downShift_16_sse2;
> diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.asm
> --- a/source/common/x86/dct8.asm Tue Aug 26 15:03:38 2014 -0500
> +++ b/source/common/x86/dct8.asm Wed Aug 27 10:25:49 2014 +0530
> @@ -30,6 +30,8 @@
>
> SECTION_RODATA 32
>
> +dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
> +
> tab_dct4: times 4 dw 64, 64
> times 4 dw 83, 36
> times 4 dw 64, -64
> @@ -98,7 +100,7 @@
> ;------------------------------------------------------
> ;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
> ;------------------------------------------------------
> -INIT_XMM sse2
> +INIT_XMM ssse3
> cglobal dct4, 3, 4, 8
> %if BIT_DEPTH == 10
> %define DCT_SHIFT 3
> @@ -112,22 +114,21 @@
> add r2d, r2d
> lea r3, [tab_dct4]
>
> + mova m3, [dct4_shuf]
> mova m4, [r3 + 0 * 16]
> mova m5, [r3 + 1 * 16]
> mova m6, [r3 + 2 * 16]
> movh m0, [r0 + 0 * r2]
> movh m1, [r0 + 1 * r2]
> punpcklqdq m0, m1
> - pshufd m0, m0, 0xD8
> - pshufhw m0, m0, 0xB1
> + pshufb m0, m3
>
> lea r0, [r0 + 2 * r2]
> movh m1, [r0]
> movh m2, [r0 + r2]
> punpcklqdq m1, m2
> - pshufd m1, m1, 0xD8
> - pshufhw m1, m1, 0xB1
>
> + pshufb m1, m3
> punpcklqdq m2, m0, m1
> punpckhqdq m0, m1
>
> @@ -140,8 +141,8 @@
> paddd m3, m7
> psrad m3, DCT_SHIFT
> packssdw m0, m3
> - pshufd m0, m0, 0xD8
> - pshufhw m0, m0, 0xB1
> + mova m3, [dct4_shuf]
> + pshufb m0, m3
> pmaddwd m1, m6
> paddd m1, m7
> psrad m1, DCT_SHIFT
> @@ -149,9 +150,8 @@
> paddd m2, m7
> psrad m2, DCT_SHIFT
> packssdw m1, m2
> - pshufd m1, m1, 0xD8
> - pshufhw m1, m1, 0xB1
>
> + pshufb m1, m3
> punpcklqdq m2, m0, m1
> punpckhqdq m0, m1
>
> diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.h
> --- a/source/common/x86/dct8.h Tue Aug 26 15:03:38 2014 -0500
> +++ b/source/common/x86/dct8.h Wed Aug 27 10:25:49 2014 +0530
> @@ -24,7 +24,7 @@
> #ifndef X265_DCT8_H
> #define X265_DCT8_H
>
> -void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
> +void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
> void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
> void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20140827/18fcb53e/attachment-0001.html>
More information about the x265-devel mailing list