[x265] [PATCH] asm: optimize dct4 by replacing each pshufd (latency 4-6) + pshufhw (latency 2) pair with a single pshufb (latency 1)
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Aug 27 07:08:51 CEST 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1409115913 -19800
# Wed Aug 27 10:35:13 2014 +0530
# Node ID 9e19a59e1de22bc39924365626c48fdb2557592e
# Parent 32891b95f6693a39afbdf7929e12e3e0c6e990d1
asm: optimize dct4 by replacing each pshufd (latency 4-6) + pshufhw (latency 2) pair with a single pshufb (latency 1)
diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Aug 26 15:03:38 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 10:35:13 2014 +0530
@@ -1375,7 +1375,6 @@
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;
- p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
@@ -1388,6 +1387,7 @@
INTRA_ANG_SSSE3(ssse3);
+ p.dct[DCT_4x4] = x265_dct4_ssse3;
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
p.count_nonzero = x265_count_nonzero_ssse3;
@@ -1545,7 +1545,6 @@
p.transpose[BLOCK_64x64] = x265_transpose64_sse2;
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;
- p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
@@ -1582,6 +1581,7 @@
p.chroma_p2s[X265_CSP_I422] = x265_chroma_p2s_ssse3;
p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
+ p.dct[DCT_4x4] = x265_dct4_ssse3;
p.dct[DST_4x4] = x265_dst4_ssse3;
p.idct[IDCT_8x8] = x265_idct8_ssse3;
p.count_nonzero = x265_count_nonzero_ssse3;
diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Tue Aug 26 15:03:38 2014 -0500
+++ b/source/common/x86/dct8.asm Wed Aug 27 10:35:13 2014 +0530
@@ -30,6 +30,8 @@
SECTION_RODATA 32
+dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
+
tab_dct4: times 4 dw 64, 64
times 4 dw 83, 36
times 4 dw 64, -64
@@ -98,7 +100,7 @@
;------------------------------------------------------
;void dct4(int16_t *src, int32_t *dst, intptr_t stride)
;------------------------------------------------------
-INIT_XMM sse2
+INIT_XMM ssse3
cglobal dct4, 3, 4, 8
%if BIT_DEPTH == 10
%define DCT_SHIFT 3
@@ -112,22 +114,21 @@
add r2d, r2d
lea r3, [tab_dct4]
+ mova m3, [dct4_shuf]
mova m4, [r3 + 0 * 16]
mova m5, [r3 + 1 * 16]
mova m6, [r3 + 2 * 16]
movh m0, [r0 + 0 * r2]
movh m1, [r0 + 1 * r2]
punpcklqdq m0, m1
- pshufd m0, m0, 0xD8
- pshufhw m0, m0, 0xB1
+ pshufb m0, m3
lea r0, [r0 + 2 * r2]
movh m1, [r0]
movh m2, [r0 + r2]
punpcklqdq m1, m2
- pshufd m1, m1, 0xD8
- pshufhw m1, m1, 0xB1
+ pshufb m1, m3
punpcklqdq m2, m0, m1
punpckhqdq m0, m1
@@ -140,8 +141,8 @@
paddd m3, m7
psrad m3, DCT_SHIFT
packssdw m0, m3
- pshufd m0, m0, 0xD8
- pshufhw m0, m0, 0xB1
+ mova m3, [dct4_shuf]
+ pshufb m0, m3
pmaddwd m1, m6
paddd m1, m7
psrad m1, DCT_SHIFT
@@ -149,9 +150,8 @@
paddd m2, m7
psrad m2, DCT_SHIFT
packssdw m1, m2
- pshufd m1, m1, 0xD8
- pshufhw m1, m1, 0xB1
+ pshufb m1, m3
punpcklqdq m2, m0, m1
punpckhqdq m0, m1
diff -r 32891b95f669 -r 9e19a59e1de2 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Tue Aug 26 15:03:38 2014 -0500
+++ b/source/common/x86/dct8.h Wed Aug 27 10:35:13 2014 +0530
@@ -24,7 +24,7 @@
#ifndef X265_DCT8_H
#define X265_DCT8_H
-void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
More information about the x265-devel mailing list