<div dir="ltr"><div class="gmail_default" style="font-family:trebuchet ms,sans-serif;font-size:small">Sorry, please ignore this patch; I forgot one more small modification.<br></div></div><div class="gmail_extra"><br><br><div class="gmail_quote">
On Wed, Aug 27, 2014 at 10:27 AM, <span dir="ltr">&lt;<a href="mailto:dnyaneshwar@multicorewareinc.com" target="_blank">dnyaneshwar@multicorewareinc.com</a>&gt;</span> wrote:<br><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">
# HG changeset patch<br>
# User Dnyaneshwar G &lt;<a href="mailto:dnyaneshwar@multicorewareinc.com">dnyaneshwar@multicorewareinc.com</a>&gt;<br>
# Date 1409115349 -19800<br>
# Wed Aug 27 10:25:49 2014 +0530<br>
# Node ID f49ed93e3daff100903e5fd7aa1bd874b9e79caf<br>
# Parent 32891b95f6693a39afbdf7929e12e3e0c6e990d1<br>
asm: optimize dct4, replaced pshufd(latency 4-6)+pshufhw(latency 2) instructions with pshufb(latency 1)<br>
<br>
diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/asm-primitives.cpp<br>
--- a/source/common/x86/asm-primitives.cpp Tue Aug 26 15:03:38 2014 -0500<br>
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 10:25:49 2014 +0530<br>
@@ -1375,7 +1375,7 @@<br>
p.calcrecon[BLOCK_16x16] = x265_calcRecons16_sse2;<br>
p.calcrecon[BLOCK_32x32] = x265_calcRecons32_sse2;<br>
<br>
- p.dct[DCT_4x4] = x265_dct4_sse2;<br>
+ p.dct[DCT_4x4] = x265_dct4_ssse3;<br>
p.idct[IDCT_4x4] = x265_idct4_sse2;<br>
p.idct[IDST_4x4] = x265_idst4_sse2;<br>
<br>
@@ -1545,7 +1545,7 @@<br>
p.transpose[BLOCK_64x64] = x265_transpose64_sse2;<br>
p.ssim_4x4x2_core = x265_pixel_ssim_4x4x2_core_sse2;<br>
p.ssim_end_4 = x265_pixel_ssim_end4_sse2;<br>
- p.dct[DCT_4x4] = x265_dct4_sse2;<br>
+ p.dct[DCT_4x4] = x265_dct4_ssse3;<br>
p.idct[IDCT_4x4] = x265_idct4_sse2;<br>
p.idct[IDST_4x4] = x265_idst4_sse2;<br>
p.planecopy_sp = x265_downShift_16_sse2;<br>
diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.asm<br>
--- a/source/common/x86/dct8.asm Tue Aug 26 15:03:38 2014 -0500<br>
+++ b/source/common/x86/dct8.asm Wed Aug 27 10:25:49 2014 +0530<br>
@@ -30,6 +30,8 @@<br>
<br>
SECTION_RODATA 32<br>
<br>
+dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13<br>
+<br>
tab_dct4: times 4 dw 64, 64<br>
times 4 dw 83, 36<br>
times 4 dw 64, -64<br>
@@ -98,7 +100,7 @@<br>
;------------------------------------------------------<br>
;void dct4(int16_t *src, int32_t *dst, intptr_t stride)<br>
;------------------------------------------------------<br>
-INIT_XMM sse2<br>
+INIT_XMM ssse3<br>
cglobal dct4, 3, 4, 8<br>
%if BIT_DEPTH == 10<br>
%define DCT_SHIFT 3<br>
@@ -112,22 +114,21 @@<br>
add r2d, r2d<br>
lea r3, [tab_dct4]<br>
<br>
+ mova m3, [dct4_shuf]<br>
mova m4, [r3 + 0 * 16]<br>
mova m5, [r3 + 1 * 16]<br>
mova m6, [r3 + 2 * 16]<br>
movh m0, [r0 + 0 * r2]<br>
movh m1, [r0 + 1 * r2]<br>
punpcklqdq m0, m1<br>
- pshufd m0, m0, 0xD8<br>
- pshufhw m0, m0, 0xB1<br>
+ pshufb m0, m3<br>
<br>
lea r0, [r0 + 2 * r2]<br>
movh m1, [r0]<br>
movh m2, [r0 + r2]<br>
punpcklqdq m1, m2<br>
- pshufd m1, m1, 0xD8<br>
- pshufhw m1, m1, 0xB1<br>
<br>
+ pshufb m1, m3<br>
punpcklqdq m2, m0, m1<br>
punpckhqdq m0, m1<br>
<br>
@@ -140,8 +141,8 @@<br>
paddd m3, m7<br>
psrad m3, DCT_SHIFT<br>
packssdw m0, m3<br>
- pshufd m0, m0, 0xD8<br>
- pshufhw m0, m0, 0xB1<br>
+ mova m3, [dct4_shuf]<br>
+ pshufb m0, m3<br>
pmaddwd m1, m6<br>
paddd m1, m7<br>
psrad m1, DCT_SHIFT<br>
@@ -149,9 +150,8 @@<br>
paddd m2, m7<br>
psrad m2, DCT_SHIFT<br>
packssdw m1, m2<br>
- pshufd m1, m1, 0xD8<br>
- pshufhw m1, m1, 0xB1<br>
<br>
+ pshufb m1, m3<br>
punpcklqdq m2, m0, m1<br>
punpckhqdq m0, m1<br>
<br>
diff -r 32891b95f669 -r f49ed93e3daf source/common/x86/dct8.h<br>
--- a/source/common/x86/dct8.h Tue Aug 26 15:03:38 2014 -0500<br>
+++ b/source/common/x86/dct8.h Wed Aug 27 10:25:49 2014 +0530<br>
@@ -24,7 +24,7 @@<br>
#ifndef X265_DCT8_H<br>
#define X265_DCT8_H<br>
<br>
-void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);<br>
+void x265_dct4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);<br>
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);<br>
void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);<br>
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);<br>
</blockquote></div><br></div>