[x265-commits] [x265] asm: idct[8x8] sse2 12232 -> 3500 over c code 3550 -> 3500 over intrinsic
David T Yuen
dtyx265 at gmail.com
Sat Dec 20 18:52:37 CET 2014
details: http://hg.videolan.org/x265/rev/7b816fdb393d
branches:
changeset: 8996:7b816fdb393d
user: David T Yuen <dtyx265 at gmail.com>
date: Thu Dec 18 15:51:34 2014 -0800
description:
asm: idct[8x8] sse2 12232 -> 3500 over c code 3550 -> 3500 over intrinsic
Subject: [x265] fix 4:4:4 rd<=1
details: http://hg.videolan.org/x265/rev/8d2f418829c8
branches:
changeset: 8997:8d2f418829c8
user: Satoshi Nakagawa <nakagawa424 at oki.com>
date: Sat Dec 20 21:27:14 2014 +0900
description:
fix 4:4:4 rd<=1
diffstat:
source/common/x86/asm-primitives.cpp | 6 +
source/common/x86/dct8.asm | 376 +++++++++++++++++++++++++++++++++++
source/common/x86/dct8.h | 1 +
source/encoder/search.cpp | 6 +-
4 files changed, 387 insertions(+), 2 deletions(-)
diffs (truncated from 445 to 300 lines):
diff -r 78ae7996a1ce -r 8d2f418829c8 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Dec 17 14:31:50 2014 -0600
+++ b/source/common/x86/asm-primitives.cpp Sat Dec 20 21:27:14 2014 +0900
@@ -1387,6 +1387,9 @@ void Setup_Assembly_Primitives(EncoderPr
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
+#if X86_64
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
+#endif
p.idct[IDST_4x4] = x265_idst4_sse2;
LUMA_SS_FILTERS(_sse2);
@@ -1593,6 +1596,9 @@ void Setup_Assembly_Primitives(EncoderPr
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
+#if X86_64
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
+#endif
p.idct[IDST_4x4] = x265_idst4_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
diff -r 78ae7996a1ce -r 8d2f418829c8 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Dec 17 14:31:50 2014 -0600
+++ b/source/common/x86/dct8.asm Sat Dec 20 21:27:14 2014 +0900
@@ -976,6 +976,382 @@ cglobal dct8, 3,6,7,0-16*mmsize
;-------------------------------------------------------
; void idct8(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
+%if ARCH_X86_64
+INIT_XMM sse2
+%if BIT_DEPTH == 10
+ %define IDCT_SHIFT 10
+ %define IDCT_ADD pd_512
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+ %define IDCT_ADD pd_2048
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+cglobal idct8, 3, 6, 16, 0-5*mmsize
+ mova m9, [r0 + 1 * mmsize]
+ mova m1, [r0 + 3 * mmsize]
+ mova m7, m9
+ punpcklwd m7, m1
+ punpckhwd m9, m1
+ mova m14, [tab_idct8_3]
+ mova m3, m14
+ pmaddwd m14, m7
+ pmaddwd m3, m9
+ mova m0, [r0 + 5 * mmsize]
+ mova m10, [r0 + 7 * mmsize]
+ mova m2, m0
+ punpcklwd m2, m10
+ punpckhwd m0, m10
+ mova m15, [tab_idct8_3 + 1 * mmsize]
+ mova m11, [tab_idct8_3 + 1 * mmsize]
+ pmaddwd m15, m2
+ mova m4, [tab_idct8_3 + 2 * mmsize]
+ pmaddwd m11, m0
+ mova m1, [tab_idct8_3 + 2 * mmsize]
+ paddd m15, m14
+ mova m5, [tab_idct8_3 + 4 * mmsize]
+ mova m12, [tab_idct8_3 + 4 * mmsize]
+ paddd m11, m3
+ mova [rsp + 0 * mmsize], m11
+ mova [rsp + 1 * mmsize], m15
+ pmaddwd m4, m7
+ pmaddwd m1, m9
+ mova m14, [tab_idct8_3 + 3 * mmsize]
+ mova m3, [tab_idct8_3 + 3 * mmsize]
+ pmaddwd m14, m2
+ pmaddwd m3, m0
+ paddd m14, m4
+ paddd m3, m1
+ mova [rsp + 2 * mmsize], m3
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
+ mova m6, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m12, m7
+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
+ mova m4, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m6, m2
+ paddd m6, m12
+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
+ paddd m7, m2
+ mova [rsp + 3 * mmsize], m6
+ pmaddwd m4, m0
+ pmaddwd m0, [tab_idct8_3 + 7 * mmsize]
+ paddd m9, m0
+ paddd m5, m4
+ mova m6, [r0 + 0 * mmsize]
+ mova m0, [r0 + 4 * mmsize]
+ mova m4, m6
+ punpcklwd m4, m0
+ punpckhwd m6, m0
+ mova m12, [r0 + 2 * mmsize]
+ mova m0, [r0 + 6 * mmsize]
+ mova m13, m12
+ mova m8, [tab_dct4]
+ punpcklwd m13, m0
+ mova m10, [tab_dct4]
+ punpckhwd m12, m0
+ pmaddwd m8, m4
+ mova m3, m8
+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
+ pmaddwd m10, m6
+ mova m2, [tab_dct4 + 1 * mmsize]
+ mova m1, m10
+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
+ mova m0, [tab_dct4 + 1 * mmsize]
+ pmaddwd m2, m13
+ paddd m3, m2
+ psubd m8, m2
+ mova m2, m6
+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
+ pmaddwd m0, m12
+ paddd m1, m0
+ psubd m10, m0
+ mova m0, m4
+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
+ paddd m3, [pd_64]
+ paddd m1, [pd_64]
+ paddd m8, [pd_64]
+ paddd m10, [pd_64]
+ paddd m0, m13
+ paddd m2, m12
+ paddd m0, [pd_64]
+ paddd m2, [pd_64]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [pd_64]
+ paddd m6, [pd_64]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, 7
+ paddd m15, m3
+ psubd m3, [rsp + 1 * mmsize]
+ psrad m15, 7
+ paddd m12, m7
+ psrad m12, 7
+ paddd m11, m1
+ mova m13, m14
+ psrad m11, 7
+ packssdw m15, m11
+ psubd m1, [rsp + 0 * mmsize]
+ psrad m1, 7
+ mova m11, [rsp + 2 * mmsize]
+ paddd m14, m0
+ psrad m14, 7
+ psubd m0, m13
+ psrad m0, 7
+ paddd m11, m2
+ mova m13, [rsp + 3 * mmsize]
+ psrad m11, 7
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, 7
+ psrad m6, 7
+ paddd m11, m5
+ psrad m11, 7
+ packssdw m13, m11
+ mova m11, m10
+ psubd m4, [rsp + 3 * mmsize]
+ psubd m10, m9
+ psrad m4, 7
+ psrad m10, 7
+ packssdw m4, m6
+ packssdw m8, m10
+ paddd m11, m9
+ psrad m11, 7
+ packssdw m12, m11
+ psubd m2, [rsp + 2 * mmsize]
+ mova m5, m15
+ psrad m2, 7
+ packssdw m0, m2
+ mova m2, m14
+ psrad m3, 7
+ packssdw m3, m1
+ mova m6, m13
+ punpcklwd m5, m8
+ punpcklwd m2, m4
+ mova m1, m12
+ punpcklwd m6, m0
+ punpcklwd m1, m3
+ mova m9, m5
+ punpckhwd m13, m0
+ mova m0, m2
+ punpcklwd m9, m6
+ punpckhwd m5, m6
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ punpckhwd m15, m8
+ mova m1, m5
+ punpckhwd m14, m4
+ punpckhwd m12, m3
+ mova m6, m9
+ punpckhwd m9, m0
+ punpcklwd m1, m2
+ mova m4, [tab_idct8_3 + 0 * mmsize]
+ punpckhwd m5, m2
+ punpcklwd m6, m0
+ mova m2, m15
+ mova m0, m14
+ mova m7, m9
+ punpcklwd m2, m13
+ punpcklwd m0, m12
+ punpcklwd m7, m5
+ punpckhwd m14, m12
+ mova m10, m2
+ punpckhwd m15, m13
+ punpckhwd m9, m5
+ pmaddwd m4, m7
+ mova m13, m1
+ punpckhwd m2, m0
+ punpcklwd m10, m0
+ mova m0, m15
+ punpckhwd m15, m14
+ mova m12, m1
+ mova m3, [tab_idct8_3 + 0 * mmsize]
+ punpcklwd m0, m14
+ pmaddwd m3, m9
+ mova m11, m2
+ punpckhwd m2, m15
+ punpcklwd m11, m15
+ mova m8, [tab_idct8_3 + 1 * mmsize]
+ punpcklwd m13, m0
+ punpckhwd m12, m0
+ pmaddwd m8, m11
+ paddd m8, m4
+ mova [rsp + 4 * mmsize], m8
+ mova m4, [tab_idct8_3 + 2 * mmsize]
+ pmaddwd m4, m7
+ mova m15, [tab_idct8_3 + 2 * mmsize]
+ mova m5, [tab_idct8_3 + 1 * mmsize]
+ pmaddwd m15, m9
+ pmaddwd m5, m2
+ paddd m5, m3
+ mova [rsp + 3 * mmsize], m5
+ mova m14, [tab_idct8_3 + 3 * mmsize]
+ mova m5, [tab_idct8_3 + 3 * mmsize]
+ pmaddwd m14, m11
+ paddd m14, m4
+ mova [rsp + 2 * mmsize], m14
+ pmaddwd m5, m2
+ paddd m5, m15
+ mova [rsp + 1 * mmsize], m5
+ mova m15, [tab_idct8_3 + 4 * mmsize]
+ mova m5, [tab_idct8_3 + 4 * mmsize]
+ pmaddwd m15, m7
+ pmaddwd m7, [tab_idct8_3 + 6 * mmsize]
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8_3 + 6 * mmsize]
+ mova m4, [tab_idct8_3 + 5 * mmsize]
+ pmaddwd m4, m2
+ paddd m5, m4
+ mova m4, m6
+ mova m8, [tab_idct8_3 + 5 * mmsize]
+ punpckhwd m6, m10
+ pmaddwd m2, [tab_idct8_3 + 7 * mmsize]
+ punpcklwd m4, m10
+ paddd m9, m2
+ pmaddwd m8, m11
+ mova m10, [tab_dct4]
+ paddd m8, m15
+ pmaddwd m11, [tab_idct8_3 + 7 * mmsize]
+ paddd m7, m11
+ mova [rsp + 0 * mmsize], m8
+ pmaddwd m10, m6
+ pmaddwd m6, [tab_dct4 + 2 * mmsize]
+ mova m1, m10
+ mova m8, [tab_dct4]
+ mova m3, [tab_dct4 + 1 * mmsize]
+ pmaddwd m8, m4
+ pmaddwd m4, [tab_dct4 + 2 * mmsize]
+ mova m0, m8
+ mova m2, [tab_dct4 + 1 * mmsize]
+ pmaddwd m3, m13
+ psubd m8, m3
+ paddd m0, m3
+ mova m3, m6
+ pmaddwd m13, [tab_dct4 + 3 * mmsize]
+ pmaddwd m2, m12
+ paddd m1, m2
+ psubd m10, m2
+ mova m2, m4
+ pmaddwd m12, [tab_dct4 + 3 * mmsize]
+ paddd m0, [IDCT_ADD]
+ paddd m1, [IDCT_ADD]
+ paddd m8, [IDCT_ADD]
+ paddd m10, [IDCT_ADD]
+ paddd m2, m13
+ paddd m3, m12
+ paddd m2, [IDCT_ADD]
+ paddd m3, [IDCT_ADD]
+ psubd m4, m13
More information about the x265-commits
mailing list