[x265] [PATCH] idct8 sse2 assembler primitive
dtyx265 at gmail.com
Thu Nov 20 23:40:30 CET 2014
# HG changeset patch
# User David T Yuen <dtyx265 at gmail.com>
# Date 1416523024 28800
# Node ID 0dca25ca4cf6f1cd54f9828b430c320ac9ee072b
# Parent 1d17ec0cb9548194b90495c5d7c94552c71abbf5
idct8 sse2 assembler primitive
derived from the assembly generated by gcc (Debian 4.7.2-5) for dct-sse3.cpp
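
For reviewers cross-checking the assembly: the tab_idct8 entries added below are the coefficient pairs of the HEVC 8x8 inverse transform (89/75, 50/18, 75/-18, -89/-50, 50/-89, 18/75, 18/-50, 75/-89 for the odd half; 64/64, 64/-64, 83/36, 36/-83 for the even half), and the rounding terms match the usual two-pass shifts: 7 after the first pass (pd_64) and 12 - (BIT_DEPTH - 8) after the second (pd_2048 at 8-bit, pd_512 at 10-bit). Below is a minimal C sketch of that computation, assuming the standard partial-butterfly form used by the C primitives; the function names are illustrative, not the exact helpers in dct-sse3.cpp.

#include <cstdint>
#include <cstring>
#include <algorithm>

// One pass of the 8x8 inverse transform (partial butterfly). The multipliers
// correspond to the pairs stored in tab_idct8 in the assembly.
static void inversePass8(const int16_t* src, int16_t* dst, int shift)
{
    const int add = 1 << (shift - 1);   // pd_64 in pass 1, IDCT_ADD in pass 2
    for (int j = 0; j < 8; j++)
    {
        int O[4], E[4], EE[2], EO[2];
        O[0] = 89 * src[8] + 75 * src[24] + 50 * src[40] + 18 * src[56];
        O[1] = 75 * src[8] - 18 * src[24] - 89 * src[40] - 50 * src[56];
        O[2] = 50 * src[8] - 89 * src[24] + 18 * src[40] + 75 * src[56];
        O[3] = 18 * src[8] - 50 * src[24] + 75 * src[40] - 89 * src[56];

        EO[0] = 83 * src[16] + 36 * src[48];
        EO[1] = 36 * src[16] - 83 * src[48];
        EE[0] = 64 * src[0]  + 64 * src[32];
        EE[1] = 64 * src[0]  - 64 * src[32];

        E[0] = EE[0] + EO[0];
        E[3] = EE[0] - EO[0];
        E[1] = EE[1] + EO[1];
        E[2] = EE[1] - EO[1];
        for (int k = 0; k < 4; k++)
        {
            // saturate to int16_t, as packssdw does in the assembly
            dst[k]     = (int16_t)std::min(32767, std::max(-32768, (E[k]     + O[k]     + add) >> shift));
            dst[k + 4] = (int16_t)std::min(32767, std::max(-32768, (E[3 - k] - O[3 - k] + add) >> shift));
        }
        src++;
        dst += 8;
    }
}

// Two passes with the same rounding as the assembly: 7 for the first pass,
// 12 - (bitDepth - 8) for the second (12 at 8-bit, 10 at 10-bit).
static void idct8_ref(const int16_t* src, int16_t* dst, intptr_t dstStride, int bitDepth)
{
    int16_t coef[8 * 8];
    int16_t block[8 * 8];
    inversePass8(src, coef, 7);
    inversePass8(coef, block, 12 - (bitDepth - 8));
    for (int i = 0; i < 8; i++)
        memcpy(&dst[i * dstStride], &block[i * 8], 8 * sizeof(int16_t));
}

Each pass transforms the columns of its input and writes rows, so the two passes yield the full 2-D inverse transform without an explicit transpose; the punpck/packssdw sequences in the assembly perform the same reordering and 16-bit saturation of the intermediate.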
diff -r 1d17ec0cb954 -r 0dca25ca4cf6 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Thu Nov 20 14:37:04 2014 -0800
@@ -1376,6 +1376,7 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
LUMA_SS_FILTERS(_sse2);
}
@@ -1565,6 +1566,7 @@
p.dct[DCT_4x4] = x265_dct4_sse2;
p.idct[IDCT_4x4] = x265_idct4_sse2;
p.idct[IDST_4x4] = x265_idst4_sse2;
+ p.idct[IDCT_8x8] = x265_idct8_sse2;
p.planecopy_sp = x265_downShift_16_sse2;
p.copy_shl[BLOCK_4x4] = x265_copy_shl_4_sse2;
diff -r 1d17ec0cb954 -r 0dca25ca4cf6 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/dct8.asm Thu Nov 20 14:37:04 2014 -0800
@@ -302,6 +302,19 @@
pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
+tab_idct8: times 4 dw 89, 75
+ times 4 dw 50, 18
+ times 4 dw 75, -18
+ times 4 dw -89, -50
+ times 4 dw 50, -89
+ times 4 dw 18, 75
+ times 4 dw 18, -50
+ times 4 dw 75, -89
+ times 4 dw 64, 64
+ times 4 dw 64, -64
+ times 4 dw 83, 36
+ times 4 dw 36, -83
+
SECTION .text
cextern pd_1
cextern pd_2
@@ -974,6 +987,380 @@
RET
;-------------------------------------------------------
+; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_XMM sse2
+
+%if BIT_DEPTH == 10
+ %define IDCT_SHIFT 10
+ %define IDCT_ADD pd_512
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+ %define IDCT_ADD pd_2048
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+
+cglobal idct8, 3,7, 16
+ lea r2, [r2 + r2] ;set r2 to index of 1
+ lea r4, [r2 + r2] ;set r4 to index of 2
+ lea r3, [r4 + r2] ;set r3 to index of 3
+ lea r4, [r4 + r3] ;set r4 to index of 5
+ mova m9, [r0 + 16]
+ movu m1, [r0 + 48]
+ mova m7, m9
+ punpcklwd m7, m1
+ punpckhwd m9, m1
+ mova m14, [tab_idct8]
+ mova m3, m14
+ pmaddwd m14, m7
+ pmaddwd m3, m9
+ mova m0, [r0 + 80]
+ mova m10, [r0 + 112]
+ mova m2, m0
+ punpcklwd m2, m10
+ punpckhwd m0, m10
+ mova m15, [tab_idct8 + 16]
+ mova m11, [tab_idct8 + 16]
+ pmaddwd m15, m2
+ mova m4, [tab_idct8 + 32]
+ pmaddwd m11, m0
+ mova m1, [tab_idct8 + 32]
+ paddd m15, m14
+ mova m5, [tab_idct8 + 64]
+ mova m12, [tab_idct8 + 64]
+ paddd m11, m3
+ mova [rsp - 72], m11
+ mova [rsp - 88], m15
+ pmaddwd m4, m7
+ pmaddwd m1, m9
+ mova m14, [tab_idct8 + 48]
+ mova m3, [tab_idct8 + 48]
+ pmaddwd m14, m2
+ pmaddwd m3, m0
+ paddd m14, m4
+ paddd m3, m1
+ mova [rsp - 40], m3
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8 + 96]
+ mova m6, [tab_idct8 + 80]
+ pmaddwd m12, m7
+ pmaddwd m7, [tab_idct8 + 96]
+ mova m4, [tab_idct8 + 80]
+ pmaddwd m6, m2
+ paddd m6, m12
+ pmaddwd m2, [tab_idct8 + 112]
+ paddd m7, m2
+ mova [rsp - 24], m6
+ pmaddwd m4, m0
+ pmaddwd m0, [tab_idct8 + 112]
+ paddd m9, m0
+ paddd m5, m4
+ mova m6, [r0]
+ mova m0, [r0 + 64]
+ mova m4, m6
+ punpcklwd m4, m0
+ punpckhwd m6, m0
+ mova m12, [r0 + 32]
+ mova m0, [r0 + 96]
+ mova m13, m12
+ mova m8, [tab_idct8 + 128]
+ punpcklwd m13, m0
+ mova m10, [tab_idct8 + 128]
+ punpckhwd m12, m0
+ pmaddwd m8, m4
+ mova m3, m8
+ pmaddwd m4, [tab_idct8 + 144]
+ pmaddwd m10, m6
+ mova m2, [tab_idct8 + 160]
+ mova m1, m10
+ pmaddwd m6, [tab_idct8 + 144]
+ mova m0, [tab_idct8 + 160]
+ pmaddwd m2, m13
+ paddd m3, m2
+ psubd m8, m2
+ mova m2, m6
+ pmaddwd m13, [tab_idct8 + 176]
+ pmaddwd m0, m12
+ paddd m1, m0
+ psubd m10, m0
+ mova m0, m4
+ pmaddwd m12, [tab_idct8 + 176]
+ paddd m3, [pd_64]
+ paddd m1, [pd_64]
+ paddd m8, [pd_64]
+ paddd m10, [pd_64]
+ paddd m0, m13
+ paddd m2, m12
+ paddd m0, [pd_64]
+ paddd m2, [pd_64]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [pd_64]
+ paddd m6, [pd_64]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, 7
+ paddd m15, m3
+ psubd m3, [rsp - 88]
+ psrad m15, 7
+ paddd m12, m7
+ psrad m12, 7
+ paddd m11, m1
+ mova m13, m14
+ psrad m11, 7
+ packssdw m15, m11
+ psubd m1, [rsp - 72]
+ psrad m1, 7
+ mova m11, [rsp - 40]
+ paddd m14, m0
+ psrad m14, 7
+ psubd m0, m13
+ psrad m0, 7
+ paddd m11, m2
+ mova m13, [rsp - 24]
+ psrad m11, 7
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, 7
+ psrad m6, 7
+ paddd m11, m5
+ psrad m11, 7
+ packssdw m13, m11
+ mova m11, m10
+ psubd m4, [rsp - 24]
+ psubd m10, m9
+ psrad m4, 7
+ psrad m10, 7
+ packssdw m4, m6
+ packssdw m8, m10
+ paddd m11, m9
+ psrad m11, 7
+ packssdw m12, m11
+ psubd m2, [rsp - 40]
+ mova m5, m15
+ psrad m2, 7
+ packssdw m0, m2
+ mova m2, m14
+ psrad m3, 7
+ packssdw m3, m1
+ mova m6, m13
+ punpcklwd m5, m8
+ punpcklwd m2, m4
+ mova m1, m12
+ punpcklwd m6, m0
+ punpcklwd m1, m3
+ mova m9, m5
+ punpckhwd m13, m0
+ mova m0, m2
+ punpcklwd m9, m6
+ punpckhwd m5, m6
+ punpcklwd m0, m1
+ punpckhwd m2, m1
+ punpckhwd m15, m8
+ mova m1, m5
+ punpckhwd m14, m4
+ punpckhwd m12, m3
+ mova m6, m9
+ punpckhwd m9, m0
+ punpcklwd m1, m2
+ mova m4, [tab_idct8]
+ punpckhwd m5, m2
+ punpcklwd m6, m0
+ mova m2, m15
+ mova m0, m14
+ mova m7, m9
+ punpcklwd m2, m13
+ punpcklwd m0, m12
+ punpcklwd m7, m5
+ punpckhwd m14, m12
+ mova m10, m2
+ punpckhwd m15, m13
+ punpckhwd m9, m5
+ pmaddwd m4, m7
+ mova m13, m1
+ punpckhwd m2, m0
+ punpcklwd m10, m0
+ mova m0, m15
+ punpckhwd m15, m14
+ mova m12, m1
+ mova m3, [tab_idct8]
+ punpcklwd m0, m14
+ pmaddwd m3, m9
+ mova m11, m2
+ punpckhwd m2, m15
+ punpcklwd m11, m15
+ mova m8, [tab_idct8 + 16]
+ punpcklwd m13, m0
+ punpckhwd m12, m0
+ pmaddwd m8, m11
+ paddd m8, m4
+ mova [rsp - 88], m8
+ mova m4, [tab_idct8 + 32]
+ pmaddwd m4, m7
+ mova m15, [tab_idct8 + 32]
+ mova m5, [tab_idct8 + 16]
+ pmaddwd m15, m9
+ pmaddwd m5, m2
+ paddd m5, m3
+ mova [rsp - 72], m5
+ mova m14, [tab_idct8 + 48]
+ mova m5, [tab_idct8 + 48]
+ pmaddwd m14, m11
+ paddd m14, m4
+ mova [rsp - 56], m14
+ pmaddwd m5, m2
+ paddd m5, m15
+ mova [rsp - 40], m5
+ mova m15, [tab_idct8 + 64]
+ mova m5, [tab_idct8 + 64]
+ pmaddwd m15, m7
+ pmaddwd m7, [tab_idct8 + 96]
+ pmaddwd m5, m9
+ pmaddwd m9, [tab_idct8 + 96]
+ mova m4, [tab_idct8 + 80]
+ pmaddwd m4, m2
+ paddd m5, m4
+ mova m4, m6
+ mova m8, [tab_idct8 + 80]
+ punpckhwd m6, m10
+ pmaddwd m2, [tab_idct8 + 112]
+ punpcklwd m4, m10
+ paddd m9, m2
+ pmaddwd m8, m11
+ mova m10, [tab_idct8 + 128]
+ paddd m8, m15
+ pmaddwd m11, [tab_idct8 + 112]
+ paddd m7, m11
+ mova [rsp - 24], m8
+ pmaddwd m10, m6
+ pmaddwd m6, [tab_idct8 + 144]
+ mova m1, m10
+ mova m8, [tab_idct8 + 128]
+ mova m3, [tab_idct8 + 160]
+ pmaddwd m8, m4
+ pmaddwd m4, [tab_idct8 + 144]
+ mova m0, m8
+ mova m2, [tab_idct8 + 160]
+ pmaddwd m3, m13
+ psubd m8, m3
+ paddd m0, m3
+ mova m3, m6
+ pmaddwd m13, [tab_idct8 + 176]
+ pmaddwd m2, m12
+ paddd m1, m2
+ psubd m10, m2
+ mova m2, m4
+ pmaddwd m12, [tab_idct8 + 176]
+ paddd m0, [IDCT_ADD]
+ paddd m1, [IDCT_ADD]
+ paddd m8, [IDCT_ADD]
+ paddd m10, [IDCT_ADD]
+ paddd m2, m13
+ paddd m3, m12
+ paddd m2, [IDCT_ADD]
+ paddd m3, [IDCT_ADD]
+ psubd m4, m13
+ psubd m6, m12
+ paddd m4, [IDCT_ADD]
+ paddd m6, [IDCT_ADD]
+ mova m15, [rsp - 88]
+ mova m12, m8
+ psubd m8, m7
+ psrad m8, IDCT_SHIFT
+ mova m11, [rsp - 72]
+ paddd m15, m0
+ psrad m15, IDCT_SHIFT
+ psubd m0, [rsp - 88]
+ psrad m0, IDCT_SHIFT
+ paddd m12, m7
+ paddd m11, m1
+ mova m14, [rsp - 56]
+ psrad m11, IDCT_SHIFT
+ packssdw m15, m11
+ psubd m1, [rsp - 72]
+ psrad m1, IDCT_SHIFT
+ mova m11, [rsp - 40]
+ paddd m14, m2
+ psrad m14, IDCT_SHIFT
+ packssdw m0, m1
+ psrad m12, IDCT_SHIFT
+ psubd m2, [rsp - 56]
+ paddd m11, m3
+ mova m13, [rsp - 24]
+ psrad m11, IDCT_SHIFT
+ packssdw m14, m11
+ mova m11, m6
+ psubd m6, m5
+ paddd m13, m4
+ psrad m13, IDCT_SHIFT
+ mova m1, m15
+ paddd m11, m5
+ psrad m11, IDCT_SHIFT
+ packssdw m13, m11
+ mova m11, m10
+ psubd m10, m9
+ psrad m10, IDCT_SHIFT
+ packssdw m8, m10
+ psrad m6, IDCT_SHIFT
+ psubd m4, [rsp - 24]
+ paddd m11, m9
+ psrad m11, IDCT_SHIFT
+ packssdw m12, m11
+ punpcklwd m1, m14
+ mova m5, m13
+ psrad m4, IDCT_SHIFT
+ packssdw m4, m6
+ psubd m3, [rsp - 40]
+ psrad m2, IDCT_SHIFT
+ mova m6, m8
+ psrad m3, IDCT_SHIFT
+ punpcklwd m5, m12
+ packssdw m2, m3
+ punpcklwd m6, m4
+ punpckhwd m8, m4
+ mova m4, m1
+ mova m3, m2
+ punpckhdq m1, m5
+ punpckldq m4, m5
+ punpcklwd m3, m0
+ punpckhwd m2, m0
+ mova m0, m6
+ lea r0, [r4 + r2 * 2] ;set r0 to index of 7
+ movq [r1], m4
+ punpckhwd m15, m14
+ movhps [r1 + r2], m4
+ punpckhdq m0, m3
+ movq [r1 + r2 * 2], m1
+ punpckhwd m13, m12
+ movhps [r1 + r3], m1
+ mova m1, m6
+ punpckldq m1, m3
+ movq [r1 + 8], m1
+ movhps [r1 + r2 + 8], m1
+ movq [r1 + r2 * 2 + 8], m0
+ movhps [r1 + r3 + 8], m0
+ mova m0, m15
+ punpckhdq m15, m13
+ punpckldq m0, m13
+ movq [r1 + r2 * 4], m0
+ movhps [r1 + r4], m0
+ mova m0, m8
+ punpckhdq m8, m2
+ movq [r1 + r3 * 2], m15
+ punpckldq m0, m2
+ movhps [r1 + r0], m15
+ movq [r1 + r2 * 4 + 8], m0
+ movhps [r1 + r4 + 8], m0
+ movq [r1 + r3 * 2 + 8], m8
+ movhps [r1 + r0 + 8], m8
+ RET
+%undef IDCT_SHIFT
+%undef IDCT_ADD
+
+;-------------------------------------------------------
; void idct8(int16_t *src, int16_t *dst, intptr_t stride)
;-------------------------------------------------------
INIT_XMM ssse3
diff -r 1d17ec0cb954 -r 0dca25ca4cf6 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Nov 20 20:04:02 2014 +0530
+++ b/source/common/x86/dct8.h Thu Nov 20 14:37:04 2014 -0800
@@ -35,6 +35,7 @@
void x265_idst4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct4_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct4_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
+void x265_idct8_sse2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct8_ssse3(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct8_avx2(const int16_t* src, int16_t* dst, intptr_t stride);
void x265_idct16_avx2(const int16_t* src, int16_t* dst, intptr_t stride);