[x265] [PATCH] asm: 8bpp and 10bpp code for idct8x8
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Thu Mar 13 11:54:25 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1394013245 -19800
# Wed Mar 05 15:24:05 2014 +0530
# Node ID 5da0d3c12446a75505abd6082ea93ab3e5569993
# Parent 1b6c943dd7e4d5d7f53be90d99596fe004cb3c54
asm: 8bpp and 10bpp code for idct8x8
diff -r 1b6c943dd7e4 -r 5da0d3c12446 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 13 14:40:21 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Mar 05 15:24:05 2014 +0530
@@ -1025,6 +1025,7 @@
INTRA_ANG_SSSE3(ssse3);
p.dct[DST_4x4] = x265_dst4_ssse3;
+ p.idct[IDCT_8x8] = x265_idct8_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -1199,6 +1200,7 @@
p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
p.dct[DST_4x4] = x265_dst4_ssse3;
+ p.idct[IDCT_8x8] = x265_idct8_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r 1b6c943dd7e4 -r 5da0d3c12446 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Mar 13 14:40:21 2014 +0530
+++ b/source/common/x86/dct8.asm Wed Mar 05 15:24:05 2014 +0530
@@ -61,8 +61,26 @@
times 1 dd 50, -89, 18, 75
times 1 dd 18, -50, 75, -89
+tab_idct8_3: times 4 dw 89, 75
+ times 4 dw 50, 18
+ times 4 dw 75, -18
+ times 4 dw -89, -50
+ times 4 dw 50, -89
+ times 4 dw 18, 75
+ times 4 dw 18, -50
+ times 4 dw 75, -89
+
pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
+pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13
+
+tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36
+
+tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50
+ times 1 dw 50, -89, 18, 75, 18, -50, 75, -89
+
+pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15
+
SECTION .text
cextern pd_1
cextern pd_2
@@ -665,3 +683,187 @@
dec r2
jnz .pass2
RET
+
+;-------------------------------------------------------
+; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_XMM ssse3
+
+cglobal patial_butterfly_inverse_internal_pass1
+ movu m0, [r0]
+ movu m1, [r0 + 4 * 32]
+ movu m2, [r0 + 2 * 32]
+ movu m3, [r0 + 6 * 32]
+ packssdw m0, m2
+ packssdw m1, m3
+ punpckhwd m2, m0, m1 ; [2 6]
+ punpcklwd m0, m1 ; [0 4]
+ pmaddwd m1, m0, [r6] ; EE[0]
+ pmaddwd m0, [r6 + 32] ; EE[1]
+ pmaddwd m3, m2, [r6 + 16] ; EO[0]
+ pmaddwd m2, [r6 + 48] ; EO[1]
+
+ paddd m4, m1, m3 ; E[0]
+ psubd m1, m3 ; E[3]
+ paddd m3, m0, m2 ; E[1]
+ psubd m0, m2 ; E[2]
+
+ ;E[K] = E[k] + add
+ mova m5, [pd_64]
+ paddd m0, m5
+ paddd m1, m5
+ paddd m3, m5
+ paddd m4, m5
+
+ movu m2, [r0 + 32]
+ movu m5, [r0 + 5 * 32]
+ packssdw m2, m5
+ movu m5, [r0 + 3 * 32]
+ movu m6, [r0 + 7 * 32]
+ packssdw m5, m6
+ punpcklwd m6, m2, m5 ;[1 3]
+ punpckhwd m2, m5 ;[5 7]
+
+ pmaddwd m5, m6, [r4]
+ pmaddwd m7, m2, [r4 + 16]
+ paddd m5, m7 ; O[0]
+
+ paddd m7, m4, m5
+ psrad m7, 7
+
+ psubd m4, m5
+ psrad m4, 7
+
+ packssdw m7, m4
+ movh [r5 + 0 * 16], m7
+ movhps [r5 + 7 * 16], m7
+
+ pmaddwd m5, m6, [r4 + 32]
+ pmaddwd m4, m2, [r4 + 48]
+ paddd m5, m4 ; O[1]
+
+ paddd m4, m3, m5
+ psrad m4, 7
+
+ psubd m3, m5
+ psrad m3, 7
+
+ packssdw m4, m3
+ movh [r5 + 1 * 16], m4
+ movhps [r5 + 6 * 16], m4
+
+ pmaddwd m5, m6, [r4 + 64]
+ pmaddwd m4, m2, [r4 + 80]
+ paddd m5, m4 ; O[2]
+
+ paddd m4, m0, m5
+ psrad m4, 7
+
+ psubd m0, m5
+ psrad m0, 7
+
+ packssdw m4, m0
+ movh [r5 + 2 * 16], m4
+ movhps [r5 + 5 * 16], m4
+
+ pmaddwd m5, m6, [r4 + 96]
+ pmaddwd m4, m2, [r4 + 112]
+ paddd m5, m4 ; O[3]
+
+ paddd m4, m1, m5
+ psrad m4, 7
+
+ psubd m1, m5
+ psrad m1, 7
+
+ packssdw m4, m1
+ movh [r5 + 3 * 16], m4
+ movhps [r5 + 4 * 16], m4
+
+ ret
+
+%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1
+%if BIT_DEPTH == 10
+ %define IDCT_SHIFT 10
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ pshufb m4, %1, [pb_idct8even]
+ pmaddwd m4, [tab_idct8_1]
+ phsubd m5, m4
+ pshufd m4, m4, 0x4E
+ phaddd m4, m4
+ punpckhqdq m4, m5 ;m4 = dd e[ 0 1 2 3]
+ paddd m4, m6
+
+ pshufb %1, %1, [r6]
+ pmaddwd m5, %1, [r4]
+ pmaddwd %1, [r4 + 16]
+ phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3]
+
+ paddd %1, m4, m5
+ psrad %1, IDCT_SHIFT
+
+ psubd m4, m5
+ psrad m4, IDCT_SHIFT
+ pshufd m4, m4, 0x1B
+
+ packssdw %1, m4
+%undef IDCT_SHIFT
+%endmacro
+
+cglobal patial_butterfly_inverse_internal_pass2
+
+ mova m0, [r5]
+ PARTIAL_BUTTERFLY_PROCESS_ROW m0
+ movu [r1], m0
+
+ mova m2, [r5 + 16]
+ PARTIAL_BUTTERFLY_PROCESS_ROW m2
+ movu [r1 + r2], m2
+
+ mova m1, [r5 + 32]
+ PARTIAL_BUTTERFLY_PROCESS_ROW m1
+ movu [r1 + 2 * r2], m1
+
+ mova m3, [r5 + 48]
+ PARTIAL_BUTTERFLY_PROCESS_ROW m3
+ movu [r1 + r3], m3
+
+ ret
+
+cglobal idct8, 3,7,8,0-16*mmsize
+ mov r5, rsp
+ lea r4, [tab_idct8_3]
+ lea r6, [tab_dct4]
+
+ call patial_butterfly_inverse_internal_pass1
+
+ add r0, 16
+ add r5, 8
+
+ call patial_butterfly_inverse_internal_pass1
+
+%if BIT_DEPTH == 10
+ mova m6, [pd_512]
+%elif BIT_DEPTH == 8
+ mova m6, [pd_2048]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ add r2, r2
+ lea r3, [r2 * 3]
+ lea r4, [tab_idct8_2]
+ lea r6, [pb_idct8odd]
+ sub r5, 8
+
+ call patial_butterfly_inverse_internal_pass2
+
+ lea r1, [r1 + 4 * r2]
+ add r5, 64
+
+ call patial_butterfly_inverse_internal_pass2
+
+ RET
diff -r 1b6c943dd7e4 -r 5da0d3c12446 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Mar 13 14:40:21 2014 +0530
+++ b/source/common/x86/dct8.h Wed Mar 05 15:24:05 2014 +0530
@@ -26,6 +26,7 @@
void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
diff -r 1b6c943dd7e4 -r 5da0d3c12446 source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Thu Mar 13 14:40:21 2014 +0530
+++ b/source/test/mbdstharness.cpp Wed Mar 05 15:24:05 2014 +0530
@@ -30,7 +30,7 @@
#define ITERS 100
#define TEST_CASES 3
-#define IDCTMAX (1 << (BIT_DEPTH + 4)) - 1;
+#define IDCTMAX (1 << (BIT_DEPTH + 7)) - 1
using namespace x265;
struct DctConf_t
{
@@ -97,10 +97,10 @@
int_idct_test_buff[0][i] = rand() % IDCTMAX;
short_test_buff[1][i] = -PIXEL_MAX;
int_test_buff[1][i] = -PIXEL_MAX;
- int_idct_test_buff[1][i] = 0;
+ int_idct_test_buff[1][i] = SHORT_MIN;
short_test_buff[2][i] = PIXEL_MAX;
int_test_buff[2][i] = PIXEL_MAX;
- int_idct_test_buff[2][i] = IDCTMAX;
+ int_idct_test_buff[2][i] = SHORT_MAX;
}
for (int i = 0; i < mb_t_size; i++)
More information about the x265-devel
mailing list