[x265] [PATCH] asm: 8bpp and 10bpp code for idct8x8 module
murugan at multicorewareinc.com
murugan at multicorewareinc.com
Fri Mar 14 08:19:03 CET 2014
# HG changeset patch
# User Murugan Vairavel <murugan at multicorewareinc.com>
# Date 1394781526 -19800
# Fri Mar 14 12:48:46 2014 +0530
# Node ID 63eea432b81cf350e34db82f28b21b7e6bf8333b
# Parent 7b5699e6bb75d28631d9fc942f3f30a3652ef8a2
asm: 8bpp and 10bpp code for idct8x8 module
diff -r 7b5699e6bb75 -r 63eea432b81c source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Mar 13 18:29:54 2014 -0500
+++ b/source/common/x86/asm-primitives.cpp Fri Mar 14 12:48:46 2014 +0530
@@ -1025,6 +1025,7 @@
INTRA_ANG_SSSE3(ssse3);
p.dct[DST_4x4] = x265_dst4_ssse3;
+ p.idct[IDCT_8x8] = x265_idct8_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
@@ -1197,6 +1198,7 @@
p.chroma_p2s[X265_CSP_I444] = x265_luma_p2s_ssse3; // for i444 , chroma_p2s can be replaced by luma_p2s
p.dct[DST_4x4] = x265_dst4_ssse3;
+ p.idct[IDCT_8x8] = x265_idct8_ssse3;
}
if (cpuMask & X265_CPU_SSE4)
{
diff -r 7b5699e6bb75 -r 63eea432b81c source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Mar 13 18:29:54 2014 -0500
+++ b/source/common/x86/dct8.asm Fri Mar 14 12:48:46 2014 +0530
@@ -61,8 +61,26 @@
times 1 dd 50, -89, 18, 75
times 1 dd 18, -50, 75, -89
+tab_idct8_3: times 4 dw 89, 75 ; pass-1 odd weights; every row is one word pair repeated 4x (16 bytes). +0: O[0], applied to the interleaved [1 3] words
+ times 4 dw 50, 18 ; +16: O[0], applied to the interleaved [5 7] words
+ times 4 dw 75, -18 ; +32: O[1], [1 3]
+ times 4 dw -89, -50 ; +48: O[1], [5 7]
+ times 4 dw 50, -89 ; +64: O[2], [1 3]
+ times 4 dw 18, 75 ; +80: O[2], [5 7]
+ times 4 dw 18, -50 ; +96: O[3], [1 3]
+ times 4 dw 75, -89 ; +112: O[3], [5 7]
+
pb_unpackhlw1: db 0,1,8,9,2,3,10,11,4,5,12,13,6,7,14,15
+pb_idct8even: db 0, 1, 8, 9, 4, 5, 12, 13, 0, 1, 8, 9, 4, 5, 12, 13 ; pshufb mask: copies words 0, 4, 2, 6 of a row into both 64-bit halves (even-index inputs of pass 2)
+
+tab_idct8_1: times 1 dw 64, -64, 36, -83, 64, 64, 83, 36 ; pass-2 even weights; pmaddwd against words 0,4,2,6,0,4,2,6 gives EE[1], EO[1], EE[0], EO[0]
+
+tab_idct8_2: times 1 dw 89, 75, 50, 18, 75, -18, -89, -50 ; pass-2 odd weights for words 1,3,5,7 (twice): partial sums of O[0] | O[1]
+ times 1 dw 50, -89, 18, 75, 18, -50, 75, -89 ; same layout: partial sums of O[2] | O[3]
+
+pb_idct8odd: db 2, 3, 6, 7, 10, 11, 14, 15, 2, 3, 6, 7, 10, 11, 14, 15 ; pshufb mask: copies words 1, 3, 5, 7 of a row into both 64-bit halves (odd-index inputs of pass 2)
+
SECTION .text
cextern pd_1
cextern pd_2
@@ -665,3 +683,187 @@
dec r2
jnz .pass2
RET
+
+;-------------------------------------------------------
+; void idct8(int32_t *src, int16_t *dst, intptr_t stride)
+;-------------------------------------------------------
+INIT_XMM ssse3
+
+cglobal patial_butterfly_inverse_internal_pass1 ; first idct8 pass over 4 columns. In: r0 = int32 input (rows 32 bytes apart), r6 = even-weight table, r4 = odd-weight table, r5 = int16 output (rows 16 bytes apart). Overwrites m0-m7.
+ movu m0, [r0] ; input row 0 (4 dwords)
+ movu m1, [r0 + 4 * 32] ; input row 4
+ movu m2, [r0 + 2 * 32] ; input row 2
+ movu m3, [r0 + 6 * 32] ; input row 6
+ packssdw m0, m2 ; saturate to words: row 0 | row 2
+ packssdw m1, m3 ; saturate to words: row 4 | row 6
+ punpckhwd m2, m0, m1 ; [2 6] word pairs, one pair per column
+ punpcklwd m0, m1 ; [0 4] word pairs, one pair per column
+ pmaddwd m1, m0, [r6] ; EE[0]
+ pmaddwd m0, [r6 + 32] ; EE[1]
+ pmaddwd m3, m2, [r6 + 16] ; EO[0]
+ pmaddwd m2, [r6 + 48] ; EO[1]
+
+ paddd m4, m1, m3 ; E[0] = EE[0] + EO[0]
+ psubd m1, m3 ; E[3] = EE[0] - EO[0]
+ paddd m3, m0, m2 ; E[1] = EE[1] + EO[1]
+ psubd m0, m2 ; E[2] = EE[1] - EO[1]
+
+ ; E[k] += rounding term, folded in once so both E+O and E-O are rounded before the >> 7 below
+ mova m5, [pd_64]
+ paddd m0, m5
+ paddd m1, m5
+ paddd m3, m5
+ paddd m4, m5
+
+ movu m2, [r0 + 32] ; input row 1
+ movu m5, [r0 + 5 * 32] ; input row 5
+ packssdw m2, m5 ; row 1 | row 5
+ movu m5, [r0 + 3 * 32] ; input row 3
+ movu m6, [r0 + 7 * 32] ; input row 7
+ packssdw m5, m6 ; row 3 | row 7
+ punpcklwd m6, m2, m5 ; [1 3] word pairs, kept for all four O[k]
+ punpckhwd m2, m5 ; [5 7] word pairs, kept for all four O[k]
+
+ pmaddwd m5, m6, [r4]
+ pmaddwd m7, m2, [r4 + 16]
+ paddd m5, m7 ; O[0]
+
+ paddd m7, m4, m5 ; E[0] + O[0]
+ psrad m7, 7
+
+ psubd m4, m5 ; E[0] - O[0]
+ psrad m4, 7
+
+ packssdw m7, m4 ; low half = sum, high half = difference
+ movh [r5 + 0 * 16], m7 ; output row 0 <- sums
+ movhps [r5 + 7 * 16], m7 ; output row 7 <- differences
+
+ pmaddwd m5, m6, [r4 + 32]
+ pmaddwd m4, m2, [r4 + 48]
+ paddd m5, m4 ; O[1]
+
+ paddd m4, m3, m5 ; E[1] + O[1]
+ psrad m4, 7
+
+ psubd m3, m5 ; E[1] - O[1]
+ psrad m3, 7
+
+ packssdw m4, m3
+ movh [r5 + 1 * 16], m4 ; output row 1 <- sums
+ movhps [r5 + 6 * 16], m4 ; output row 6 <- differences
+
+ pmaddwd m5, m6, [r4 + 64]
+ pmaddwd m4, m2, [r4 + 80]
+ paddd m5, m4 ; O[2]
+
+ paddd m4, m0, m5 ; E[2] + O[2]
+ psrad m4, 7
+
+ psubd m0, m5 ; E[2] - O[2]
+ psrad m0, 7
+
+ packssdw m4, m0
+ movh [r5 + 2 * 16], m4 ; output row 2 <- sums
+ movhps [r5 + 5 * 16], m4 ; output row 5 <- differences
+
+ pmaddwd m5, m6, [r4 + 96]
+ pmaddwd m4, m2, [r4 + 112]
+ paddd m5, m4 ; O[3]
+
+ paddd m4, m1, m5 ; E[3] + O[3]
+ psrad m4, 7
+
+ psubd m1, m5 ; E[3] - O[3]
+ psrad m1, 7
+
+ packssdw m4, m1
+ movh [r5 + 3 * 16], m4 ; output row 3 <- sums
+ movhps [r5 + 4 * 16], m4 ; output row 4 <- differences
+
+ ret
+
+%macro PARTIAL_BUTTERFLY_PROCESS_ROW 1 ; second idct8 pass on one row: %1 = 8 int16 in, replaced by 8 int16 out. Needs m6 = rounding term, r4 = odd-weight table, r6 = odd-word shuffle mask. Overwrites m4, m5.
+%if BIT_DEPTH == 10 ; the final right shift depends on the build's bit depth
+ %define IDCT_SHIFT 10
+%elif BIT_DEPTH == 8
+ %define IDCT_SHIFT 12
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ pshufb m4, %1, [pb_idct8even] ; words 0,4,2,6 in both halves
+ pmaddwd m4, [tab_idct8_1] ; dwords: EE[1], EO[1], EE[0], EO[0]
+ phsubd m5, m4 ; high half of m5 = E[2], E[3]; its low half is never read
+ pshufd m4, m4, 0x4E ; swap 64-bit halves: EE[0], EO[0], EE[1], EO[1]
+ phaddd m4, m4 ; E[0], E[1] in both halves
+ punpckhqdq m4, m5 ; m4 = dd E[0 1 2 3]
+ paddd m4, m6 ; add rounding term once; it carries into both E+O and E-O
+
+ pshufb %1, %1, [r6] ; words 1,3,5,7 in both halves
+ pmaddwd m5, %1, [r4] ; partial sums for O[0], O[1]
+ pmaddwd %1, [r4 + 16] ; partial sums for O[2], O[3]
+ phaddd m5, %1 ; m5 = dd O[0, 1, 2, 3]
+
+ paddd %1, m4, m5 ; outputs 0..3 = E[k] + O[k]
+ psrad %1, IDCT_SHIFT
+
+ psubd m4, m5 ; E[k] - O[k], k = 0..3
+ psrad m4, IDCT_SHIFT
+ pshufd m4, m4, 0x1B ; reverse dword order so the differences land as outputs 4..7
+
+ packssdw %1, m4 ; saturate to 8 words: outputs 0..7
+%undef IDCT_SHIFT
+%endmacro
+
+cglobal patial_butterfly_inverse_internal_pass2 ; second idct8 pass on 4 rows. In: r5 = intermediate rows (16 bytes each, read with aligned loads), r1 = output, r2 = output row pitch, r3 = 3 * r2; m6, r4, r6 as the row macro requires.
+
+ mova m0, [r5] ; intermediate row 0 of this group
+ PARTIAL_BUTTERFLY_PROCESS_ROW m0
+ movu [r1], m0 ; output row 0
+
+ mova m2, [r5 + 16] ; intermediate row 1
+ PARTIAL_BUTTERFLY_PROCESS_ROW m2
+ movu [r1 + r2], m2 ; output row 1
+
+ mova m1, [r5 + 32] ; intermediate row 2
+ PARTIAL_BUTTERFLY_PROCESS_ROW m1
+ movu [r1 + 2 * r2], m1 ; output row 2
+
+ mova m3, [r5 + 48] ; intermediate row 3
+ PARTIAL_BUTTERFLY_PROCESS_ROW m3
+ movu [r1 + r3], m3 ; output row 3
+
+ ret
+
+cglobal idct8, 3,7,8,0-16*mmsize ; 8x8 inverse DCT entry point: r0 = src (int32), r1 = dst (int16), r2 = stride, per the prototype above; stack space holds the intermediate block
+ mov r5, rsp ; r5 = intermediate buffer on the stack
+ lea r4, [tab_idct8_3] ; pass-1 odd weights
+ lea r6, [tab_dct4] ; pass-1 even weights (table is defined outside this hunk)
+
+ call patial_butterfly_inverse_internal_pass1 ; columns 0-3
+
+ add r0, 16 ; advance input by 4 int32 columns
+ add r5, 8 ; advance intermediate by 4 int16 columns
+
+ call patial_butterfly_inverse_internal_pass1 ; columns 4-7
+
+%if BIT_DEPTH == 10 ; m6 = pass-2 rounding term matching the shift chosen in PARTIAL_BUTTERFLY_PROCESS_ROW
+ mova m6, [pd_512]
+%elif BIT_DEPTH == 8
+ mova m6, [pd_2048]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ add r2, r2 ; double the stride; presumably int16 elements -> bytes (confirm against callers)
+ lea r3, [r2 * 3] ; r3 = 3 * stride, addresses the 4th row of each group
+ lea r4, [tab_idct8_2] ; pass-2 odd weights
+ lea r6, [pb_idct8odd] ; pass-2 odd-word shuffle mask
+ sub r5, 8 ; undo the +8 above: back to the start of the intermediate buffer
+
+ call patial_butterfly_inverse_internal_pass2 ; output rows 0-3
+
+ lea r1, [r1 + 4 * r2] ; move dst down 4 rows
+ add r5, 64 ; move intermediate down 4 rows (4 * 16 bytes)
+
+ call patial_butterfly_inverse_internal_pass2 ; output rows 4-7
+
+ RET
diff -r 7b5699e6bb75 -r 63eea432b81c source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Mar 13 18:29:54 2014 -0500
+++ b/source/common/x86/dct8.h Fri Mar 14 12:48:46 2014 +0530
@@ -26,6 +26,7 @@
void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
+void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
diff -r 7b5699e6bb75 -r 63eea432b81c source/test/mbdstharness.cpp
--- a/source/test/mbdstharness.cpp Thu Mar 13 18:29:54 2014 -0500
+++ b/source/test/mbdstharness.cpp Fri Mar 14 12:48:46 2014 +0530
@@ -30,7 +30,6 @@
#define ITERS 100
#define TEST_CASES 3
-#define IDCTMAX (1 << (BIT_DEPTH + 4)) - 1;
using namespace x265;
struct DctConf_t
{
@@ -94,13 +93,13 @@
{
short_test_buff[0][i] = (rand() & PIXEL_MAX) - (rand() & PIXEL_MAX);
int_test_buff[0][i] = rand() % PIXEL_MAX;
- int_idct_test_buff[0][i] = rand() % IDCTMAX;
+ int_idct_test_buff[0][i] = (rand() % (SHORT_MAX - SHORT_MIN)) - SHORT_MAX;
short_test_buff[1][i] = -PIXEL_MAX;
int_test_buff[1][i] = -PIXEL_MAX;
- int_idct_test_buff[1][i] = 0;
+ int_idct_test_buff[1][i] = SHORT_MIN;
short_test_buff[2][i] = PIXEL_MAX;
int_test_buff[2][i] = PIXEL_MAX;
- int_idct_test_buff[2][i] = IDCTMAX;
+ int_idct_test_buff[2][i] = SHORT_MAX;
}
for (int i = 0; i < mb_t_size; i++)
More information about the x265-devel
mailing list