[x265] [PATCH] asm: avx2 assembly code for dct16
yuvaraj at multicorewareinc.com
Fri Sep 5 13:51:58 CEST 2014
# HG changeset patch
# User Yuvaraj Venkatesh <yuvaraj at multicorewareinc.com>
# Date 1409917643 -19800
# Fri Sep 05 17:17:23 2014 +0530
# Node ID 78143d079d48b0b0cbcda7bb208389342d433c55
# Parent 93db2f53fe573537bcd4eb53ca3cdb69af557eb5
asm: avx2 assembly code for dct16
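
For reviewers: the kernel is the standard two-pass HEVC partial
butterfly (rows, then columns). Each pass folds the 16 inputs into
even/odd halves (E/O, via paddw/psubw) and folds E again into EE/EO
(phaddw/phsubw after a pshufb) before the coefficient multiplies.
Below is a minimal scalar sketch of one pass, for reference only; it
is not part of the patch, dct16_one_pass is a made-up name, and
g_t16 stands for the standard HEVC 16x16 coefficient matrix, whose
rows are duplicated and reordered into tab_dct16_1/2/3 in the diff:

    #include <stdint.h>

    /* One pass of the 16-point partial butterfly; the AVX2 kernel
     * runs this over rows (shift = DCT_SHIFT) and then over the
     * intermediate columns (shift = DCT_SHIFT2). */
    static void dct16_one_pass(const int16_t *src, int16_t *dst,
                               int shift, int line)
    {
        extern const int16_t g_t16[16][16]; /* HEVC transform matrix */
        const int add = 1 << (shift - 1);   /* rounding offset */

        for (int j = 0; j < line; j++, src += 16, dst++)
        {
            int E[8], O[8], EE[4], EO[4];
            for (int k = 0; k < 8; k++)     /* paddw / psubw */
            {
                E[k] = src[k] + src[15 - k];
                O[k] = src[k] - src[15 - k];
            }
            for (int k = 0; k < 4; k++)     /* phaddw / phsubw */
            {
                EE[k] = E[k] + E[7 - k];
                EO[k] = E[k] - E[7 - k];
            }
            for (int k = 0; k < 16; k += 4) /* rows 0,4,8,12 from EE */
            {
                int s = g_t16[k][0] * EE[0] + g_t16[k][1] * EE[1]
                      + g_t16[k][2] * EE[2] + g_t16[k][3] * EE[3];
                dst[k * line] = (int16_t)((s + add) >> shift);
            }
            for (int k = 2; k < 16; k += 4) /* rows 2,6,10,14 from EO */
            {
                int s = g_t16[k][0] * EO[0] + g_t16[k][1] * EO[1]
                      + g_t16[k][2] * EO[2] + g_t16[k][3] * EO[3];
                dst[k * line] = (int16_t)((s + add) >> shift);
            }
            for (int k = 1; k < 16; k += 2) /* odd rows from O */
            {
                int s = 0;
                for (int n = 0; n < 8; n++)
                    s += g_t16[k][n] * O[n];
                dst[k * line] = (int16_t)((s + add) >> shift);
            }
        }
    }

(The real second pass stores 32-bit results, matching the int32_t
*dst prototype added to dct8.h below.)
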
diff -r 93db2f53fe57 -r 78143d079d48 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Sep 04 16:42:24 2014 -0700
+++ b/source/common/x86/asm-primitives.cpp Fri Sep 05 17:17:23 2014 +0530
@@ -1734,8 +1734,8 @@
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
p.denoiseDct = x265_denoise_dct_avx2;
-
p.dct[DCT_4x4] = x265_dct4_avx2;
+ p.dct[DCT_16x16] = x265_dct16_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r 93db2f53fe57 -r 78143d079d48 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Sep 04 16:42:24 2014 -0700
+++ b/source/common/x86/dct8.asm Fri Sep 05 17:17:23 2014 +0530
@@ -29,13 +29,61 @@
%include "x86util.asm"
SECTION_RODATA 32
+tab_dct16_1: times 2 dw 64, 64, 64, 64, 64, 64, 64, 64
+ times 2 dw 90, 87, 80, 70, 57, 43, 25, 9
+ times 2 dw 89, 75, 50, 18, -18, -50, -75, -89
+ times 2 dw 87, 57, 9, -43, -80, -90, -70, -25
+ times 2 dw 83, 36, -36, -83, -83, -36, 36, 83
+ times 2 dw 80, 9, -70, -87, -25, 57, 90, 43
+ times 2 dw 75, -18, -89, -50, 50, 89, 18, -75
+ times 2 dw 70, -43, -87, 9, 90, 25, -80, -57
+ times 2 dw 64, -64, -64, 64, 64, -64, -64, 64
+ times 2 dw 57, -80, -25, 90, -9, -87, 43, 70
+ times 2 dw 50, -89, 18, 75, -75, -18, 89, -50
+ times 2 dw 43, -90, 57, 25, -87, 70, 9, -80
+ times 2 dw 36, -83, 83, -36, -36, 83, -83, 36
+ times 2 dw 25, -70, 90, -80, 43, 9, -57, 87
+ times 2 dw 18, -50, 75, -89, 89, -75, 50, -18
+ times 2 dw 9, -25, 43, -57, 70, -80, 87, -90
+
+
+tab_dct16_2: times 2 dw 64, 64, 64, 64, 64, 64, 64, 64
+ times 2 dw -9, -25, -43, -57, -70, -80, -87, -90
+ times 2 dw -89, -75, -50, -18, 18, 50, 75, 89
+ times 2 dw 25, 70, 90, 80, 43, -9, -57, -87
+ times 2 dw 83, 36, -36, -83, -83, -36, 36, 83
+ times 2 dw -43, -90, -57, 25, 87, 70, -9, -80
+ times 2 dw -75, 18, 89, 50, -50, -89, -18, 75
+ times 2 dw 57, 80, -25, -90, -9, 87, 43, -70
+ times 2 dw 64, -64, -64, 64, 64, -64, -64, 64
+ times 2 dw -70, -43, 87, 9, -90, 25, 80, -57
+ times 2 dw -50, 89, -18, -75, 75, 18, -89, 50
+ times 2 dw 80, -9, -70, 87, -25, -57, 90, -43
+ times 2 dw 36, -83, 83, -36, -36, 83, -83, 36
+ times 2 dw -87, 57, -9, -43, 80, -90, 70, -25
+ times 2 dw -18, 50, -75, 89, -89, 75, -50, 18
+ times 2 dw 90, -87, 80, -70, 57, -43, 25, -9
+
+tab_dct16_3: times 4 dw 64, 64, 64, 64
+ times 4 dw 89, 75, 50, 18
+ times 4 dw 83, 36, -36, -83
+ times 4 dw 75, -18, -89, -50
+ times 4 dw 64, -64, -64, 64
+ times 4 dw 50, -89, 18, 75
+ times 4 dw 36, -83, 83, -36
+ times 4 dw 18, -50, 75, -89
+
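+; dct16_shuf1 reverses the eight words of each 128-bit lane (gives
+; src[15 - k] for the E/O butterfly); dct16_shuf2 pairs word k with
+; word 7 - k so that phaddw/phsubw produce EE/EO directly.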
+dct16_shuf1: times 2 db 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1
+
+dct16_shuf2: times 2 db 0, 1, 14, 15, 2, 3, 12, 13, 4, 5, 10, 11, 6, 7, 8, 9
+
+avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
+ dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
tab_dct4: times 4 dw 64, 64
times 4 dw 83, 36
times 4 dw 64, -64
times 4 dw 36, -83
-avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64
- dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83
dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
@@ -1067,3 +1115,277 @@
RET
%endif ; !HIGH_BIT_DEPTH
+
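+; DCT16_PASS_1_E <coef offset>, <store offset>
+; One even-indexed pass-1 output row: dot products of the EE (or EO)
+; words in m0/m2 with a four-coefficient tab_dct16_3 row, rounded (m9)
+; and shifted by DCT_SHIFT; eight int16 results go to the stack buffer.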
+%macro DCT16_PASS_1_E 2
+ mova m7, [tab_dct16_3 + %1]
+
+ pmaddwd m4, m0, m7
+ phaddd m4, m4
+
+ pmaddwd m6, m2, m7
+ phaddd m6, m6
+
+ punpcklqdq m4, m6
+
+ paddd m4, m9
+ psrad m4, DCT_SHIFT
+
+ packssdw m4, m4
+ vpermq m4, m4, 0x08
+
+ mova [r5 + %2], xm4
+%endmacro
+
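+; DCT16_PASS_1_O <coef offset>, <store offset>
+; One odd-indexed pass-1 output row: dot products of the O halves in
+; m0/m2/m4/m6 with an eight-coefficient tab_dct16_1 row, rounded (m9)
+; and shifted by DCT_SHIFT; eight int16 results go to the stack buffer.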
+%macro DCT16_PASS_1_O 2
+ mova m7, [tab_dct16_1 + %1]
+
+ pmaddwd m10, m0, m7
+ phaddd m10, m10
+
+ pmaddwd m11, m2, m7
+ phaddd m11, m11
+
+ punpcklqdq m10, m11
+ phaddd m10, m10 ; [d0 d1 -- -- d4 d5 -- --]
+
+ pmaddwd m11, m4, m7
+ phaddd m11, m11
+
+ pmaddwd m12, m6, m7
+ phaddd m12, m12
+
+ punpcklqdq m11, m12
+ phaddd m11, m11 ; [d2 d3 -- -- d6 d7 -- --]
+
+ punpcklqdq m10, m11 ; [d0 d1 d2 d3 d4 d5 d6 d7]
+
+ paddd m10, m9
+ psrad m10, DCT_SHIFT
+
+ packssdw m10, m10 ; [w0 w1 w2 w3 - - - - w4 w5 w6 w7 - - - -]
+ vpermq m10, m10, 0x08
+
+ mova [r5 + %2], xm10
+%endmacro
+
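+; DCT16_PASS_2 <coef offset>
+; One pass-2 output row: the pass-1 intermediates in m0-m7 are
+; multiplied (pmaddwd) by matching rows of tab_dct16_1/tab_dct16_2,
+; summed, rounded by m9 (pd_512) and shifted by DCT_SHIFT2; eight
+; int32 results are left in m10 for the caller to store.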
+%macro DCT16_PASS_2 1
+ mova m8, [tab_dct16_1 + %1]
+ mova m13, [tab_dct16_2 + %1]
+
+ pmaddwd m10, m0, m8
+ pmaddwd m11, m1, m13
+ paddd m10, m11
+ phaddd m10, m10
+
+ pmaddwd m11, m2, m8
+ pmaddwd m12, m3, m13
+ paddd m11, m12
+ phaddd m11, m11
+
+ punpcklqdq m10, m11
+ phaddd m10, m10
+
+ pmaddwd m11, m4, m8
+ pmaddwd m12, m5, m13
+ paddd m11, m12
+ phaddd m11, m11
+
+ pmaddwd m12, m6, m8
+ pmaddwd m13, m7, m13
+ paddd m12, m13
+ phaddd m12, m12
+
+ punpcklqdq m11, m12
+ phaddd m11, m11
+ punpcklqdq m10, m11
+ paddd m10, m9
+ psrad m10, DCT_SHIFT2
+%endmacro
+
+INIT_YMM avx2
+cglobal dct16, 3, 9, 15, 0-16*mmsize
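+; pass-1 shift = log2(16) + BIT_DEPTH - 9 (3 at 8-bit, 5 at 10-bit);
+; pass-2 shift = log2(16) + 6 = 10; rounding adds are 1 << (shift - 1)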
+%if BIT_DEPTH == 10
+ %define DCT_SHIFT 5
+ vpbroadcastd m9, [pd_16]
+%elif BIT_DEPTH == 8
+ %define DCT_SHIFT 3
+ vpbroadcastd m9, [pd_4]
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+%define DCT_SHIFT2 10
+
+ add r2d, r2d
+
+ lea r3, [r2 * 3]
+ mov r5, rsp
+ mov r4, 2
+ mova m13, [dct16_shuf1]
+ mova m14, [dct16_shuf2]
+
+.pass1:
+ lea r6, [r0 + r2 * 4]
+
+ mova m2, [r0]
+ mova m1, [r6]
+ vperm2i128 m0, m2, m1, 0x20 ; [row0lo row4lo]
+ vperm2i128 m1, m1, m2, 0x13 ; [row0hi row4hi]
+
+ mova m4, [r0 + r2]
+ mova m3, [r6 + r2]
+ vperm2i128 m2, m4, m3, 0x20 ; [row1lo row5lo]
+ vperm2i128 m3, m3, m4, 0x13 ; [row1hi row5hi]
+
+ mova m6, [r0 + r2 * 2]
+ mova m5, [r6 + r2 * 2]
+ vperm2i128 m4, m6, m5, 0x20 ; [row2lo row6lo]
+ vperm2i128 m5, m5, m6, 0x13 ; [row2hi row6hi]
+
+ mova m8, [r0 + r3]
+ mova m7, [r6 + r3]
+ vperm2i128 m6, m8, m7, 0x20 ; [row3lo row7lo]
+ vperm2i128 m7, m7, m8, 0x13 ; [row3hi row7hi]
+
+ pshufb m1, m13
+ pshufb m3, m13
+ pshufb m5, m13
+ pshufb m7, m13
+
+ paddw m8, m0, m1 ;E
+ psubw m0, m1 ;O
+
+ paddw m1, m2, m3 ;E
+ psubw m2, m3 ;O
+
+ paddw m3, m4, m5 ;E
+ psubw m4, m5 ;O
+
+ paddw m5, m6, m7 ;E
+ psubw m6, m7 ;O
+
+ DCT16_PASS_1_O 1 * 32, 1 * 32
+ DCT16_PASS_1_O 3 * 32, 3 * 32
+ DCT16_PASS_1_O 5 * 32, 1 * 32 + 16
+ DCT16_PASS_1_O 7 * 32, 3 * 32 + 16
+ DCT16_PASS_1_O 9 * 32, 5 * 32
+ DCT16_PASS_1_O 11 * 32, 7 * 32
+ DCT16_PASS_1_O 13 * 32, 5 * 32 + 16
+ DCT16_PASS_1_O 15 * 32, 7 * 32 + 16
+
+ pshufb m8, m14
+ mova m0, m8
+ phaddw m0, m0
+
+ pshufb m1, m14
+ mova m2, m1
+ phaddw m2, m2
+
+ punpcklqdq m0, m2
+
+ pshufb m3, m14
+ mova m2, m3
+ phaddw m2, m2
+
+ pshufb m5, m14
+ mova m4, m5
+ phaddw m4, m4
+
+ punpcklqdq m2, m4
+
+ DCT16_PASS_1_E 0 * 32, 0 * 32
+ DCT16_PASS_1_E 2 * 32, 0 * 32 + 16
+ DCT16_PASS_1_E 4 * 32, 4 * 32
+ DCT16_PASS_1_E 6 * 32, 4 * 32 + 16
+
+ mova m0, m8
+ phsubw m0, m0
+
+ mova m2, m1
+ phsubw m2, m2
+
+ punpcklqdq m0, m2
+
+ mova m2, m3
+ phsubw m2, m2
+
+ mova m4, m5
+ phsubw m4, m4
+
+ punpcklqdq m2, m4
+
+ DCT16_PASS_1_E 1 * 32, 2 * 32
+ DCT16_PASS_1_E 3 * 32, 2 * 32 + 16
+ DCT16_PASS_1_E 5 * 32, 6 * 32
+ DCT16_PASS_1_E 7 * 32, 6 * 32 + 16
+
+ lea r0, [r0 + 8 * r2]
+ add r5, 256
+
+ dec r4
+ jnz .pass1
+
+ mov r5, rsp
+ mov r4, 2
+ add r2d, r2d
+ lea r3, [r2 * 3]
+ vpbroadcastd m9, [pd_512]
+
+.pass2:
+ mova m0, [r5 + 0 * 32] ; [row0lo row4lo]
+ mova m1, [r5 + 8 * 32] ; [row0hi row4hi]
+
+ mova m2, [r5 + 1 * 32] ; [row1lo row5lo]
+ mova m3, [r5 + 9 * 32] ; [row1hi row5hi]
+
+ mova m4, [r5 + 2 * 32] ; [row2lo row6lo]
+ mova m5, [r5 + 10 * 32] ; [row2hi row6hi]
+
+ mova m6, [r5 + 3 * 32] ; [row3lo row7lo]
+ mova m7, [r5 + 11 * 32] ; [row3hi row7hi]
+
+ DCT16_PASS_2 0 * 32
+ mova [r1], m10
+ DCT16_PASS_2 1 * 32
+ mova [r1 + r2], m10
+ DCT16_PASS_2 2 * 32
+ mova [r1 + r2 * 2], m10
+ DCT16_PASS_2 3 * 32
+ mova [r1 + r3], m10
+
+ lea r6, [r1 + r2 * 4]
+ DCT16_PASS_2 4 * 32
+ mova [r6], m10
+ DCT16_PASS_2 5 * 32
+ mova [r6 + r2], m10
+ DCT16_PASS_2 6 * 32
+ mova [r6 + r2 * 2], m10
+ DCT16_PASS_2 7 * 32
+ mova [r6 + r3], m10
+
+ lea r6, [r6 + r2 * 4]
+ DCT16_PASS_2 8 * 32
+ mova [r6], m10
+ DCT16_PASS_2 9 * 32
+ mova [r6 + r2], m10
+ DCT16_PASS_2 10 * 32
+ mova [r6 + r2 * 2], m10
+ DCT16_PASS_2 11 * 32
+ mova [r6 + r3], m10
+
+ lea r6, [r6 + r2 * 4]
+ DCT16_PASS_2 12 * 32
+ mova [r6], m10
+ DCT16_PASS_2 13 * 32
+ mova [r6 + r2], m10
+ DCT16_PASS_2 14 * 32
+ mova [r6 + r2 * 2], m10
+ DCT16_PASS_2 15 * 32
+ mova [r6 + r3], m10
+
+ add r1, 32
+ add r5, 128
+
+ dec r4
+ jnz .pass2
+
+ RET
diff -r 93db2f53fe57 -r 78143d079d48 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Thu Sep 04 16:42:24 2014 -0700
+++ b/source/common/x86/dct8.h Fri Sep 05 17:17:23 2014 +0530
@@ -30,6 +30,7 @@
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idst4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dct8_sse4(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct16_avx2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_denoise_dct_mmx(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);
void x265_denoise_dct_sse2(int32_t *dct, uint32_t *sum, uint16_t *offset, int size);