[x265] [PATCH] asm: avx2 asm code for dct4
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Aug 27 15:32:32 CEST 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1409145968 -19800
# Wed Aug 27 18:56:08 2014 +0530
# Node ID 27193515d4417c142fff97a1d96a3d7111b9d6d5
# Parent 77fe0cc583e8ec10275bc1b3c4bb116d5ceb51ac
asm: avx2 asm code for dct4
Measured speedup: 4.3x with the previous implementation, 5.4x with the new AVX2 code.
diff -r 77fe0cc583e8 -r 27193515d441 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Aug 27 18:56:08 2014 +0530
@@ -1715,6 +1715,8 @@
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
p.denoiseDct = x265_denoise_dct_avx2;
+
+ p.dct[DCT_4x4] = x265_dct4_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r 77fe0cc583e8 -r 27193515d441 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/dct8.asm Wed Aug 27 18:56:08 2014 +0530
@@ -35,6 +35,11 @@
times 4 dw 64, -64
times 4 dw 36, -83
+avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 ; pmaddwd word pairs: first 16 bytes (64, 64), last 16 bytes (64, -64)
+ dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 ; pmaddwd word pairs: first 16 bytes (83, 36), last 16 bytes (36, -83)
+
+dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13 ; pshufb mask: picks words 0,1,4,5,3,2,7,6 of a 128-bit lane
+
tab_dst4: times 2 dw 29, 55, 74, 84
times 2 dw 74, 74, 0, -74
times 2 dw 84, -29, -74, 55
@@ -187,6 +192,77 @@
RET
+
+; DCT 4x4 (AVX2): two-pass transform of four rows of four 16-bit samples
+;
+; Input parameters:
+; - r0: source, four rows of four 16-bit values (8 bytes read per row)
+; - r1: destination, receives 16 dword results (64 bytes, unaligned stores)
+; - r2: source stride in 16-bit elements (doubled below to get a byte stride)
+INIT_YMM avx2
+cglobal dct4, 3, 4, 8, src, dst, srcStride
+%if BIT_DEPTH == 10
+ %define DCT_SHIFT 3
+ vbroadcasti128 m7, [pd_4] ; m7 = first-pass rounding term (pd_4 is defined outside this hunk; presumably 1 << (DCT_SHIFT - 1))
+%elif BIT_DEPTH == 8
+ %define DCT_SHIFT 1
+ vbroadcasti128 m7, [pd_1] ; m7 = first-pass rounding term (pd_1 is defined outside this hunk; presumably 1 << (DCT_SHIFT - 1))
+%else
+ %error Unsupported BIT_DEPTH!
+%endif
+ add r2d, r2d ; element stride -> byte stride (32-bit add; NOTE(review): confirm callers never pass a negative stride)
+ lea r3, [avx2_dct4] ; r3 = base of the pmaddwd coefficient table
+; Load constants, then gather the four source rows into one register.
+ vbroadcasti128 m4, [dct4_shuf] ; m4 = word shuffle mask, same in both 128-bit lanes
+ mova m5, [r3] ; m5 = (64, 64) pairs in the low lane, (64, -64) pairs in the high lane
+ mova m6, [r3 + 32] ; m6 = (83, 36) pairs in the low lane, (36, -83) pairs in the high lane
+ movq xm0, [r0] ; row 0
+ movq xm1, [r0 + r2] ; row 1
+ punpcklqdq m0, m1 ; low lane of m0 = row 0 | row 1
+ lea r0, [r0 + 2 * r2] ; advance two rows
+ movq xm1, [r0] ; row 2
+ movq xm2, [r0 + r2] ; row 3
+ punpcklqdq m1, m2 ; low lane of m1 = row 2 | row 3
+; First pass: butterfly within each row, then multiply-accumulate.
+ vinserti128 m0, m0, xm1, 1 ; m0 = rows 0,1 | rows 2,3
+ pshufb m0, m4 ; per lane: (x0 x1) of both rows in the low qword, (x3 x2) of both rows in the high qword
+ vpermq m1, m0, 11011101b ; m1 = (x3 x2) pairs of rows 0..3, duplicated in both lanes
+ vpermq m0, m0, 10001000b ; m0 = (x0 x1) pairs of rows 0..3, duplicated in both lanes
+ paddw m2, m0, m1 ; m2 = (x0 + x3, x1 + x2) per row
+ psubw m0, m1 ; m0 = (x0 - x3, x1 - x2) per row
+; Sums feed outputs 0 and 2 of the pass.
+ pmaddwd m2, m5 ; low lane: 64*s0 + 64*s1 (output 0), high lane: 64*s0 - 64*s1 (output 2), one dword per row
+ paddd m2, m7 ; add rounding term
+ psrad m2, DCT_SHIFT
+; Differences feed outputs 1 and 3 of the pass.
+ pmaddwd m0, m6 ; low lane: 83*d0 + 36*d1 (output 1), high lane: 36*d0 - 83*d1 (output 3), one dword per row
+ paddd m0, m7 ; add rounding term
+ psrad m0, DCT_SHIFT
+; Second pass: the pack groups values by first-pass output index, so the same shuffle works on the other dimension.
+ packssdw m2, m0 ; words: low lane = output 0 (rows 0..3) | output 1 (rows 0..3), high lane = output 2 | output 3
+ pshufb m2, m4 ; same reordering as in the first pass
+ vpermq m1, m2, 11011101b ; m1 = (x3 x2) pairs of the four vectors, duplicated in both lanes
+ vpermq m2, m2, 10001000b ; m2 = (x0 x1) pairs of the four vectors, duplicated in both lanes
+ vbroadcasti128 m7, [pd_128] ; m7 = second-pass rounding term for the shift by 8 (pd_128 is defined outside this hunk)
+; Here the butterfly add/sub is done on the dword products instead of on the words.
+ pmaddwd m0, m2, m5 ; low lane: 64*x0 + 64*x1, high lane: 64*x0 - 64*x1
+ pmaddwd m3, m1, m5 ; low lane: 64*x3 + 64*x2, high lane: 64*x3 - 64*x2
+ paddd m3, m0 ; low lane: 64*((x0+x3) + (x1+x2)), high lane: 64*((x0+x3) - (x1+x2))
+ paddd m3, m7 ; add rounding term
+ psrad m3, 8 ; m3 = second-pass output 0 (low lane) and output 2 (high lane)
+; Odd outputs: subtract the products so the terms become (x0 - x3) and (x1 - x2).
+ pmaddwd m2, m6 ; low lane: 83*x0 + 36*x1, high lane: 36*x0 - 83*x1
+ pmaddwd m1, m6 ; low lane: 83*x3 + 36*x2, high lane: 36*x3 - 83*x2
+ psubd m2, m1 ; low lane: 83*(x0-x3) + 36*(x1-x2), high lane: 36*(x0-x3) - 83*(x1-x2)
+ paddd m2, m7 ; add rounding term
+ psrad m2, 8 ; m2 = second-pass output 1 (low lane) and output 3 (high lane)
+; Interleave the lanes so the results are stored in output order 0, 1, 2, 3.
+ vinserti128 m0, m3, xm2, 1 ; m0 = output 0 | output 1
+ vpermq m3, m3, 11101110b ; copy the high lane of m3 (output 2) into its low lane
+ vinserti128 m2, m2, xm3, 0 ; m2 = output 2 | output 3
+ movu [r1], m0 ; dst bytes 0..31
+ movu [r1 + mmsize], m2 ; dst bytes 32..63
+ RET
;-------------------------------------------------------
;void idct4(int32_t *src, int16_t *dst, intptr_t stride)
;-------------------------------------------------------
diff -r 77fe0cc583e8 -r 27193515d441 source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Wed Aug 27 14:25:17 2014 +0530
+++ b/source/common/x86/dct8.h Wed Aug 27 18:56:08 2014 +0530
@@ -25,6 +25,7 @@
#define X265_DCT8_H
void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
More information about the x265-devel
mailing list