[x265] [PATCH] asm: avx2 asm code for dct4
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Mon Sep 1 12:04:19 CEST 2014
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1409565805 -19800
# Mon Sep 01 15:33:25 2014 +0530
# Node ID 3da801ede69d028c24ded368a3c7a14efd07b34a
# Parent c5624effb73c74e63fd2e42d2a48ea4490074dce
asm: avx2 asm code for dct4
diff -r c5624effb73c -r 3da801ede69d source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/x86/asm-primitives.cpp Mon Sep 01 15:33:25 2014 +0530
@@ -1432,8 +1432,8 @@
}
if (cpuMask & X265_CPU_AVX2)
{
+ p.dct[DCT_4x4] = x265_dct4_avx2;
}
-
/* at HIGH_BIT_DEPTH, pixel == short so we can reuse a number of primitives */
for (int i = 0; i < NUM_LUMA_PARTITIONS; i++)
{
@@ -1715,6 +1715,8 @@
p.cvt32to16_shl[BLOCK_16x16] = x265_cvt32to16_shl_16_avx2;
p.cvt32to16_shl[BLOCK_32x32] = x265_cvt32to16_shl_32_avx2;
p.denoiseDct = x265_denoise_dct_avx2;
+
+ p.dct[DCT_4x4] = x265_dct4_avx2;
}
#endif // if HIGH_BIT_DEPTH
}
diff -r c5624effb73c -r 3da801ede69d source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/x86/dct8.asm Mon Sep 01 15:33:25 2014 +0530
@@ -34,6 +34,10 @@
times 4 dw 83, 36
times 4 dw 64, -64
times 4 dw 36, -83
+avx2_dct4: dw 64, 64, 64, 64, 64, 64, 64, 64, 64, -64, 64, -64, 64, -64, 64, -64 ; row 0 (32 bytes): (64,64) x4, then (64,-64) x4
+ dw 83, 36, 83, 36, 83, 36, 83, 36, 36, -83, 36, -83, 36, -83, 36, -83 ; row 1 (32 bytes): (83,36) x4, then (36,-83) x4
+; dct4_shuf read as word indices: 0 1 4 5 | 3 2 7 6 (upper qword takes each word pair in swapped order)
+dct4_shuf: db 0, 1, 2, 3, 8, 9, 10, 11, 6, 7, 4, 5, 14, 15, 12, 13
tab_dst4: times 2 dw 29, 55, 74, 84
times 2 dw 74, 74, 0, -74
@@ -184,7 +188,74 @@
paddd m2, m7
psrad m2, 8
movu [r1 + 3 * 16], m2
+ RET
+; void dct4(int16_t *src, int32_t *dst, intptr_t stride) -- 4x4 forward DCT done as two 1-D passes
+; r0 = src (rows of four int16), r1 = dst (16 x int32 = 64 bytes), r2 = stride (doubled to a byte offset)
+; m4 = dct4_shuf, m5 = (64,64)|(64,-64) pairs, m6 = (83,36)|(36,-83) pairs, m7 = rounding term
+; Pass 1 rounds with pd_1/pd_4 and shifts by DCT_SHIFT; pass 2 rounds with pd_128 and shifts by 8.
+; packssdw after pass 1 groups equal coefficients of the four rows, so the same shuffle feeds pass 2.
+; Each 16-byte output row is stored exactly once with an xmm-wide store (no overlapping ymm store).
+INIT_YMM avx2
+cglobal dct4, 3, 4, 8, src, dst, srcStride
+%if BIT_DEPTH == 10
+    %define DCT_SHIFT 3
+    vbroadcasti128  m7, [pd_4]                          ; pass-1 rounding term in both lanes
+%elif BIT_DEPTH == 8
+    %define DCT_SHIFT 1
+    vbroadcasti128  m7, [pd_1]                          ; pass-1 rounding term in both lanes
+%else
+    %error Unsupported BIT_DEPTH!
+%endif
+    add             r2d, r2d                            ; stride * 2; 32-bit op, so r2 is zero-extended
+    lea             r3, [avx2_dct4]
+
+    vbroadcasti128  m4, [dct4_shuf]                     ; same shuffle control in both lanes
+    mova            m5, [r3]                            ; first 32-byte row of avx2_dct4
+    mova            m6, [r3 + 32]                       ; second 32-byte row of avx2_dct4
+    movq            xm0, [r0]                           ; input row 0
+    movhps          xm0, [r0 + r2]                      ; input row 1
+    lea             r0, [r0 + 2 * r2]
+    movq            xm1, [r0]                           ; input row 2
+    movhps          xm1, [r0 + r2]                      ; input row 3
+
+    vinserti128     m0, m0, xm1, 1                      ; m0 = rows 0,1 | rows 2,3
+    pshufb          m0, m4                              ; per lane: s0 s1 of both rows | s3 s2 of both rows
+    vpermq          m1, m0, 11011101b                   ; qwords 1,3,1,3 -> the (s3,s2) pairs of all rows
+    vpermq          m0, m0, 10001000b                   ; qwords 0,2,0,2 -> the (s0,s1) pairs of all rows
+    paddw           m2, m0, m1                          ; s0+s3, s1+s2
+    psubw           m0, m1                              ; s0-s3, s1-s2
+
+    pmaddwd         m2, m5                              ; low lane: coeff 0, high lane: coeff 2
+    paddd           m2, m7
+    psrad           m2, DCT_SHIFT
+
+    pmaddwd         m0, m6                              ; low lane: coeff 1, high lane: coeff 3
+    paddd           m0, m7
+    psrad           m0, DCT_SHIFT
+
+    packssdw        m2, m0                              ; lane 0: coeff 0 x4, coeff 1 x4; lane 1: coeff 2 x4, coeff 3 x4
+    pshufb          m2, m4                              ; regroup exactly as before pass 1
+    vpermq          m1, m2, 11011101b                   ; values taken from pass-1 rows 3,2
+    vpermq          m2, m2, 10001000b                   ; values taken from pass-1 rows 0,1
+    vbroadcasti128  m7, [pd_128]                        ; pass-2 rounding term
+
+    pmaddwd         m0, m2, m5
+    pmaddwd         m3, m1, m5
+    paddd           m3, m0                              ; m3 = output row 0 | output row 2
+    paddd           m3, m7
+    psrad           m3, 8
+
+    pmaddwd         m2, m6
+    pmaddwd         m1, m6
+    psubd           m2, m1                              ; m2 = output row 1 | output row 3
+    paddd           m2, m7
+    psrad           m2, 8
+
+    movu            [r1], xm3                           ; output row 0
+    movu            [r1 + mmsize/2], xm2                ; output row 1; xmm width so row 2's slot is untouched
+    vextracti128    [r1 + mmsize], m3, 1                ; output row 2
+    vextracti128    [r1 + mmsize + mmsize/2], m2, 1     ; output row 3
     RET
;-------------------------------------------------------
diff -r c5624effb73c -r 3da801ede69d source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Mon Sep 01 14:13:37 2014 +0530
+++ b/source/common/x86/dct8.h Mon Sep 01 15:33:25 2014 +0530
@@ -23,8 +23,8 @@
#ifndef X265_DCT8_H
#define X265_DCT8_H
-
void x265_dct4_sse2(int16_t *src, int32_t *dst, intptr_t stride);
+void x265_dct4_avx2(int16_t *src, int32_t *dst, intptr_t stride);
void x265_idct4_sse2(int32_t *src, int16_t *dst, intptr_t stride);
void x265_idct8_ssse3(int32_t *src, int16_t *dst, intptr_t stride);
void x265_dst4_ssse3(int16_t *src, int32_t *dst, intptr_t stride);
More information about the x265-devel
mailing list