[x265] [PATCH] asm: avx2 code for dst4x4
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Apr 1 08:26:51 CEST 2015
# HG changeset patch
# User Dnyaneshwar G <dnyaneshwar at multicorewareinc.com>
# Date 1427803043 -19800
# Tue Mar 31 17:27:23 2015 +0530
# Node ID ffa14b40f0fff3f6f22fe273458f2a4c83c50acf
# Parent ac85c775620f1dcb0df056874633cbf916098bd2
asm: avx2 code for dst4x4
AVX2:
dst4x4 4.53x 277.57 1256.64
SSE2:
dst4x4 2.91x 431.31 1255.54
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/asm-primitives.cpp Tue Mar 31 17:27:23 2015 +0530
@@ -1447,6 +1447,7 @@
#if X86_64
if (cpuMask & X265_CPU_AVX2)
{
+ p.dst4x4 = x265_dst4_avx2;
p.scale2D_64to32 = x265_scale2D_64to32_avx2;
p.cu[BLOCK_4x4].psy_cost_ss = x265_psyCost_ss_4x4_avx2;
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/const-a.asm
--- a/source/common/x86/const-a.asm Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/const-a.asm Tue Mar 31 17:27:23 2015 +0530
@@ -105,6 +105,9 @@
const multiH2, dw 17, 18, 19, 20, 21, 22, 23, 24
const multiH3, dw 25, 26, 27, 28, 29, 30, 31, 32
+ALIGN 32
+const trans8_shuf, dd 0, 4, 1, 5, 2, 6, 3, 7 ; vpermd index vector: interleaves the dwords of the low and high 128-bit lanes; keep 32-byte aligned (dst4 avx2 loads it with mova)
+
const popcnt_table
%assign x 0
%rep 256
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/dct8.asm Tue Mar 31 17:27:23 2015 +0530
@@ -261,6 +261,11 @@
times 2 dw 84, -29, -74, 55
times 2 dw 55, -84, 74, -29
+pw_dst4_tab: times 4 dw 29, 55, 74, 84 ; coefficient row 0; every row is 4 coefficients repeated 4x = 16 words = 32 bytes, read by dst4 avx2 as [tab + n * 32]
+ times 4 dw 74, 74, 0, -74 ; coefficient row 1
+ times 4 dw 84, -29, -74, 55 ; coefficient row 2
+ times 4 dw 55, -84, 74, -29 ; coefficient row 3
+
tab_idst4: times 4 dw 29, +84
times 4 dw +74, +55
times 4 dw 55, -29
@@ -316,7 +321,7 @@
cextern pd_1024
cextern pd_2048
cextern pw_ppppmmmm
-
+cextern trans8_shuf
;------------------------------------------------------
;void dct4(const int16_t* src, int16_t* dst, intptr_t srcStride)
;------------------------------------------------------
@@ -656,6 +661,59 @@
RET
+;------------------------------------------------------------------
+;void dst4(const int16_t* src, int16_t* dst, intptr_t srcStride)
+;------------------------------------------------------------------
+INIT_YMM avx2
+cglobal dst4, 3, 4, 6                           ; 4x4 forward DST; 3 args, 4 GPRs (r0-r3), 6 vector regs (m0-m5)
+%if BIT_DEPTH == 8
+  %define       DST_SHIFT   1
+    vpbroadcastd    m5, [pd_1]                  ; first-pass rounding term, added before >> DST_SHIFT
+%elif BIT_DEPTH == 10
+  %define       DST_SHIFT   3
+    vpbroadcastd    m5, [pd_4]                  ; first-pass rounding term, added before >> DST_SHIFT
+%endif
+    mova            m4, [trans8_shuf]           ; dword permute 0,4,1,5,2,6,3,7 (aligned load, table is ALIGN 32)
+    add             r2, r2                      ; stride: int16_t elements -> bytes; full-width because srcStride is intptr_t
+    lea             r3, [pw_dst4_tab]           ; r3 -> four 32-byte coefficient rows
+    ; NOTE(review): only BIT_DEPTH 8 and 10 set DST_SHIFT and m5 above; any other depth leaves both unset
+    movq            xm0, [r0 + 0 * r2]          ; source row 0 (4 x int16)
+    movhps          xm0, [r0 + 1 * r2]          ; source row 1
+    lea             r0, [r0 + 2 * r2]           ; advance two rows
+    movq            xm1, [r0]                   ; source row 2
+    movhps          xm1, [r0 + r2]              ; source row 3
+    ; rows 0-1 go to the low 128-bit lane, rows 2-3 to the high lane
+    vinserti128     m0, m0, xm1, 1              ; m0 = src[0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15]
+    ; pass 1: dot every source row with each coefficient row, round, shift by DST_SHIFT
+    pmaddwd         m2, m0, [r3 + 0 * 32]       ; pairwise products against coefficient row 0
+    pmaddwd         m1, m0, [r3 + 1 * 32]       ; pairwise products against coefficient row 1
+    phaddd          m2, m1                      ; finish the 4-tap sums for coefficient rows 0 and 1
+    paddd           m2, m5
+    psrad           m2, DST_SHIFT               ; rounded arithmetic shift
+    pmaddwd         m3, m0, [r3 + 2 * 32]       ; pairwise products against coefficient row 2
+    pmaddwd         m1, m0, [r3 + 3 * 32]       ; pairwise products against coefficient row 3
+    phaddd          m3, m1                      ; finish the 4-tap sums for coefficient rows 2 and 3
+    paddd           m3, m5
+    psrad           m3, DST_SHIFT
+    packssdw        m2, m3                      ; saturate the 16 sums to int16
+    vpermd          m2, m4, m2                  ; interleave lane dwords: pass-1 result is now stored transposed
+    ; pass 2: same kernel on the transposed intermediate, fixed shift of 8
+    vpbroadcastd    m5, [pd_128]                ; second-pass rounding term for >> 8 (presumably dword 128; defined outside this block)
+    pmaddwd         m0, m2, [r3 + 0 * 32]
+    pmaddwd         m1, m2, [r3 + 1 * 32]
+    phaddd          m0, m1                      ; sums for coefficient rows 0 and 1
+    paddd           m0, m5
+    psrad           m0, 8
+    pmaddwd         m3, m2, [r3 + 2 * 32]
+    pmaddwd         m2, m2, [r3 + 3 * 32]       ; m2 overwritten: last use of the pass-1 data
+    phaddd          m3, m2                      ; sums for coefficient rows 2 and 3
+    paddd           m3, m5
+    psrad           m3, 8
+    packssdw        m0, m3                      ; saturate to int16
+    vpermd          m0, m4, m0                  ; same lane interleave as after pass 1
+    movu            [r1], m0                    ; store all 16 int16 coefficients (32 bytes, unaligned store)
+    RET
+
;-------------------------------------------------------
;void idst4(const int16_t* src, int16_t* dst, intptr_t dstStride)
;-------------------------------------------------------
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/dct8.h
--- a/source/common/x86/dct8.h Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/dct8.h Tue Mar 31 17:27:23 2015 +0530
@@ -26,6 +26,7 @@
void x265_dct4_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dct8_sse2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dst4_ssse3(const int16_t* src, int16_t* dst, intptr_t srcStride);
+void x265_dst4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dct8_sse4(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dct4_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
void x265_dct8_avx2(const int16_t* src, int16_t* dst, intptr_t srcStride);
diff -r ac85c775620f -r ffa14b40f0ff source/common/x86/intrapred8.asm
--- a/source/common/x86/intrapred8.asm Tue Mar 31 20:04:28 2015 -0500
+++ b/source/common/x86/intrapred8.asm Tue Mar 31 17:27:23 2015 +0530
@@ -58,7 +58,6 @@
c_mode16_18: db 0, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1
ALIGN 32
-trans8_shuf: dd 0, 4, 1, 5, 2, 6, 3, 7
c_ang8_src1_9_2_10: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
c_ang8_26_20: db 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 6, 26, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20, 12, 20
c_ang8_src3_11_4_12: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
@@ -553,6 +552,7 @@
cextern multiH2
cextern multiH3
cextern multi_2Row
+cextern trans8_shuf
;---------------------------------------------------------------------------------------------
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
More information about the x265-devel
mailing list