[x265] [PATCH] asm: Fix dct[8x8] main12 AVX2
Aasaipriya Chandran
aasaipriya at multicorewareinc.com
Fri Oct 23 11:39:58 CEST 2015
# HG changeset patch
# User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
# Date 1445590185 -19800
# Fri Oct 23 14:19:45 2015 +0530
# Node ID 285f4165db02250aba9bfe8e83c889d923a5b58d
# Parent a7251c3e0ef810b95bb25be5371035208e36996d
asm: Fix dct[8x8] main12 AVX2
diff -r a7251c3e0ef8 -r 285f4165db02 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Thu Oct 22 09:12:28 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Fri Oct 23 14:19:45 2015 +0530
@@ -1583,9 +1583,9 @@
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
#if X265_DEPTH <= 10
- ALL_LUMA_TU_S(dct, dct, avx2);
ALL_LUMA_TU_S(idct, idct, avx2);
#endif
+ ALL_LUMA_TU_S(dct, dct, avx2);
ALL_LUMA_CU_S(transpose, transpose, avx2);
ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
diff -r a7251c3e0ef8 -r 285f4165db02 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Thu Oct 22 09:12:28 2015 +0530
+++ b/source/common/x86/dct8.asm Fri Oct 23 14:19:45 2015 +0530
@@ -2174,7 +2174,7 @@
pmaddwd m0, m%4
phaddd m2, m0
paddd m2, m5
- psrad m2, DCT_SHIFT
+ psrad m2, DCT8_SHIFT1
packssdw m2, m2
vpermq m2, m2, 0x08
mova [r5 + %2], xm2
@@ -2190,7 +2190,7 @@
phaddd m8, m9
phaddd m6, m8
paddd m6, m5
- psrad m6, DCT_SHIFT2
+ psrad m6, DCT8_SHIFT2
vbroadcasti128 m4, [r6 + %2]
pmaddwd m10, m0, m4
@@ -2201,7 +2201,7 @@
phaddd m8, m9
phaddd m10, m8
paddd m10, m5
- psrad m10, DCT_SHIFT2
+ psrad m10, DCT8_SHIFT2
packssdw m6, m10
vpermq m10, m6, 0xD8
@@ -2210,18 +2210,7 @@
INIT_YMM avx2
cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 12
- %define DCT_SHIFT 6
- vbroadcasti128 m5, [pd_16]
-%elif BIT_DEPTH == 10
- %define DCT_SHIFT 4
- vbroadcasti128 m5, [pd_8]
-%elif BIT_DEPTH == 8
- %define DCT_SHIFT 2
- vbroadcasti128 m5, [pd_2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+vbroadcasti128 m5, [pd_ %+ DCT8_ROUND1]
%define DCT_SHIFT2 9
add r2d, r2d
@@ -2265,7 +2254,7 @@
DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
;pass2
- vbroadcasti128 m5, [pd_256]
+ vbroadcasti128 m5, [pd_ %+ DCT8_ROUND2]
mova m0, [r5]
mova m1, [r5 + 32]
More information about the x265-devel mailing list