[x265] [PATCH 3 of 3] asm: fix dct[8x8] AVX2 asm for main12
dnyaneshwar at multicorewareinc.com
dnyaneshwar at multicorewareinc.com
Wed Dec 9 09:50:52 CET 2015
# HG changeset patch
# User Aasaipriya Chandran <aasaipriya at multicorewareinc.com>
# Date 1449648215 -19800
# Wed Dec 09 13:33:35 2015 +0530
# Node ID 9e3f71d784e59527a14702e83de474bc3f12fd15
# Parent 9357c1f448a7b987cebfd3cc5542cc6c65e63fe2
asm: fix dct[8x8] AVX2 asm for main12
diff -r 9357c1f448a7 -r 9e3f71d784e5 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Dec 01 15:16:12 2015 +0530
+++ b/source/common/x86/asm-primitives.cpp Wed Dec 09 13:33:35 2015 +0530
@@ -1573,9 +1573,8 @@
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32_avx2);
ALL_LUMA_TU_S(idct, idct, avx2);
-#if X265_DEPTH <= 10
ALL_LUMA_TU_S(dct, dct, avx2);
-#endif
+
ALL_LUMA_CU_S(transpose, transpose, avx2);
ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, avx2);
diff -r 9357c1f448a7 -r 9e3f71d784e5 source/common/x86/dct8.asm
--- a/source/common/x86/dct8.asm Tue Dec 01 15:16:12 2015 +0530
+++ b/source/common/x86/dct8.asm Wed Dec 09 13:33:35 2015 +0530
@@ -2174,7 +2174,7 @@
pmaddwd m0, m%4
phaddd m2, m0
paddd m2, m5
- psrad m2, DCT_SHIFT
+ psrad m2, DCT8_SHIFT1
packssdw m2, m2
vpermq m2, m2, 0x08
mova [r5 + %2], xm2
@@ -2190,7 +2190,7 @@
phaddd m8, m9
phaddd m6, m8
paddd m6, m5
- psrad m6, DCT_SHIFT2
+ psrad m6, DCT8_SHIFT2
vbroadcasti128 m4, [r6 + %2]
pmaddwd m10, m0, m4
@@ -2201,7 +2201,7 @@
phaddd m8, m9
phaddd m10, m8
paddd m10, m5
- psrad m10, DCT_SHIFT2
+ psrad m10, DCT8_SHIFT2
packssdw m6, m10
vpermq m10, m6, 0xD8
@@ -2210,18 +2210,7 @@
INIT_YMM avx2
cglobal dct8, 3, 7, 11, 0-8*16
-%if BIT_DEPTH == 12
- %define DCT_SHIFT 6
- vbroadcasti128 m5, [pd_16]
-%elif BIT_DEPTH == 10
- %define DCT_SHIFT 4
- vbroadcasti128 m5, [pd_8]
-%elif BIT_DEPTH == 8
- %define DCT_SHIFT 2
- vbroadcasti128 m5, [pd_2]
-%else
- %error Unsupported BIT_DEPTH!
-%endif
+vbroadcasti128 m5, [pd_ %+ DCT8_ROUND1]
%define DCT_SHIFT2 9
add r2d, r2d
@@ -2265,7 +2254,7 @@
DCT8_PASS_1 7 * 16, 7 * 16, 4, 1
;pass2
- vbroadcasti128 m5, [pd_256]
+ vbroadcasti128 m5, [pd_ %+ DCT8_ROUND2]
mova m0, [r5]
mova m1, [r5 + 32]
More information about the x265-devel mailing list