[x265] [PATCH 3 of 4] asm: fix Main12 fault on intra_dc_avx2
Min Chen
chenm003 at 163.com
Wed Jul 22 01:20:04 CEST 2015
# HG changeset patch
# User Min Chen <chenm003 at 163.com>
# Date 1437514216 25200
# Node ID 486e77fdd864304b5016411daf507e3fdde5b618
# Parent 668adf85074fd29025f0a6ff3784a83bf04e4968
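asm: fix Main12 fault on intra_dc_avx2

With 12-bit samples the horizontal reduction of the neighbour sum overflowed
its 16-bit lanes: the word sums are already 15 bits wide after the paddw steps,
and the following paddw/phaddw pushed them past the signed 16-bit range that
pmaddwd expects as input. Widen to 32-bit lanes with pmaddwd against pw_1 as
soon as the sums reach 15 bits, and finish the reduction with paddd/phaddd.
intra_pred_dc32 needs one more register (m2) for this. The avx2 DC16/DC32
primitive assignments in asm-primitives.cpp are moved after the #endif so they
are no longer inside that conditional block.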
---
source/common/x86/asm-primitives.cpp | 2 +-
source/common/x86/intrapred16.asm | 32 ++++++++++++++++----------------
2 files changed, 17 insertions(+), 17 deletions(-)
diff -r 668adf85074f -r 486e77fdd864 source/common/x86/asm-primitives.cpp
--- a/source/common/x86/asm-primitives.cpp Tue Jul 21 14:30:14 2015 -0700
+++ b/source/common/x86/asm-primitives.cpp Tue Jul 21 14:30:16 2015 -0700
@@ -1485,10 +1485,10 @@
p.cu[BLOCK_64x64].psy_cost_pp = PFX(psyCost_pp_64x64_avx2);
p.cu[BLOCK_16x16].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar16_avx2);
p.cu[BLOCK_32x32].intra_pred[PLANAR_IDX] = PFX(intra_pred_planar32_avx2);
+#endif
p.cu[BLOCK_16x16].intra_pred[DC_IDX] = PFX(intra_pred_dc16_avx2);
p.cu[BLOCK_32x32].intra_pred[DC_IDX] = PFX(intra_pred_dc32_avx2);
-#endif
p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_avx2);
diff -r 668adf85074f -r 486e77fdd864 source/common/x86/intrapred16.asm
--- a/source/common/x86/intrapred16.asm Tue Jul 21 14:30:14 2015 -0700
+++ b/source/common/x86/intrapred16.asm Tue Jul 21 14:30:16 2015 -0700
@@ -473,14 +473,14 @@
add r1d, r1d
movu m0, [r2 + 66]
movu m2, [r2 + 2]
- paddw m0, m2
+ paddw m0, m2 ; dynamic range 13 bits
vextracti128 xm1, m0, 1
- paddw xm0, xm1
+ paddw xm0, xm1 ; dynamic range 14 bits
movhlps xm1, xm0
- paddw xm0, xm1
- phaddw xm0, xm0
+ paddw xm0, xm1 ; dynamic range 15 bits
pmaddwd xm0, [pw_1]
+ phaddd xm0, xm0
paddd xm0, [pd_16]
psrld xm0, 5
movd r5d, xm0
@@ -580,25 +580,25 @@
; void intra_pred_dc(pixel* dst, intptr_t dstStride, pixel *srcPix, int dirMode, int bFilter)
;---------------------------------------------------------------------------------------------
INIT_YMM avx2
-cglobal intra_pred_dc32, 3, 3, 2
+cglobal intra_pred_dc32, 3,3,3
add r2, 2
add r1d, r1d
movu m0, [r2]
movu m1, [r2 + 32]
- add r2, mmsize*4 ; r2 += 128
- paddw m0, m1
+ add r2, mmsize*4 ; r2 += 128
+ paddw m0, m1 ; dynamic range 13 bits
movu m1, [r2]
- paddw m0, m1
- movu m1, [r2 + 32]
- paddw m0, m1
+ movu m2, [r2 + 32]
+ paddw m1, m2 ; dynamic range 13 bits
+ paddw m0, m1 ; dynamic range 14 bits
vextracti128 xm1, m0, 1
- paddw xm0, xm1
+ paddw xm0, xm1 ; dynamic range 15 bits
+ pmaddwd xm0, [pw_1]
movhlps xm1, xm0
- paddw xm0, xm1
- phaddw xm0, xm0
- pmaddwd xm0, [pw_1]
- paddd xm0, [pd_32] ; sum = sum + 32
- psrld xm0, 6 ; sum = sum / 64
+ paddd xm0, xm1
+ phaddd xm0, xm0
+ paddd xm0, [pd_32] ; sum = sum + 32
+ psrld xm0, 6 ; sum = sum / 64
vpbroadcastw m0, xm0
lea r2, [r1 * 3]
More information about the x265-devel mailing list