[x265] [PATCH 2 of 2] asm: fix output change bug in pixel_sa8d_16x16; the reason is intermediate result overflow
Min Chen
chenm003 at 163.com
Tue Jun 14 23:33:42 CEST 2016
# HG changeset patch
# User Min Chen <min.chen at multicorewareinc.com>
# Date 1465938978 18000
# Node ID 362976c6bf6853e75cec0e94e48941eed4737269
# Parent 3d8e1d324c9f4bd50eeb1addf85507b668ef3fe9
asm: fix output change bug in pixel_sa8d_16x16; the reason is intermediate result overflow
---
source/common/x86/pixel-a.asm | 22 +++++++++++++---------
1 files changed, 13 insertions(+), 9 deletions(-)
diff -r 3d8e1d324c9f -r 362976c6bf68 source/common/x86/pixel-a.asm
--- a/source/common/x86/pixel-a.asm Tue Jun 14 16:16:14 2016 -0500
+++ b/source/common/x86/pixel-a.asm Tue Jun 14 16:16:18 2016 -0500
@@ -13910,7 +13910,50 @@
lea r7, [r2+4*r3]
vbroadcasti128 m7, [pw_1]
- ;call pixel_sa8d_8x8_internal ; pix[0]
+ ; Top 16x8
+ ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
+ movu m0, [r0] ; 10 bits
+ movu m5, [r2]
+ psubw m0, m5 ; 11 bits
+ movu m1, [r0 + r1]
+ movu m6, [r2 + r3]
+ psubw m1, m6
+ movu m2, [r0 + r1 * 2]
+ movu m5, [r2 + r3 * 2]
+ psubw m2, m5
+ movu m8, [r0 + r4]
+ movu m6, [r2 + r5]
+ psubw m8, m6
+
+ ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
+ movu m4, [r6]
+ movu m11, [r7]
+ psubw m4, m11
+ movu m5, [r6 + r1]
+ movu m6, [r7 + r3]
+ psubw m5, m6
+ movu m3, [r6 + r1 * 2]
+ movu m11, [r7 + r3 * 2]
+ psubw m3, m11
+ movu m9, [r6 + r4]
+ movu m6, [r7 + r5]
+ psubw m9, m6
+
+ HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax ; 16 bits
+ pmaddwd m0, m7
+ pmaddwd m1, m7
+ pmaddwd m2, m7
+ pmaddwd m8, m7
+ paddd m0, m1
+ paddd m2, m8
+ paddd m10, m0, m2
+
+ lea r0, [r0+8*r1]
+ lea r2, [r2+8*r3]
+ lea r6, [r6+8*r1]
+ lea r7, [r7+8*r3]
+
+ ; Bottom 16x8
;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
movu m0, [r0]
movu m5, [r2]
@@ -13940,51 +13983,12 @@
psubw m9, m6
HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
- paddw m0, m1
- paddw m2, m8
pmaddwd m0, m7
+ pmaddwd m1, m7
pmaddwd m2, m7
- paddd m10, m0, m2
-
- lea r0, [r0+8*r1]
- lea r2, [r2+8*r3]
- lea r6, [r6+8*r1]
- lea r7, [r7+8*r3]
-
- ;call pixel_sa8d_8x8_internal ; pix[8*stride+8]
- ;LOAD_DIFF_8x4P_AVX2 0, 1, 2, 8, 5, 6, r0, r2
- movu m0, [r0]
- movu m5, [r2]
- psubw m0, m5
- movu m1, [r0 + r1]
- movu m6, [r2 + r3]
- psubw m1, m6
- movu m2, [r0 + r1 * 2]
- movu m5, [r2 + r3 * 2]
- psubw m2, m5
- movu m8, [r0 + r4]
- movu m6, [r2 + r5]
- psubw m8, m6
-
- ;LOAD_DIFF_8x4P_AVX2 4, 5, 3, 9, 11, 6, r6, r7
- movu m4, [r6]
- movu m11, [r7]
- psubw m4, m11
- movu m5, [r6 + r1]
- movu m6, [r7 + r3]
- psubw m5, m6
- movu m3, [r6 + r1 * 2]
- movu m11, [r7 + r3 * 2]
- psubw m3, m11
- movu m9, [r6 + r4]
- movu m6, [r7 + r5]
- psubw m9, m6
-
- HADAMARD8_2D 0, 1, 2, 8, 4, 5, 3, 9, 6, amax
- paddw m0, m1
- paddw m2, m8
- pmaddwd m0, m7
- pmaddwd m2, m7
+ pmaddwd m8, m7
+ paddd m0, m1
+ paddd m2, m8
paddd m10, m0
paddd m10, m2
More information about the x265-devel
mailing list