[x264-devel] [PATCHv2 2/6] arm: Don't use vcmp.f64 for testing for an all-zeros register
Martin Storsjö
martin at martin.st
Wed Nov 16 09:56:14 CET 2016
On iOS, vcmp.f64 can behave as if the register were zero when the
register, interpreted as an f64, actually holds a denormal number.
vcmp.f64 (like other VFP instructions) traps to the kernel if the value
is a denormal. The kernel is supposed to implement the FP operation, which
it apparently doesn't do properly on iOS. Whenever the trap is taken, the
whole comparison also ends up far more costly.
---
Updated to use lr as the temporary register.
---
common/arm/deblock-a.S | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index d781828..41306e2 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -211,8 +211,8 @@ endfunc
vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2
vand q12, q7, q6 @ if_1
vshrn.u16 d28, q12, #4
- vcmp.f64 d28, #0
- vmrs APSR_nzcv, FPSCR
+ vmov r2, lr, d28
+ orrs r2, r2, lr
beq 9f
sub sp, sp, #32
@@ -325,6 +325,7 @@ endfunc
.endm
function x264_deblock_v_luma_intra_neon
+ push {lr}
vld1.64 {d0, d1}, [r0,:128], r1
vld1.64 {d2, d3}, [r0,:128], r1
vld1.64 {d4, d5}, [r0,:128], r1
@@ -348,10 +349,11 @@ function x264_deblock_v_luma_intra_neon
vst1.64 {d4, d5}, [r0,:128]
9:
align_pop_regs
- bx lr
+ pop {pc}
endfunc
function x264_deblock_h_luma_intra_neon
+ push {lr}
sub r0, r0, #4
vld1.64 {d22}, [r0], r1
vld1.64 {d20}, [r0], r1
@@ -397,7 +399,7 @@ function x264_deblock_h_luma_intra_neon
vst1.64 {d7}, [r0], r1
9:
align_pop_regs
- bx lr
+ pop {pc}
endfunc
.macro h264_loop_filter_chroma
--
2.7.4
More information about the x264-devel mailing list