[x264-devel] arm: Don't use vcmp.f64 for testing for an all-zeros register
Martin Storsjö
git at videolan.org
Thu Dec 1 21:01:43 CET 2016
x264 | branch: master | Martin Storsjö <martin at martin.st> | Wed Nov 16 10:56:14 2016 +0200| [834e1b11e174f2694a4c81b4922c0c5f8778796a] | committer: Henrik Gramner
arm: Don't use vcmp.f64 for testing for an all-zeros register
On iOS, vcmp.f64 can behave as if the register were zero when the
register, interpreted as an f64, holds a denormal number.
vcmp.f64 (like other VFP instructions) traps to the kernel if the value
is a denormal. The kernel is supposed to implement the FP operation in
that case, which it apparently doesn't do properly on iOS. Whenever the
trap is taken, the whole comparison ends up far more costly.
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=834e1b11e174f2694a4c81b4922c0c5f8778796a
---
common/arm/deblock-a.S | 10 ++++++----
1 file changed, 6 insertions(+), 4 deletions(-)
diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index d781828..41306e2 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -211,8 +211,8 @@ endfunc
vclt.u8 q13, q4, q14 @ < (alpha >> 2) + 2 if_2
vand q12, q7, q6 @ if_1
vshrn.u16 d28, q12, #4
- vcmp.f64 d28, #0
- vmrs APSR_nzcv, FPSCR
+ vmov r2, lr, d28
+ orrs r2, r2, lr
beq 9f
sub sp, sp, #32
@@ -325,6 +325,7 @@ endfunc
.endm
function x264_deblock_v_luma_intra_neon
+ push {lr}
vld1.64 {d0, d1}, [r0,:128], r1
vld1.64 {d2, d3}, [r0,:128], r1
vld1.64 {d4, d5}, [r0,:128], r1
@@ -348,10 +349,11 @@ function x264_deblock_v_luma_intra_neon
vst1.64 {d4, d5}, [r0,:128]
9:
align_pop_regs
- bx lr
+ pop {pc}
endfunc
function x264_deblock_h_luma_intra_neon
+ push {lr}
sub r0, r0, #4
vld1.64 {d22}, [r0], r1
vld1.64 {d20}, [r0], r1
@@ -397,7 +399,7 @@ function x264_deblock_h_luma_intra_neon
vst1.64 {d7}, [r0], r1
9:
align_pop_regs
- bx lr
+ pop {pc}
endfunc
.macro h264_loop_filter_chroma
More information about the x264-devel mailing list