[x264-devel] arm: Don't use vcmp.f64 for testing for an all-zeros register

Martin Storsjö git at videolan.org
Thu Dec 1 21:01:43 CET 2016


x264 | branch: master | Martin Storsjö <martin at martin.st> | Wed Nov 16 10:56:14 2016 +0200| [834e1b11e174f2694a4c81b4922c0c5f8778796a] | committer: Henrik Gramner

arm: Don't use vcmp.f64 for testing for an all-zeros register

On iOS, vcmp.f64 can behave as if the register were zero when the
register, interpreted as an f64, holds a denormal number.

vcmp.f64 (like other VFP instructions) traps to the kernel if the
value is a denormal. The kernel is then supposed to implement the FP
operation, which it apparently doesn't do properly on iOS. Whenever
this trap occurs, the whole comparison ends up far more costly.

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=834e1b11e174f2694a4c81b4922c0c5f8778796a
---

 common/arm/deblock-a.S | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/common/arm/deblock-a.S b/common/arm/deblock-a.S
index d781828..41306e2 100644
--- a/common/arm/deblock-a.S
+++ b/common/arm/deblock-a.S
@@ -211,8 +211,8 @@ endfunc
     vclt.u8         q13, q4,  q14   @ < (alpha >> 2) + 2 if_2
     vand            q12, q7,  q6    @ if_1
     vshrn.u16       d28, q12,  #4
-    vcmp.f64        d28, #0
-    vmrs            APSR_nzcv, FPSCR
+    vmov            r2,  lr,  d28
+    orrs            r2,  r2,  lr
     beq             9f
 
     sub             sp,  sp,  #32
@@ -325,6 +325,7 @@ endfunc
 .endm
 
 function x264_deblock_v_luma_intra_neon
+    push            {lr}
     vld1.64         {d0, d1},  [r0,:128], r1
     vld1.64         {d2, d3},  [r0,:128], r1
     vld1.64         {d4, d5},  [r0,:128], r1
@@ -348,10 +349,11 @@ function x264_deblock_v_luma_intra_neon
     vst1.64         {d4, d5},  [r0,:128]
 9:
     align_pop_regs
-    bx              lr
+    pop             {pc}
 endfunc
 
 function x264_deblock_h_luma_intra_neon
+    push            {lr}
     sub             r0,  r0,  #4
     vld1.64         {d22}, [r0], r1
     vld1.64         {d20}, [r0], r1
@@ -397,7 +399,7 @@ function x264_deblock_h_luma_intra_neon
     vst1.64         {d7},  [r0], r1
 9:
     align_pop_regs
-    bx              lr
+    pop             {pc}
 endfunc
 
 .macro h264_loop_filter_chroma



More information about the x264-devel mailing list