[x264-devel] arm: Implement x264_plane_copy_neon

Martin Storsjö git at videolan.org
Sun Oct 11 19:01:07 CEST 2015


x264 | branch: master | Martin Storsjö <martin at martin.st> | Fri Aug 28 00:15:01 2015 +0300| [5db8b6b93aa91079ab785b9b49413625430536fd] | committer: Henrik Gramner

arm: Implement x264_plane_copy_neon

checkasm timing       Cortex-A7      A8     A9
plane_copy_c                 13124   10925  9106
plane_copy_neon              7349    5103   8945

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=5db8b6b93aa91079ab785b9b49413625430536fd
---

 common/arm/mc-a.S |   32 ++++++++++++++++++++++++++++++++
 common/arm/mc-c.c |    3 +++
 2 files changed, 35 insertions(+)

diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 36ce86f..5e0c117 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -6,6 +6,7 @@
  * Authors: David Conrad <lessen42 at gmail.com>
  *          Mans Rullgard <mans at mansr.com>
  *          Stefan Groenroos <stefan.gronroos at gmail.com>
+ *          Janne Grunau <janne-x264 at jannau.net>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -1461,6 +1462,37 @@ function x264_load_deinterleave_chroma_fenc_neon
     bx              lr
 endfunc
 
+function x264_plane_copy_neon                  // copies h rows from src (r2) to dst (r0); C prototype: (dst, i_dst, src, i_src, w, h)
+    push            {r4,lr}                    // 8 bytes pushed, so the stack arguments now start at sp+8
+    ldr             r4,  [sp, #8]              // r4 = first stack argument (w in the C prototype)
+    ldr             lr,  [sp, #12]             // lr = second stack argument (h), used as the row counter
+    add             r12, r4,  #15
+    bic             r4,  r12, #15              // r4 = w rounded up to a multiple of 16 = bytes copied per row
+    sub             r1,  r1,  r4               // r1 = i_dst minus bytes copied per row (gap to next dst row)
+    sub             r3,  r3,  r4               // r3 = i_src minus bytes copied per row (gap to next src row)
+1:                                             // row loop
+    mov             r12, r4                    // r12 = bytes still to copy in this row
+16:
+    tst             r12, #16                   // is there an odd 16-byte chunk?
+    beq             32f                        // no: only 32-byte chunks remain
+    subs            r12, r12, #16
+    vld1.8          {q0}, [r2]!                // copy one 16-byte chunk; both pointers post-increment
+    vst1.8          {q0}, [r0]!
+    beq             0f                         // uses flags of the subs above: row done if nothing remains
+32:                                            // NOTE(review): bottom-tested, so 32 bytes are copied even if r12 is 0 on entry (w == 0) - confirm callers
+    subs            r12, r12, #32
+    vld1.8          {q0, q1}, [r2]!            // copy one 32-byte chunk
+    vst1.8          {q0, q1}, [r0]!
+    bgt             32b                        // uses flags of the subs above: loop while bytes remain
+0:                                             // end of row
+    subs            lr,  lr,  #1               // one row done; NOTE(review): tested after the copy, so h == 0 still copies a row
+    add             r2,  r2,  r3               // advance src to the start of the next row
+    add             r0,  r0,  r1               // advance dst to the start of the next row
+    bgt             1b                         // uses flags of the subs above (add does not set flags)
+
+    pop             {r4,pc}                    // restore r4 and return
+endfunc
+
 function x264_plane_copy_deinterleave_neon
     push            {r4-r7, lr}
     ldrd            r6, r7, [sp, #28]
diff --git a/common/arm/mc-c.c b/common/arm/mc-c.c
index 3fa18ec..dd86fb2 100644
--- a/common/arm/mc-c.c
+++ b/common/arm/mc-c.c
@@ -47,6 +47,8 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
+void x264_plane_copy_neon( pixel *dst, intptr_t i_dst,
+                           pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                          pixel *dstv, intptr_t i_dstv,
                                          pixel *src,  intptr_t i_src, int w, int h );
@@ -244,6 +246,7 @@ void x264_mc_init_arm( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_8x8]   = x264_mc_copy_w8_neon;
     pf->copy[PIXEL_4x4]   = x264_mc_copy_w4_neon;
 
+    pf->plane_copy              = x264_plane_copy_neon;
     pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
     pf->plane_copy_interleave = x264_plane_copy_interleave_neon;



More information about the x264-devel mailing list