[x264-devel] aarch64: implement x264_plane_copy_swap_neon

Tue Sep 20 20:57:51 CEST 2016

x264 | branch: master | Janne Grunau <janne-x264 at jannau.net> | Fri Aug 26 20:26:56 2016 +0300| [dc0fe73636d34baeb3a64918b52db64d2a9e83bb] | committer: Anton Mitrofanov

aarch64: implement x264_plane_copy_swap_neon

plane_copy_swap_c: 27054
plane_copy_swap_neon: 4152

> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=dc0fe73636d34baeb3a64918b52db64d2a9e83bb
---

 common/aarch64/mc-a.S | 28 ++++++++++++++++++++++++++++
 common/aarch64/mc-c.c |  4 ++++
 2 files changed, 32 insertions(+)

diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index fe0f870..3a99fbe 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1281,6 +1281,34 @@ function x264_plane_copy_core_neon, export=1
     ret
 endfunc
 
+function x264_plane_copy_swap_core_neon, export=1  // copy h rows, swapping the bytes of every 16-bit pair; x0=dst x1=i_dst x2=src x3=i_src w4=w w5=h
+    lsl         w4,  w4,  #1                        // w4 = 2*w = bytes processed per row (write to w4 zero-extends x4)
+    sub         x1,  x1,  x4                        // x1 = i_dst - row bytes: what is left to add after the post-incremented stores
+    sub         x3,  x3,  x4                        // x3 = i_src - row bytes: same for the post-incremented loads
+1:                                                  // start of one row
+    mov         w8,  w4                             // w8 = bytes remaining in this row
+    tbz         w4,  #4,  32f                       // bit 4 of the row size clear: no odd 16-byte chunk, go to the 32-byte loop
+    subs        w8,  w8,  #16                       // account for the odd 16-byte chunk; Z is set if it was the whole row
+    ld1         {v0.16b}, [x2], #16
+    rev16       v0.16b, v0.16b                      // swap the two bytes inside each 16-bit element
+    st1         {v0.16b}, [x0], #16
+    b.eq        0f                                  // flags still come from the subs above; nothing left in this row
+32:                                                 // 32 bytes per iteration
+    subs        w8,  w8,  #32                       // NOTE(review): assumes 2*w is a non-zero multiple of 16, else this loop overruns the row - confirm in caller
+    ld1         {v0.16b,v1.16b}, [x2], #32
+    rev16       v0.16b, v0.16b
+    rev16       v1.16b, v1.16b
+    st1         {v0.16b,v1.16b}, [x0], #32
+    b.gt        32b                                 // flags from the subs at the top of the loop; repeat while bytes remain
+0:                                                  // end of row
+    subs        w5,  w5,  #1                        // one row done; flags consumed by the b.gt below
+    add         x2,  x2,  x3                        // advance src to the start of the next row
+    add         x0,  x0,  x1                        // advance dst to the start of the next row
+    b.gt        1b                                  // add does not touch flags: loop while rows remain
+
+    ret
+endfunc
+
 function x264_plane_copy_deinterleave_neon, export=1
     add         w9,  w6,  #15
     and         w9,  w9,  #0xfffffff0
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 4f93965..09794d8 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -51,6 +51,8 @@ void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 
 void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                 pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
+                                     pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_deinterleave_neon(  pixel *dstu, intptr_t i_dstu,
                                          pixel *dstv, intptr_t i_dstv,
                                          pixel *src,  intptr_t i_src, int w, int h );
@@ -208,6 +210,7 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             int height, int16_t *buf );
 
 PLANE_COPY(16, neon)
+PLANE_COPY_SWAP(16, neon)
 PLANE_INTERLEAVE(neon)
 #endif // !HIGH_BIT_DEPTH
 
@@ -232,6 +235,7 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
     pf->copy[PIXEL_4x4]      = x264_mc_copy_w4_neon;
 
     pf->plane_copy                  = x264_plane_copy_neon;
+    pf->plane_copy_swap             = x264_plane_copy_swap_neon;
     pf->plane_copy_deinterleave     = x264_plane_copy_deinterleave_neon;
     pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
     pf->plane_copy_interleave       = x264_plane_copy_interleave_neon;