[x264-devel] [PATCH 7/9] aarch64: motion compensation NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:53 CEST 2014
Ported from the ARM NEON asm.
---
Makefile | 3 +-
common/aarch64/mc-a.S | 1390 +++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/mc-c.c | 253 +++++++++
common/aarch64/mc.h | 29 ++
common/mc.c | 6 +
5 files changed, 1680 insertions(+), 1 deletion(-)
create mode 100644 common/aarch64/mc-a.S
create mode 100644 common/aarch64/mc-c.c
create mode 100644 common/aarch64/mc.h
diff --git a/Makefile b/Makefile
index d903393..b0d4a14 100644
--- a/Makefile
+++ b/Makefile
@@ -127,9 +127,10 @@ endif
ifeq ($(ARCH),AARCH64)
ifneq ($(AS),)
ASMSRC += common/aarch64/dct-a.S \
+ common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
common/aarch64/quant-a.S
-SRCS +=
+SRCS += common/aarch64/mc-c.c
OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
new file mode 100644
index 0000000..4a99a64
--- /dev/null
+++ b/common/aarch64/mc-a.S
@@ -0,0 +1,1390 @@
+/*****************************************************************************
+ * mc.S: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ * Mans Rullgard <mans at mansr.com>
+ * Stefan Groenroos <stefan.gronroos at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+// note: prefetch stuff assumes 64-byte cacheline
+
+// void prefetch_ref( uint8_t *pix, intptr_t stride, int parity )
+function x264_prefetch_ref_aarch64, export=1
+ cmp w2, #1
+ csel x2, xzr, x1, eq
+ add x0, x0, #64
+ add x0, x0, x2, lsl #3
+
+ lsl x2, x1, #1
+ add x3, x1, x1, lsl #1
+ add x4, x0, x1, lsl #2
+
+ prfm pldl1strm, [x0]
+ prfm pldl1strm, [x0, x1]
+ prfm pldl1strm, [x0, x2]
+ prfm pldl1strm, [x0, x3]
+ prfm pldl1strm, [x4]
+ prfm pldl1strm, [x4, x1]
+ prfm pldl1strm, [x4, x2]
+ prfm pldl1strm, [x4, x3]
+ ret
+endfunc
+
+// void prefetch_fenc( uint8_t *pix_y, intptr_t stride_y,
+// uint8_t *pix_uv, intptr_t stride_uv, int mb_x )
+.macro x264_prefetch_fenc sub
+function x264_prefetch_fenc_\sub\()_aarch64, export=1
+ and w6, w5, #3
+ and w7, w5, #3
+ mul x6, x6, x1
+ mul x7, x7, x3
+ add x0, x0, #64
+ add x2, x2, #64
+
+ add x0, x0, x6, lsl #2
+ add x6, x0, x1, lsl #1
+ prfm pldl1strm, [x0]
+ prfm pldl1strm, [x0, x1]
+ prfm pldl1strm, [x6]
+ prfm pldl1strm, [x6, x1]
+
+ add x2, x2, x7, lsl #1
+ prfm pldl1strm, [x2]
+ prfm pldl1strm, [x2, x3]
+.ifc \sub, 422
+ add x7, x2, x3, lsl #1
+ prfm pldl1strm, [x7]
+ prfm pldl1strm, [x7, x3]
+.endif
+ ret
+endfunc
+.endm
+
+x264_prefetch_fenc 420
+x264_prefetch_fenc 422
+
+// void pixel_avg( uint8_t *dst, intptr_t dst_stride,
+// uint8_t *src1, intptr_t src1_stride,
+// uint8_t *src2, intptr_t src2_stride, int weight );
+.macro AVGH w h
+function x264_pixel_avg_\w\()x\h\()_neon, export=1
+ mov w10, #64
+ cmp w6, #32
+ mov w9, #\h
+ b.eq pixel_avg_w\w\()_neon
+ subs w7, w10, w6
+ b.lt pixel_avg_weight_w\w\()_add_sub_neon // weight > 64
+ cmp w6, #0
+ b.ge pixel_avg_weight_w\w\()_add_add_neon
+ b pixel_avg_weight_w\w\()_sub_add_neon // weight < 0
+endfunc
+.endm
+
+AVGH 4, 2
+AVGH 4, 4
+AVGH 4, 8
+AVGH 4, 16
+AVGH 8, 4
+AVGH 8, 8
+AVGH 8, 16
+AVGH 16, 8
+AVGH 16, 16
+
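+// Rough reference for the weighted average below (a sketch, not the exact
+// instruction schedule):
+//   dst = clip(( src1*weight + src2*(64-weight) + 32 ) >> 6 )
+// Three entry points keep the multiplies unsigned: 0 < weight < 64 adds both
+// terms, weight > 64 subtracts the (weight-64) term, and weight < 0 negates
+// the weight and subtracts the src1 term instead.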
+// 0 < weight < 64
+.macro load_weights_add_add
+ mov w6, w6
+.endm
+.macro weight_add_add dst, s1, s2, h=
+.ifc \h, 2
+ umull2 \dst, \s1, v30.16b
+ umlal2 \dst, \s2, v31.16b
+.else
+ umull \dst, \s1, v30.8b
+ umlal \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight > 64
+.macro load_weights_add_sub
+ neg w7, w7
+.endm
+.macro weight_add_sub dst, s1, s2, h=
+.ifc \h, 2
+ umull2 \dst, \s1, v30.16b
+ umlsl2 \dst, \s2, v31.16b
+.else
+ umull \dst, \s1, v30.8b
+ umlsl \dst, \s2, v31.8b
+.endif
+.endm
+
+// weight < 0
+.macro load_weights_sub_add
+ neg w6, w6
+.endm
+.macro weight_sub_add dst, s1, s2, h=
+.ifc \h, 2
+ umull2 \dst, \s2, v31.16b
+ umlsl2 \dst, \s1, v30.16b
+.else
+ umull \dst, \s2, v31.8b
+ umlsl \dst, \s1, v30.8b
+.endif
+.endm
+
+.macro AVG_WEIGHT ext
+function pixel_avg_weight_w4_\ext\()_neon
+ load_weights_\ext
+ dup v30.8b, w6
+ dup v31.8b, w7
+1: // height loop
+ subs w9, w9, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x4], x5
+ weight_\ext v4.8h, v0.8b, v1.8b
+ ld1 {v2.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x5
+ sqrshrun v0.8b, v4.8h, #6
+ weight_\ext v5.8h, v2.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ sqrshrun v1.8b, v5.8h, #6
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_weight_w8_\ext\()_neon
+ load_weights_\ext
+ dup v30.8b, w6
+ dup v31.8b, w7
+1: // height loop
+ subs w9, w9, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x4], x5
+ weight_\ext v16.8h, v0.8b, v1.8b
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x4], x5
+ weight_\ext v17.8h, v2.8b, v3.8b
+ ld1 {v4.8b}, [x2], x3
+ ld1 {v5.8b}, [x4], x5
+ weight_\ext v18.8h, v4.8b, v5.8b
+ ld1 {v6.8b}, [x2], x3
+ ld1 {v7.8b}, [x4], x5
+ weight_\ext v19.8h, v6.8b, v7.8b
+ sqrshrun v0.8b, v16.8h, #6
+ sqrshrun v1.8b, v17.8h, #6
+ sqrshrun v2.8b, v18.8h, #6
+ sqrshrun v3.8b, v19.8h, #6
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_weight_w16_\ext\()_neon
+ load_weights_\ext
+ dup v30.16b, w6
+ dup v31.16b, w7
+1: // height loop
+ subs w9, w9, #2
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x4], x5
+ weight_\ext v16.8h, v0.8b, v1.8b
+ weight_\ext v17.8h, v0.16b, v1.16b, 2
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x4], x5
+ weight_\ext v18.8h, v2.8b, v3.8b
+ weight_\ext v19.8h, v2.16b, v3.16b, 2
+ sqrshrun v0.8b, v16.8h, #6
+ sqrshrun v1.8b, v18.8h, #6
+ sqrshrun2 v0.16b, v17.8h, #6
+ sqrshrun2 v1.16b, v19.8h, #6
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+AVG_WEIGHT add_add
+AVG_WEIGHT add_sub
+AVG_WEIGHT sub_add
+
+function pixel_avg_w4_neon
+1: subs w9, w9, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x4], x5
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x5
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_w8_neon
+1: subs w9, w9, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x4], x5
+ ld1 {v2.8b}, [x2], x3
+ urhadd v0.8b, v0.8b, v1.8b
+ ld1 {v3.8b}, [x4], x5
+ st1 {v0.8b}, [x0], x1
+ ld1 {v4.8b}, [x2], x3
+ urhadd v1.8b, v2.8b, v3.8b
+ ld1 {v5.8b}, [x4], x5
+ st1 {v1.8b}, [x0], x1
+ ld1 {v6.8b}, [x2], x3
+ ld1 {v7.8b}, [x4], x5
+ urhadd v2.8b, v4.8b, v5.8b
+ urhadd v3.8b, v6.8b, v7.8b
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function pixel_avg_w16_neon
+1: subs w9, w9, #4
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x4], x5
+ ld1 {v2.16b}, [x2], x3
+ urhadd v0.16b, v0.16b, v1.16b
+ ld1 {v3.16b}, [x4], x5
+ st1 {v0.16b}, [x0], x1
+ ld1 {v4.16b}, [x2], x3
+ urhadd v1.16b, v2.16b, v3.16b
+ ld1 {v5.16b}, [x4], x5
+ st1 {v1.16b}, [x0], x1
+ ld1 {v6.16b}, [x2], x3
+ ld1 {v7.16b}, [x4], x5
+ urhadd v2.16b, v4.16b, v5.16b
+ urhadd v3.16b, v6.16b, v7.16b
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
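+// pixel_avg2: plain rounding average of two references that share src_stride,
+// roughly dst[x] = ( src1[x] + src2[x] + 1 ) >> 1 (one urhadd per vector).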
+function x264_pixel_avg2_w4_neon, export=1
+1:
+ subs w5, w5, #2
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x4], x3
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x4], x3
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_pixel_avg2_w8_neon, export=1
+1:
+ subs w5, w5, #2
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v2.8b}, [x4], x3
+ urhadd v0.8b, v0.8b, v2.8b
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v3.8b}, [x4], x3
+ urhadd v1.8b, v1.8b, v3.8b
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_pixel_avg2_w16_neon, export=1
+1:
+ subs w5, w5, #2
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v2.16b}, [x4], x3
+ urhadd v0.16b, v0.16b, v2.16b
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v3.16b}, [x4], x3
+ urhadd v1.16b, v1.16b, v3.16b
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_pixel_avg2_w20_neon, export=1
+ sub x1, x1, #16
+1:
+ subs w5, w5, #2
+ ld1 {v0.16b,v1.16b}, [x2], x3
+ ld1 {v2.16b,v3.16b}, [x4], x3
+ urhadd v0.16b, v0.16b, v2.16b
+ urhadd v1.8b, v1.8b, v3.8b
+ ld1 {v4.16b,v5.16b}, [x2], x3
+ ld1 {v6.16b,v7.16b}, [x4], x3
+ urhadd v4.16b, v4.16b, v6.16b
+ urhadd v5.8b, v5.8b, v7.8b
+ st1 {v0.16b}, [x0], #16
+ st1 {v1.s}[0], [x0], x1
+ st1 {v4.16b}, [x0], #16
+ st1 {v5.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+.macro weight_prologue type
+ mov w9, w5 // height
+.ifc \type, full
+ ldr w12, [x4, #32] // denom
+.endif
+ ldp w4, w5, [x4, #32+4] // scale, offset
+ dup v0.8h, w4
+ dup v1.8h, w5
+.ifc \type, full
+ neg w12, w12
+ dup v2.8h, w12
+.endif
+.endm
+
+// void mc_weight( uint8_t *src, intptr_t src_stride, uint8_t *dst,
+// intptr_t dst_stride, const x264_weight_t *weight, int h )
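+// Rough reference for the weighted prediction below:
+//   full:    dst = clip( ((src*scale + (1 << (denom-1))) >> denom) + offset )
+//   nodenom: dst = clip( src*scale + offset )
+// srshl by -denom supplies the rounding shift, sqxtun the final clip to 8 bit.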
+function x264_mc_weight_w20_neon, export=1
+ weight_prologue full
+ sub x1, x1, #16
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
+ ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
+ uxtl v22.8h, v16.8b
+ uxtl v23.8h, v17.8b
+ zip1 v18.2s, v18.2s, v21.2s
+ uxtl v24.8h, v19.8b
+ uxtl v25.8h, v20.8b
+ uxtl v26.8h, v18.8b
+ mul v22.8h, v22.8h, v0.8h
+ mul v23.8h, v23.8h, v0.8h
+ mul v24.8h, v24.8h, v0.8h
+ mul v25.8h, v25.8h, v0.8h
+ mul v26.8h, v26.8h, v0.8h
+ srshl v22.8h, v22.8h, v2.8h
+ srshl v23.8h, v23.8h, v2.8h
+ srshl v24.8h, v24.8h, v2.8h
+ srshl v25.8h, v25.8h, v2.8h
+ srshl v26.8h, v26.8h, v2.8h
+ add v22.8h, v22.8h, v1.8h
+ add v23.8h, v23.8h, v1.8h
+ add v24.8h, v24.8h, v1.8h
+ add v25.8h, v25.8h, v1.8h
+ add v26.8h, v26.8h, v1.8h
+ sqxtun v4.8b, v22.8h
+ sqxtun2 v4.16b, v23.8h
+ sqxtun v5.8b, v24.8h
+ sqxtun2 v5.16b, v25.8h
+ sqxtun v6.8b, v26.8h
+ st1 {v4.16b}, [x0], #16
+ st1 {v6.s}[0], [x0], x1
+ st1 {v5.16b}, [x0], #16
+ st1 {v6.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w16_neon, export=1
+ weight_prologue full
+weight16_loop:
+1:
+ subs w9, w9, #2
+ ld1 {v4.16b}, [x2], x3
+ ld1 {v5.16b}, [x2], x3
+ uxtl v22.8h, v4.8b
+ uxtl2 v23.8h, v4.16b
+ uxtl v24.8h, v5.8b
+ uxtl2 v25.8h, v5.16b
+ mul v22.8h, v22.8h, v0.8h
+ mul v23.8h, v23.8h, v0.8h
+ mul v24.8h, v24.8h, v0.8h
+ mul v25.8h, v25.8h, v0.8h
+ srshl v22.8h, v22.8h, v2.8h
+ srshl v23.8h, v23.8h, v2.8h
+ srshl v24.8h, v24.8h, v2.8h
+ srshl v25.8h, v25.8h, v2.8h
+ add v22.8h, v22.8h, v1.8h
+ add v23.8h, v23.8h, v1.8h
+ add v24.8h, v24.8h, v1.8h
+ add v25.8h, v25.8h, v1.8h
+ sqxtun v4.8b, v22.8h
+ sqxtun2 v4.16b, v23.8h
+ sqxtun v5.8b, v24.8h
+ sqxtun2 v5.16b, v25.8h
+ st1 {v4.16b}, [x0], x1
+ st1 {v5.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w8_neon, export=1
+ weight_prologue full
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ uxtl v4.8h, v16.8b
+ uxtl v5.8h, v17.8b
+ mul v4.8h, v4.8h, v0.8h
+ mul v5.8h, v5.8h, v0.8h
+ srshl v4.8h, v4.8h, v2.8h
+ srshl v5.8h, v5.8h, v2.8h
+ add v4.8h, v4.8h, v1.8h
+ add v5.8h, v5.8h, v1.8h
+ sqxtun v16.8b, v4.8h
+ sqxtun v17.8b, v5.8h
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w4_neon, export=1
+ weight_prologue full
+1:
+ subs w9, w9, #2
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v16.s}[1], [x2], x3
+ uxtl v4.8h, v16.8b
+ mul v4.8h, v4.8h, v0.8h
+ srshl v4.8h, v4.8h, v2.8h
+ add v4.8h, v4.8h, v1.8h
+ sqxtun v16.8b, v4.8h
+ st1 {v16.s}[0], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w20_nodenom_neon, export=1
+ weight_prologue nodenom
+ sub x1, x1, #16
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b,v17.8b,v18.8b}, [x2], x3
+ mov v27.16b, v1.16b
+ mov v28.16b, v1.16b
+ ld1 {v19.8b,v20.8b,v21.8b}, [x2], x3
+ mov v29.16b, v1.16b
+ mov v30.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ uxtl v23.8h, v17.8b
+ zip1 v18.2s, v18.2s, v21.2s
+ mov v31.16b, v1.16b
+ uxtl v24.8h, v19.8b
+ uxtl v25.8h, v20.8b
+ uxtl v26.8h, v18.8b
+ mla v27.8h, v22.8h, v0.8h
+ mla v28.8h, v23.8h, v0.8h
+ mla v29.8h, v24.8h, v0.8h
+ mla v30.8h, v25.8h, v0.8h
+ mla v31.8h, v26.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ sqxtun2 v4.16b, v28.8h
+ sqxtun v5.8b, v29.8h
+ sqxtun2 v5.16b, v30.8h
+ sqxtun v6.8b, v31.8h
+ st1 {v4.16b}, [x0], #16
+ st1 {v6.s}[0], [x0], x1
+ st1 {v5.16b}, [x0], #16
+ st1 {v6.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w16_nodenom_neon, export=1
+ weight_prologue nodenom
+1:
+ subs w9, w9, #2
+ ld1 {v16.16b}, [x2], x3
+ mov v27.16b, v1.16b
+ mov v28.16b, v1.16b
+ ld1 {v17.16b}, [x2], x3
+ mov v29.16b, v1.16b
+ mov v30.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ uxtl2 v23.8h, v16.16b
+ uxtl v24.8h, v17.8b
+ uxtl2 v25.8h, v17.16b
+ mla v27.8h, v22.8h, v0.8h
+ mla v28.8h, v23.8h, v0.8h
+ mla v29.8h, v24.8h, v0.8h
+ mla v30.8h, v25.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ sqxtun2 v4.16b, v28.8h
+ sqxtun v5.8b, v29.8h
+ sqxtun2 v5.16b, v30.8h
+ st1 {v4.16b}, [x0], x1
+ st1 {v5.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w8_nodenom_neon, export=1
+ weight_prologue nodenom
+1:
+ subs w9, w9, #2
+ ld1 {v16.8b}, [x2], x3
+ mov v27.16b, v1.16b
+ ld1 {v17.8b}, [x2], x3
+ mov v29.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ uxtl v24.8h, v17.8b
+ mla v27.8h, v22.8h, v0.8h
+ mla v29.8h, v24.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ sqxtun v5.8b, v29.8h
+ st1 {v4.8b}, [x0], x1
+ st1 {v5.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w4_nodenom_neon, export=1
+ weight_prologue nodenom
+1:
+ subs w9, w9, #2
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v16.s}[1], [x2], x3
+ mov v27.16b, v1.16b
+ uxtl v22.8h, v16.8b
+ mla v27.8h, v22.8h, v0.8h
+ sqxtun v4.8b, v27.8h
+ st1 {v4.s}[0], [x0], x1
+ st1 {v4.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
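+// When scale == 1<<denom (see x264_weight_cache_neon in mc-c.c) the weight
+// reduces to a plain offset, roughly dst = clip( src +/- offset ), so these
+// variants only need a saturating byte add/sub of the offset magnitude.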
+.macro weight_simple_prologue
+ ldr w6, [x4] // offset
+ dup v1.16b, w6
+.endm
+
+.macro weight_simple name op
+function x264_mc_weight_w20_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ldr s18, [x2, #16]
+ ld1 {v16.16b}, [x2], x3
+ ldr s19, [x2, #16]
+ ld1 {v17.16b}, [x2], x3
+ \op v18.8b, v18.8b, v1.8b
+ \op v16.16b, v16.16b, v1.16b
+ \op v19.8b, v19.8b, v1.8b
+ \op v17.16b, v17.16b, v1.16b
+ str s18, [x0, #16]
+ st1 {v16.16b}, [x0], x1
+ str s19, [x0, #16]
+ st1 {v17.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w16_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ld1 {v16.16b}, [x2], x3
+ ld1 {v17.16b}, [x2], x3
+ \op v16.16b, v16.16b, v1.16b
+ \op v17.16b, v17.16b, v1.16b
+ st1 {v16.16b}, [x0], x1
+ st1 {v17.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w8_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ld1 {v16.8b}, [x2], x3
+ ld1 {v17.8b}, [x2], x3
+ \op v16.8b, v16.8b, v1.8b
+ \op v17.8b, v17.8b, v1.8b
+ st1 {v16.8b}, [x0], x1
+ st1 {v17.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_weight_w4_\name\()_neon, export=1
+ weight_simple_prologue
+1:
+ subs w5, w5, #2
+ ld1 {v16.s}[0], [x2], x3
+ ld1 {v16.s}[1], [x2], x3
+ \op v16.8b, v16.8b, v1.8b
+ st1 {v16.s}[0], [x0], x1
+ st1 {v16.s}[1], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+.endm
+
+weight_simple offsetadd, uqadd
+weight_simple offsetsub, uqsub
+
+
+// void mc_copy( uint8_t *dst, intptr_t dst_stride, uint8_t *src, intptr_t src_stride, int height )
+function x264_mc_copy_w4_neon, export=1
+1:
+ subs w4, w4, #4
+ ld1 {v0.s}[0], [x2], x3
+ ld1 {v1.s}[0], [x2], x3
+ ld1 {v2.s}[0], [x2], x3
+ ld1 {v3.s}[0], [x2], x3
+ st1 {v0.s}[0], [x0], x1
+ st1 {v1.s}[0], [x0], x1
+ st1 {v2.s}[0], [x0], x1
+ st1 {v3.s}[0], [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_copy_w8_neon, export=1
+1: subs w4, w4, #4
+ ld1 {v0.8b}, [x2], x3
+ ld1 {v1.8b}, [x2], x3
+ ld1 {v2.8b}, [x2], x3
+ ld1 {v3.8b}, [x2], x3
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x0], x1
+ st1 {v2.8b}, [x0], x1
+ st1 {v3.8b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+function x264_mc_copy_w16_neon, export=1
+1: subs w4, w4, #4
+ ld1 {v0.16b}, [x2], x3
+ ld1 {v1.16b}, [x2], x3
+ ld1 {v2.16b}, [x2], x3
+ ld1 {v3.16b}, [x2], x3
+ st1 {v0.16b}, [x0], x1
+ st1 {v1.16b}, [x0], x1
+ st1 {v2.16b}, [x0], x1
+ st1 {v3.16b}, [x0], x1
+ b.gt 1b
+ ret
+endfunc
+
+// void x264_mc_chroma_neon( uint8_t *dst_u, uint8_t *dst_v,
+// intptr_t i_dst_stride,
+// uint8_t *src, intptr_t i_src_stride,
+// int dx, int dy, int i_width, int i_height );
+function x264_mc_chroma_neon, export=1
+ ldr w15, [sp] // height
+ sbfx x12, x6, #3, #29 // asr(3) and sign extend
+ sbfx x11, x5, #3, #29 // asr(3) and sign extend
+ cmp w7, #4
+ mul x12, x12, x4
+ add x3, x3, x11, lsl #1
+
+ and w5, w5, #7
+ and w6, w6, #7
+
+ add x3, x3, x12
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ b.gt mc_chroma_w8_neon
+ b.eq mc_chroma_w4_neon
+endfunc
+
+.macro CHROMA_MC_START r00, r01, r10, r11
+ mul w12, w5, w6 // cD = d8x *d8y
+ lsl w13, w5, #3
+ add w9, w12, #64
+ lsl w14, w6, #3
+ tst w12, w12
+ sub w9, w9, w13
+ sub w10, w13, w12 // cB = d8x *(8-d8y);
+ sub w11, w14, w12 // cC = (8-d8x)*d8y
+ sub w9, w9, w14 // cA = (8-d8x)*(8-d8y);
+.endm
+
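+// Bilinear chroma interpolation; per output pixel this is roughly
+//   dst = ( cA*src[0] + cB*src[1] + cC*src[stride] + cD*src[stride+1] + 32 ) >> 6
+// with cA+cB+cC+cD == 64, e.g. d8x=3, d8y=5 gives cA=15 cB=9 cC=25 cD=15.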
+.macro CHROMA_MC width, vsize
+function mc_chroma_w\width\()_neon
+// since the element size varies, there's a different index for the 2nd store
+.if \width == 4
+ .set st2, 1
+.else
+ .set st2, 2
+.endif
+ CHROMA_MC_START
+ b.eq 2f
+
+ ld2 {v28.8b,v29.8b}, [x3], x4
+ dup v0.8b, w9 // cA
+ dup v1.8b, w10 // cB
+
+ ext v6.8b, v28.8b, v6.8b, #1
+ ext v7.8b, v29.8b, v7.8b, #1
+
+ ld2 {v30.8b,v31.8b}, [x3], x4
+ dup v2.8b, w11 // cC
+ dup v3.8b, w12 // cD
+
+ ext v22.8b, v30.8b, v22.8b, #1
+ ext v23.8b, v31.8b, v23.8b, #1
+
+ trn1 v0.2s, v0.2s, v1.2s
+ trn1 v2.2s, v2.2s, v3.2s
+
+ trn1 v4.2s, v28.2s, v6.2s
+ trn1 v5.2s, v29.2s, v7.2s
+ trn1 v20.2s, v30.2s, v22.2s
+ trn1 v21.2s, v31.2s, v23.2s
+1: // height loop, interpolate xy
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v20.8b, v2.8b
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v21.8b, v2.8b
+
+ ld2 {v28.8b,v29.8b}, [x3], x4
+ transpose v24.2d, v25.2d, v16.2d, v17.2d
+
+ ext v6.8b, v28.8b, v6.8b, #1
+ ext v7.8b, v29.8b, v7.8b, #1
+
+ trn1 v4.2s, v28.2s, v6.2s
+ trn1 v5.2s, v29.2s, v7.2s
+
+ add v16.8h, v24.8h, v25.8h
+
+ umull v18.8h, v20.8b, v0.8b
+ umlal v18.8h, v4.8b, v2.8b
+ umull v19.8h, v21.8b, v0.8b
+ umlal v19.8h, v5.8b, v2.8b
+
+ ld2 {v30.8b,v31.8b}, [x3], x4
+ transpose v26.2d, v27.2d, v18.2d, v19.2d
+
+ ext v22.8b, v30.8b, v22.8b, #1
+ ext v23.8b, v31.8b, v23.8b, #1
+ trn1 v20.2s, v30.2s, v22.2s
+ trn1 v21.2s, v31.2s, v23.2s
+
+ add v17.8h, v26.8h, v27.8h
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.\vsize}[0], [x0], x2
+ st1 {v16.\vsize}[st2], [x1], x2
+ st1 {v17.\vsize}[0], [x0], x2
+ st1 {v17.\vsize}[st2], [x1], x2
+ b.gt 1b
+
+ ret
+2: // dx or dy are 0
+ tst w11, w11
+ add w10, w10, w11
+ dup v0.8b, w9
+ dup v1.8b, w10
+
+ b.eq 4f
+
+ ld1 {v4.8b}, [x3], x4
+ ld1 {v6.8b}, [x3], x4
+3: // vertical interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ ld1 {v4.8b}, [x3], x4
+ umlal v16.8h, v6.8b, v1.8b
+ umull v17.8h, v6.8b, v0.8b
+ ld1 {v6.8b}, [x3], x4
+ umlal v17.8h, v4.8b, v1.8b
+
+ rshrn v20.8b, v16.8h, #6 // uvuvuvuv
+ rshrn v21.8b, v17.8h, #6 // uvuvuvuv
+
+ uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+ uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.\vsize}[0], [x0], x2
+ st1 {v16.\vsize}[st2], [x0], x2
+ st1 {v17.\vsize}[0], [x1], x2
+ st1 {v17.\vsize}[st2], [x1], x2
+ b.gt 3b
+
+ ret
+
+4: // dy is 0
+ ld1 {v4.8b,v5.8b}, [x3], x4
+ ld1 {v6.8b,v7.8b}, [x3], x4
+
+ ext v5.8b, v4.8b, v5.8b, #2
+ ext v7.8b, v6.8b, v7.8b, #2
+5: // horizontal interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v5.8b, v1.8b
+ umull v17.8h, v6.8b, v0.8b
+ umlal v17.8h, v7.8b, v1.8b
+
+ ld1 {v4.8b,v5.8b}, [x3], x4
+ ld1 {v6.8b,v7.8b}, [x3], x4
+ rshrn v20.8b, v16.8h, #6
+ rshrn v21.8b, v17.8h, #6
+ ext v5.8b, v4.8b, v5.8b, #2
+ ext v7.8b, v6.8b, v7.8b, #2
+ uzp1 v16.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+ uzp2 v17.8b, v20.8b, v21.8b // d16=uuuu|uuuu, d17=vvvv|vvvv
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.\vsize}[0], [x0], x2
+ st1 {v16.\vsize}[st2], [x0], x2
+ st1 {v17.\vsize}[0], [x1], x2
+ st1 {v17.\vsize}[st2], [x1], x2
+ b.gt 5b
+
+ ret
+endfunc
+.endm
+
+ CHROMA_MC 2, h
+ CHROMA_MC 4, s
+
+function mc_chroma_w8_neon
+ CHROMA_MC_START
+ b.eq 2f
+ ld2 {v4.16b,v5.16b}, [x3], x4
+ ld2 {v20.16b,v21.16b}, [x3], x4
+ dup v0.8b, w9 // cA
+ dup v1.8b, w10 // cB
+
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+
+ dup v2.8b, w11 // cC
+ dup v3.8b, w12 // cD
+
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+
+1: // height loop, interpolate xy
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b
+ umlal v16.8h, v6.8b, v1.8b
+ umlal v16.8h, v20.8b, v2.8b
+ umlal v16.8h, v22.8b, v3.8b
+
+ umull v17.8h, v5.8b, v0.8b
+ umlal v17.8h, v7.8b, v1.8b
+ umlal v17.8h, v21.8b, v2.8b
+ umlal v17.8h, v23.8b, v3.8b
+
+ ld2 {v4.16b,v5.16b}, [x3], x4
+
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+
+ umull v18.8h, v20.8b, v0.8b
+ umlal v18.8h, v22.8b, v1.8b
+ umlal v18.8h, v4.8b, v2.8b
+ umlal v18.8h, v6.8b, v3.8b
+
+ umull v19.8h, v21.8b, v0.8b
+ umlal v19.8h, v23.8b, v1.8b
+ umlal v19.8h, v5.8b, v2.8b
+ umlal v19.8h, v7.8b, v3.8b
+
+ ld2 {v20.16b,v21.16b}, [x3], x4
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ rshrn v18.8b, v18.8h, #6
+ rshrn v19.8b, v19.8h, #6
+
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x1], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x1], x2
+ b.gt 1b
+
+ ret
+2: // dx or dy are 0
+ tst w11, w11
+ add w10, w10, w11
+ dup v0.8b, w9
+ dup v1.8b, w10
+
+ b.eq 4f
+
+ ld2 {v4.8b,v5.8b}, [x3], x4
+ ld2 {v6.8b,v7.8b}, [x3], x4
+3: // vertical interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b //U
+ umlal v16.8h, v6.8b, v1.8b
+ umull v17.8h, v5.8b, v0.8b //V
+ umlal v17.8h, v7.8b, v1.8b
+
+ ld2 {v4.8b,v5.8b}, [x3], x4
+
+ umull v18.8h, v6.8b, v0.8b
+ umlal v18.8h, v4.8b, v1.8b
+ umull v19.8h, v7.8b, v0.8b
+ umlal v19.8h, v5.8b, v1.8b
+
+ ld2 {v6.8b,v7.8b}, [x3], x4
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ rshrn v18.8b, v18.8h, #6
+ rshrn v19.8b, v19.8h, #6
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x1], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x1], x2
+ b.gt 3b
+
+ ret
+4: // dy is 0
+ ld2 {v4.16b,v5.16b}, [x3], x4
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+ ld2 {v20.16b,v21.16b}, [x3], x4
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+5: // horizontal interpolation loop
+ subs w15, w15, #2
+ umull v16.8h, v4.8b, v0.8b //U
+ umlal v16.8h, v6.8b, v1.8b
+ umull v17.8h, v5.8b, v0.8b //V
+ umlal v17.8h, v7.8b, v1.8b
+
+ ld2 {v4.16b,v5.16b}, [x3], x4
+
+ umull v18.8h, v20.8b, v0.8b
+ umlal v18.8h, v22.8b, v1.8b
+ umull v19.8h, v21.8b, v0.8b
+ umlal v19.8h, v23.8b, v1.8b
+
+ ld2 {v20.16b,v21.16b}, [x3], x4
+
+ rshrn v16.8b, v16.8h, #6
+ rshrn v17.8b, v17.8h, #6
+ rshrn v18.8b, v18.8h, #6
+ rshrn v19.8b, v19.8h, #6
+
+ ext v6.16b, v4.16b, v4.16b, #1
+ ext v7.16b, v5.16b, v5.16b, #1
+ ext v22.16b, v20.16b, v20.16b, #1
+ ext v23.16b, v21.16b, v21.16b, #1
+
+ //pld [x3]
+ //pld [x3, x4]
+
+ st1 {v16.8b}, [x0], x2
+ st1 {v17.8b}, [x1], x2
+ st1 {v18.8b}, [x0], x2
+ st1 {v19.8b}, [x1], x2
+ b.gt 5b
+
+ ret
+endfunc
+
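+// Half-pel filter: the 6-tap kernel (1,-5,20,20,-5,1) in each direction,
+// roughly h[x] = clip(( s[x-2] - 5*s[x-1] + 20*s[x] + 20*s[x+1] - 5*s[x+2]
+// + s[x+3] + 16 ) >> 5) and likewise vertically for v; the centre plane
+// applies the same kernel to the 16-bit vertical sums via the staged
+// (a-b)/4 trick commented in the inner loop.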
+function x264_hpel_filter_neon, export=1
+ ubfm x9, x3, #0, #4
+ add w15, w5, w9
+ sub x13, x3, x9
+ sub x10, x0, x9
+ sub x11, x1, x9
+ sub x12, x2, x9
+1:
+ sub x3, x13, #16
+ mov x2, x12
+ mov x1, x11
+ ld1 {v7.16b}, [x3], #16
+ mov x0, x10
+ add x7, x3, #16 // src pointer next 16b for horiz filter
+ mov x5, x15
+ sub x3, x3, x4, lsl #1
+ movi v30.16b, #5
+ ld1 {v28.16b}, [x7], #16
+ movi v31.16b, #20
+ add x9, x3, w5, uxtw
+
+ ld1 {v16.16b}, [x3], x4
+ ld1 {v17.16b}, [x3], x4
+ ld1 {v18.16b}, [x3], x4
+ ld1 {v19.16b}, [x3], x4
+ ld1 {v20.16b}, [x3], x4
+ ld1 {v21.16b}, [x3], x4
+
+ ext v22.16b, v7.16b, v18.16b, #14
+ uaddl v1.8h, v16.8b, v21.8b
+ ext v26.16b, v18.16b, v28.16b, #3
+ umlsl v1.8h, v17.8b, v30.8b
+ ext v23.16b, v7.16b, v18.16b, #15
+ umlal v1.8h, v18.8b, v31.8b
+ ext v24.16b, v18.16b, v28.16b, #1
+ umlal v1.8h, v19.8b, v31.8b
+ ext v25.16b, v18.16b, v28.16b, #2
+ umlsl v1.8h, v20.8b, v30.8b
+2:
+ subs w5, w5, #16
+ sub x3, x9, w5, sxtw
+
+ uaddl v4.8h, v22.8b, v26.8b
+ uaddl2 v5.8h, v22.16b, v26.16b
+ sqrshrun v6.8b, v1.8h, #5
+ umlsl v4.8h, v23.8b, v30.8b
+ umlsl2 v5.8h, v23.16b, v30.16b
+ umlal v4.8h, v18.8b, v31.8b
+ umlal2 v5.8h, v18.16b, v31.16b
+ umlal v4.8h, v24.8b, v31.8b
+ umlal2 v5.8h, v24.16b, v31.16b
+ umlsl v4.8h, v25.8b, v30.8b
+ umlsl2 v5.8h, v25.16b, v30.16b
+
+ uaddl2 v2.8h, v16.16b, v21.16b
+ sqrshrun v4.8b, v4.8h, #5
+ mov v7.16b, v18.16b
+ sqrshrun2 v4.16b, v5.8h, #5
+
+ umlsl2 v2.8h, v17.16b, v30.16b
+ ld1 {v16.16b}, [x3], x4
+ umlal2 v2.8h, v18.16b, v31.16b
+ ld1 {v17.16b}, [x3], x4
+ umlal2 v2.8h, v19.16b, v31.16b
+ ld1 {v18.16b}, [x3], x4
+ umlsl2 v2.8h, v20.16b, v30.16b
+ ld1 {v19.16b}, [x3], x4
+ st1 {v4.16b}, [x0], #16
+ sqrshrun2 v6.16b, v2.8h, #5
+ ld1 {v20.16b}, [x3], x4
+ ld1 {v21.16b}, [x3], x4
+
+ ext v22.16b, v0.16b, v1.16b, #12
+ ext v26.16b, v1.16b, v2.16b, #6
+ ext v23.16b, v0.16b, v1.16b, #14
+ st1 {v6.16b}, [x1], #16
+ uaddl v3.8h, v16.8b, v21.8b
+ ext v25.16b, v1.16b, v2.16b, #4
+ umlsl v3.8h, v17.8b, v30.8b
+ ext v24.16b, v1.16b, v2.16b, #2
+
+ umlal v3.8h, v18.8b, v31.8b
+ add v4.8h, v22.8h, v26.8h
+ umlal v3.8h, v19.8b, v31.8b
+ add v5.8h, v23.8h, v25.8h
+ umlsl v3.8h, v20.8b, v30.8b
+ add v6.8h, v24.8h, v1.8h
+
+ ext v22.16b, v1.16b, v2.16b, #12
+ ext v26.16b, v2.16b, v3.16b, #6
+ ext v23.16b, v1.16b, v2.16b, #14
+ ext v25.16b, v2.16b, v3.16b, #4
+ ext v24.16b, v2.16b, v3.16b, #2
+
+ add v22.8h, v22.8h, v26.8h
+ add v23.8h, v23.8h, v25.8h
+ add v24.8h, v24.8h, v2.8h
+
+ sub v4.8h, v4.8h, v5.8h // a-b
+ sub v5.8h, v5.8h, v6.8h // b-c
+
+ sub v22.8h, v22.8h, v23.8h // a-b
+ sub v23.8h, v23.8h, v24.8h // b-c
+
+ sshr v4.8h, v4.8h, #2 // (a-b)/4
+ sshr v22.8h, v22.8h, #2 // (a-b)/4
+ sub v4.8h, v4.8h, v5.8h // (a-b)/4-b+c
+ sub v22.8h, v22.8h, v23.8h // (a-b)/4-b+c
+ sshr v4.8h, v4.8h, #2 // ((a-b)/4-b+c)/4
+ sshr v22.8h, v22.8h, #2 // ((a-b)/4-b+c)/4
+ add v4.8h, v4.8h, v6.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+ add v22.8h, v22.8h, v24.8h // ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
+
+ sqrshrun v4.8b, v4.8h, #6
+ ld1 {v28.16b}, [x7], #16
+ mov v0.16b, v2.16b
+ ext v23.16b, v7.16b, v18.16b, #15
+ sqrshrun2 v4.16b, v22.8h, #6
+ mov v1.16b, v3.16b
+ ext v22.16b, v7.16b, v18.16b, #14
+ ext v24.16b, v18.16b, v28.16b, #1
+ ext v25.16b, v18.16b, v28.16b, #2
+ ext v26.16b, v18.16b, v28.16b, #3
+
+ st1 {v4.16b}, [x2], #16
+ b.gt 2b
+
+ subs w6, w6, #1
+ add x10, x10, x4
+ add x11, x11, x4
+ add x12, x12, x4
+ add x13, x13, x4
+ b.gt 1b
+
+ ret
+endfunc
+
+// frame_init_lowres_core( uint8_t *src0, uint8_t *dst0, uint8_t *dsth,
+// uint8_t *dstv, uint8_t *dstc, intptr_t src_stride,
+// intptr_t dst_stride, int width, int height )
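+// Each lowres plane is a 2x downsample built from rounding averages, roughly
+//   dst0[x] = avg( avg(src0[2x], src1[2x]), avg(src0[2x+1], src1[2x+1]) )
+// with dsth/dstv/dstc shifting the 2x2 window right by one column, down by
+// one row, or both (urhadd supplies the (a+b+1)>>1 steps).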
+function x264_frame_init_lowres_core_neon, export=1
+ ldr w8, [sp]
+ sub x10, x6, w7, uxtw // dst_stride - width
+ and x10, x10, #~15
+
+1:
+ mov w9, w7 // width
+ mov x11, x0 // src0
+ add x12, x0, x5 // src1 = src0 + src_stride
+ add x13, x0, x5, lsl #1 // src2 = src1 + src_stride
+
+ ld2 {v0.16b,v1.16b}, [x11], #32
+ ld2 {v2.16b,v3.16b}, [x12], #32
+ ld2 {v4.16b,v5.16b}, [x13], #32
+
+ urhadd v20.16b, v0.16b, v2.16b // s0[2x] + s1[2x]
+ urhadd v22.16b, v2.16b, v4.16b // s1[2x] + s2[2x]
+2:
+ subs w9, w9, #16
+ urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
+ urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
+
+ ld2 {v0.16b,v1.16b}, [x11], #32
+ ld2 {v2.16b,v3.16b}, [x12], #32
+ ld2 {v4.16b,v5.16b}, [x13], #32
+ urhadd v30.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
+ urhadd v31.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
+ ext v24.16b, v20.16b, v30.16b, #1 // s0[2x+2] + s1[2x+2]
+ ext v25.16b, v22.16b, v31.16b, #1 // s1[2x+2] + s2[2x+2]
+
+ urhadd v16.16b, v20.16b, v21.16b
+ urhadd v18.16b, v22.16b, v23.16b
+ urhadd v17.16b, v21.16b, v24.16b
+ urhadd v19.16b, v23.16b, v25.16b
+
+ st1 {v16.16b}, [x1], #16
+ st1 {v18.16b}, [x3], #16
+ st1 {v17.16b}, [x2], #16
+ st1 {v19.16b}, [x4], #16
+ b.le 3f
+
+ subs w9, w9, #16
+ urhadd v21.16b, v1.16b, v3.16b // s0[2x+1] + s1[2x+1]
+ urhadd v23.16b, v3.16b, v5.16b // s1[2x+1] + s2[2x+1]
+
+ ld2 {v0.16b,v1.16b}, [x11], #32
+ ld2 {v2.16b,v3.16b}, [x12], #32
+ ld2 {v4.16b,v5.16b}, [x13], #32
+ urhadd v20.16b, v0.16b, v2.16b // loop: s0[2x] + s1[2x]
+ urhadd v22.16b, v2.16b, v4.16b // loop: s1[2x] + s2[2x]
+ ext v24.16b, v30.16b, v20.16b, #1 // s0[2x+2] + s1[2x+2]
+ ext v25.16b, v31.16b, v22.16b, #1 // s1[2x+2] + s2[2x+2]
+
+ urhadd v16.16b, v30.16b, v21.16b
+ urhadd v18.16b, v31.16b, v23.16b
+ urhadd v17.16b, v21.16b, v24.16b
+ urhadd v19.16b, v23.16b, v25.16b
+
+ st1 {v16.16b}, [x1], #16
+ st1 {v18.16b}, [x3], #16
+ st1 {v17.16b}, [x2], #16
+ st1 {v19.16b}, [x4], #16
+ b.gt 2b
+3:
+ subs w8, w8, #1
+ add x0, x0, x5, lsl #1
+ add x1, x1, x10
+ add x2, x2, x10
+ add x3, x3, x10
+ add x4, x4, x10
+ b.gt 1b
+
+ ret
+endfunc
+
+function x264_load_deinterleave_chroma_fenc_neon, export=1
+ mov x4, #FENC_STRIDE/2
+ b load_deinterleave_chroma
+endfunc
+
+function x264_load_deinterleave_chroma_fdec_neon, export=1
+ mov x4, #FDEC_STRIDE/2
+load_deinterleave_chroma:
+ ld2 {v0.8b,v1.8b}, [x1], x2
+ ld2 {v2.8b,v3.8b}, [x1], x2
+ subs w3, w3, #2
+ st1 {v0.8b}, [x0], x4
+ st1 {v1.8b}, [x0], x4
+ st1 {v2.8b}, [x0], x4
+ st1 {v3.8b}, [x0], x4
+ b.gt load_deinterleave_chroma
+
+ ret
+endfunc
+
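+// The plane_copy helpers round the width up to a multiple of 16 and
+// pre-adjust the strides accordingly, so each row is handled in whole
+// 16-byte chunks (the overshoot is assumed to land in the frame padding).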
+function x264_plane_copy_deinterleave_neon, export=1
+ add w9, w6, #15
+ and w9, w9, #0xfffffff0
+ sub x1, x1, x9
+ sub x3, x3, x9
+ sub x5, x5, x9, lsl #1
+1:
+ ld2 {v0.16b,v1.16b}, [x4], #32
+ subs w9, w9, #16
+ st1 {v0.16b}, [x0], #16
+ st1 {v1.16b}, [x2], #16
+ b.gt 1b
+
+ add x4, x4, x5
+ subs w7, w7, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ mov w9, w6
+ b.gt 1b
+
+ ret
+endfunc
+
+function x264_plane_copy_deinterleave_rgb_neon, export=1
+ ldr x8, [sp]
+ ldp x9, x10, [sp, #8]
+ cmp w8, #3
+ uxtw x9, w9
+ add x11, x9, #7
+ and x11, x11, #~7
+ sub x1, x1, x11
+ sub x3, x3, x11
+ sub x5, x5, x11
+ b.ne 4f
+ sub x7, x7, x11, lsl #1
+ sub x7, x7, x11
+block3:
+ ld3 {v0.8b,v1.8b,v2.8b}, [x6], #24
+ subs x11, x11, #8
+ st1 {v0.8b}, [x0], #8
+ st1 {v1.8b}, [x2], #8
+ st1 {v2.8b}, [x4], #8
+ b.gt block3
+
+ subs w10, w10, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ add x6, x6, x7
+ mov x11, x9
+ b.gt block3
+
+ ret
+4:
+ sub x7, x7, x11, lsl #2
+block4:
+ ld4 {v0.8b,v1.8b,v2.8b,v3.8b}, [x6], #32
+ subs x11, x11, #8
+ st1 {v0.8b}, [x0], #8
+ st1 {v1.8b}, [x2], #8
+ st1 {v2.8b}, [x4], #8
+ b.gt block4
+
+ subs w10, w10, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ add x6, x6, x7
+ mov x11, x9
+ b.gt block4
+
+ ret
+endfunc
+
+function x264_plane_copy_interleave_neon, export=1
+ add w9, w6, #15
+ and w9, w9, #0xfffffff0
+ sub x1, x1, x9, lsl #1
+ sub x3, x3, x9
+ sub x5, x5, x9
+1:
+ ld1 {v0.16b}, [x2], #16
+ ld1 {v1.16b}, [x4], #16
+ subs w9, w9, #16
+ st2 {v0.16b,v1.16b}, [x0], #32
+ b.gt 1b
+
+ subs w7, w7, #1
+ add x0, x0, x1
+ add x2, x2, x3
+ add x4, x4, x5
+ mov w9, w6
+ b.gt 1b
+
+ ret
+endfunc
+
+function x264_store_interleave_chroma_neon, export=1
+ mov x5, #FDEC_STRIDE
+1:
+ ld1 {v0.8b}, [x2], x5
+ ld1 {v1.8b}, [x3], x5
+ ld1 {v2.8b}, [x2], x5
+ ld1 {v3.8b}, [x3], x5
+ subs w4, w4, #2
+ zip1 v4.16b, v0.16b, v1.16b
+ zip1 v5.16b, v2.16b, v3.16b
+ st1 {v4.16b}, [x0], x1
+ st1 {v5.16b}, [x0], x1
+ b.gt 1b
+
+ ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
new file mode 100644
index 0000000..5554268
--- /dev/null
+++ b/common/aarch64/mc-c.c
@@ -0,0 +1,253 @@
+/*****************************************************************************
+ * mc-c.c: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "mc.h"
+
+void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
+void x264_memzero_aligned_neon( void *dst, size_t n );
+
+void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_8x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_pixel_avg_4x2_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_pixel_avg2_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
+
+void x264_plane_copy_deinterleave_neon( pixel *dstu, intptr_t i_dstu,
+ pixel *dstv, intptr_t i_dstv,
+ pixel *src, intptr_t i_src, int w, int h );
+void x264_plane_copy_deinterleave_rgb_neon( pixel *dsta, intptr_t i_dsta,
+ pixel *dstb, intptr_t i_dstb,
+ pixel *dstc, intptr_t i_dstc,
+ pixel *src, intptr_t i_src, int pw, int w, int h );
+void x264_plane_copy_interleave_neon( pixel *dst, intptr_t i_dst,
+ pixel *srcu, intptr_t i_srcu,
+ pixel *srcv, intptr_t i_srcv, int w, int h );
+
+void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+
+#define MC_WEIGHT(func)\
+void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w8##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+void x264_mc_weight_w4##func##_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
+\
+static void (* x264_mc##func##_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int ) =\
+{\
+ x264_mc_weight_w4##func##_neon,\
+ x264_mc_weight_w4##func##_neon,\
+ x264_mc_weight_w8##func##_neon,\
+ x264_mc_weight_w16##func##_neon,\
+ x264_mc_weight_w16##func##_neon,\
+ x264_mc_weight_w20##func##_neon,\
+};
+
+MC_WEIGHT()
+MC_WEIGHT(_nodenom)
+MC_WEIGHT(_offsetadd)
+MC_WEIGHT(_offsetsub)
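+/* The tables above are indexed by width>>2 in mc_luma/get_ref below, so
+ * widths 4/8/12/16/20 map to entries 1..5; width 12 simply reuses the w16
+ * kernel. */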
+
+void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
+
+void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
+
+void x264_hpel_filter_v_neon( uint8_t *, uint8_t *, int16_t *, intptr_t, int );
+void x264_hpel_filter_c_neon( uint8_t *, int16_t *, int );
+void x264_hpel_filter_h_neon( uint8_t *, uint8_t *, int );
+
+#if !HIGH_BIT_DEPTH
+static void x264_weight_cache_neon( x264_t *h, x264_weight_t *w )
+{
+ if( w->i_scale == 1<<w->i_denom )
+ {
+ if( w->i_offset < 0 )
+ {
+ w->weightfn = x264_mc_offsetsub_wtab_neon;
+ w->cachea[0] = -w->i_offset;
+ }
+ else
+ {
+ w->weightfn = x264_mc_offsetadd_wtab_neon;
+ w->cachea[0] = w->i_offset;
+ }
+ }
+ else if( !w->i_denom )
+ w->weightfn = x264_mc_nodenom_wtab_neon;
+ else
+ w->weightfn = x264_mc_wtab_neon;
+}
+
+static void (* const x264_pixel_avg_wtab_neon[6])( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int ) =
+{
+ NULL,
+ x264_pixel_avg2_w4_neon,
+ x264_pixel_avg2_w8_neon,
+ x264_pixel_avg2_w16_neon, // no slower than w12, so no point in a separate function
+ x264_pixel_avg2_w16_neon,
+ x264_pixel_avg2_w20_neon,
+};
+
+static void (* const x264_mc_copy_wtab_neon[5])( uint8_t *, intptr_t, uint8_t *, intptr_t, int ) =
+{
+ NULL,
+ x264_mc_copy_w4_neon,
+ x264_mc_copy_w8_neon,
+ NULL,
+ x264_mc_copy_w16_neon,
+};
+
+static const uint8_t hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const uint8_t hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
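+/* qpel_idx = ((mvy&3)<<2) + (mvx&3) enumerates the 16 quarter-pel positions;
+ * hpel_ref0/1 select which of the four half-pel planes (0 = full, 1 = h,
+ * 2 = v, 3 = c) are averaged for each position, e.g. qpel_idx 1 averages
+ * planes 1 and 0 to produce a horizontal quarter-pel sample. */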
+
+static void mc_luma_neon( uint8_t *dst, intptr_t i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
+ }
+ else if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
+ else
+ x264_mc_copy_wtab_neon[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, i_height );
+}
+
+static uint8_t *get_ref_neon( uint8_t *dst, intptr_t *i_dst_stride,
+ uint8_t *src[4], intptr_t i_src_stride,
+ int mvx, int mvy,
+ int i_width, int i_height, const x264_weight_t *weight )
+{
+ int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+ intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
+ uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset;
+ if ( (mvy&3) == 3 ) // explicit if() to force conditional add
+ src1 += i_src_stride;
+
+ if( qpel_idx & 5 ) /* qpel interpolation needed */
+ {
+ uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
+ x264_pixel_avg_wtab_neon[i_width>>2](
+ dst, *i_dst_stride, src1, i_src_stride,
+ src2, i_height );
+ if( weight->weightfn )
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
+ return dst;
+ }
+ else if( weight->weightfn )
+ {
+ weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
+ return dst;
+ }
+ else
+ {
+ *i_dst_stride = i_src_stride;
+ return src1;
+ }
+}
+
+void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
+ uint8_t *src, intptr_t stride, int width,
+ int height, int16_t *buf );
+#endif // !HIGH_BIT_DEPTH
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
+{
+#if !HIGH_BIT_DEPTH
+ if( cpu&X264_CPU_ARMV8 )
+ {
+ pf->prefetch_fenc_420 = x264_prefetch_fenc_420_aarch64;
+ pf->prefetch_fenc_422 = x264_prefetch_fenc_422_aarch64;
+ pf->prefetch_ref = x264_prefetch_ref_aarch64;
+ }
+
+ if( !(cpu&X264_CPU_NEON) )
+ return;
+
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_neon;
+ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_neon;
+ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_neon;
+
+ pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_neon;
+ pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_neon;
+ pf->plane_copy_interleave = x264_plane_copy_interleave_neon;
+
+ pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_neon;
+ pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_neon;
+ pf->store_interleave_chroma = x264_store_interleave_chroma_neon;
+
+ pf->avg[PIXEL_16x16] = x264_pixel_avg_16x16_neon;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_neon;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_neon;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_neon;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_neon;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_neon;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_neon;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_neon;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_neon;
+
+ pf->weight = x264_mc_wtab_neon;
+ pf->offsetadd = x264_mc_offsetadd_wtab_neon;
+ pf->offsetsub = x264_mc_offsetsub_wtab_neon;
+ pf->weight_cache = x264_weight_cache_neon;
+
+ pf->mc_chroma = x264_mc_chroma_neon;
+ pf->mc_luma = mc_luma_neon;
+ pf->get_ref = get_ref_neon;
+ pf->hpel_filter = x264_hpel_filter_neon;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/aarch64/mc.h b/common/aarch64/mc.h
new file mode 100644
index 0000000..feba321
--- /dev/null
+++ b/common/aarch64/mc.h
@@ -0,0 +1,29 @@
+/*****************************************************************************
+ * mc.h: aarch64 motion compensation
+ *****************************************************************************
+ * Copyright (C) 2014 x264 project
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_MC_H
+#define X264_AARCH64_MC_H
+
+void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
+
+#endif
diff --git a/common/mc.c b/common/mc.c
index 6797f0a..6a8b1b8 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -35,6 +35,9 @@
#if ARCH_ARM
#include "arm/mc.h"
#endif
+#if ARCH_AARCH64
+#include "aarch64/mc.h"
+#endif
static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride,
@@ -641,6 +644,9 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
#if HAVE_ARMV6
x264_mc_init_arm( cpu, pf );
#endif
+#if ARCH_AARCH64
+ x264_mc_init_aarch64( cpu, pf );
+#endif
if( cpu_independent )
{
--
2.0.0