[x264-devel] [PATCH 8/9] aarch64: intra prediction NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:54 CEST 2014
Ported from the ARM NEON asm.
---
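Notes (after the fold, not part of the commit message): the asm below
implements the usual H.264 intra predictors (DC/H/V, the diagonal modes
and plane prediction). As a rough C sketch of what e.g. the 4x4 DC
predictor computes -- illustration only, hypothetical names, not the
x264 reference code:

    /* dc = rounded average of the 4 top and 4 left neighbours;
     * FDEC_STRIDE is x264's decode-buffer stride (32 for 8-bit) */
    static void predict_4x4_dc_sketch( uint8_t *src )
    {
        int dc = 4; /* rounding term for the >>3 below */
        for( int i = 0; i < 4; i++ )
            dc += src[i - FDEC_STRIDE]     /* row above */
                + src[i*FDEC_STRIDE - 1];  /* column to the left */
        dc >>= 3;
        for( int y = 0; y < 4; y++ )
            for( int x = 0; x < 4; x++ )
                src[y*FDEC_STRIDE + x] = (uint8_t)dc;
    }

The NEON version computes the same sums with uaddlp/uaddl and folds the
+4 and >>3 into a single rounding narrow (rshrn #3).
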
Makefile | 4 +-
common/aarch64/predict-a.S | 662 +++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/predict-c.c | 114 ++++++++
common/aarch64/predict.h | 52 ++++
common/pixel.c | 19 +-
common/predict.c | 19 ++
6 files changed, 858 insertions(+), 12 deletions(-)
create mode 100644 common/aarch64/predict-a.S
create mode 100644 common/aarch64/predict-c.c
create mode 100644 common/aarch64/predict.h
diff --git a/Makefile b/Makefile
index b0d4a14..397b54d 100644
--- a/Makefile
+++ b/Makefile
@@ -129,8 +129,10 @@ ifneq ($(AS),)
ASMSRC += common/aarch64/dct-a.S \
common/aarch64/mc-a.S \
common/aarch64/pixel-a.S \
+ common/aarch64/predict-a.S \
common/aarch64/quant-a.S
-SRCS += common/aarch64/mc-c.c
+SRCS += common/aarch64/mc-c.c \
+ common/aarch64/predict-c.c
OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
diff --git a/common/aarch64/predict-a.S b/common/aarch64/predict-a.S
new file mode 100644
index 0000000..d3271b8
--- /dev/null
+++ b/common/aarch64/predict-a.S
@@ -0,0 +1,662 @@
+/*****************************************************************************
+ * predict.S: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ * Mans Rullgard <mans at mansr.com>
+ * Janne Grunau <janne-x264 at jannau.net>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.section .rodata
+.align 4
+
+p8weight: .short 1,2,3,4,1,2,3,4
+p16weight: .short 1,2,3,4,5,6,7,8
+
+.text
+
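+// load an \n-byte column from [\xn] (stride \xm) into the lanes of \vd;
+// with \n == 4, hi selects the low (hi=0) or high (hi=1) half of \vd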
+.macro ldcol.8 vd, xn, xm, n=8, hi=0
+.if \n == 8 || \hi == 0
+ ld1 {\vd\().b}[0], [\xn], \xm
+ ld1 {\vd\().b}[1], [\xn], \xm
+ ld1 {\vd\().b}[2], [\xn], \xm
+ ld1 {\vd\().b}[3], [\xn], \xm
+.endif
+.if \n == 8 || \hi == 1
+ ld1 {\vd\().b}[4], [\xn], \xm
+ ld1 {\vd\().b}[5], [\xn], \xm
+ ld1 {\vd\().b}[6], [\xn], \xm
+ ld1 {\vd\().b}[7], [\xn], \xm
+.endif
+.endm
+
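+// as ldcol.8, but fills all 16 byte lanes of \vd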
+.macro ldcol.16 vd, xn, xm
+ ldcol.8 \vd, \xn, \xm
+ ld1 {\vd\().b}[ 8], [\xn], \xm
+ ld1 {\vd\().b}[ 9], [\xn], \xm
+ ld1 {\vd\().b}[10], [\xn], \xm
+ ld1 {\vd\().b}[11], [\xn], \xm
+ ld1 {\vd\().b}[12], [\xn], \xm
+ ld1 {\vd\().b}[13], [\xn], \xm
+ ld1 {\vd\().b}[14], [\xn], \xm
+ ld1 {\vd\().b}[15], [\xn], \xm
+.endm
+
+
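+// pure scalar version: each left neighbour is splatted across its
+// 4-byte row by shift-and-add, then stored as one 32-bit word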
+function x264_predict_4x4_h_aarch64, export=1
+ ldrb w1, [x0, #0*FDEC_STRIDE-1]
+ ldrb w2, [x0, #1*FDEC_STRIDE-1]
+ ldrb w3, [x0, #2*FDEC_STRIDE-1]
+ ldrb w4, [x0, #3*FDEC_STRIDE-1]
+ add w1, w1, w1, lsl #8
+ add w2, w2, w2, lsl #8
+ add w3, w3, w3, lsl #8
+ add w4, w4, w4, lsl #8
+ add w1, w1, w1, lsl #16
+ str w1, [x0, #0*FDEC_STRIDE]
+ add w2, w2, w2, lsl #16
+ str w2, [x0, #1*FDEC_STRIDE]
+ add w3, w3, w3, lsl #16
+ str w3, [x0, #2*FDEC_STRIDE]
+ add w4, w4, w4, lsl #16
+ str w4, [x0, #3*FDEC_STRIDE]
+ ret
+endfunc
+
+function x264_predict_4x4_v_aarch64, export=1
+ ldr w1, [x0, #0 - 1 * FDEC_STRIDE]
+ str w1, [x0, #0 + 0 * FDEC_STRIDE]
+ str w1, [x0, #0 + 1 * FDEC_STRIDE]
+ str w1, [x0, #0 + 2 * FDEC_STRIDE]
+ str w1, [x0, #0 + 3 * FDEC_STRIDE]
+ ret
+endfunc
+
+function x264_predict_4x4_dc_neon, export=1
+ sub x1, x0, #FDEC_STRIDE
+ sub x2, x0, #1
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1]
+ ld1r {v1.8b}, [x2], x7
+ ld1r {v2.8b}, [x2], x7
+ ld1r {v3.8b}, [x2], x7
+ ld1r {v4.8b}, [x2], x7
+ uaddlp v0.4h, v0.8b
+ uaddl v1.8h, v1.8b, v2.8b
+ uaddl v2.8h, v3.8b, v4.8b
+ addp v0.4h, v0.4h, v0.4h
+ add v1.4h, v1.4h, v2.4h
+ dup v0.4h, v0.h[0]
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #3
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0]
+ ret
+endfunc
+
+function x264_predict_4x4_dc_top_neon, export=1
+ sub x1, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1]
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ dup v0.4h, v0.h[0]
+ rshrn v0.8b, v0.8h, #2
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0], #FDEC_STRIDE
+ str s0, [x0]
+ ret
+endfunc
+
+function x264_predict_4x4_ddr_neon, export=1
+ sub x1, x0, #FDEC_STRIDE+1
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1], x7 // # -FDEC_STRIDE-1
+ ld1r {v1.8b}, [x1], x7 // #0*FDEC_STRIDE-1
+ ld1r {v2.8b}, [x1], x7 // #1*FDEC_STRIDE-1
+ ext v0.8b, v1.8b, v0.8b, #7
+ ld1r {v3.8b}, [x1], x7 // #2*FDEC_STRIDE-1
+ ext v0.8b, v2.8b, v0.8b, #7 // a
+ ld1r {v4.8b}, [x1], x7 // #3*FDEC_STRIDE-1
+ ext v1.8b, v3.8b, v0.8b, #7 // b
+ ext v2.8b, v4.8b, v1.8b, #7 // c
+ uaddl v0.8h, v0.8b, v1.8b
+ uaddl v1.8h, v1.8b, v2.8b
+ add v0.8h, v0.8h, v1.8h
+ rshrn v0.8b, v0.8h, #2
+
+ ext v3.8b, v0.8b, v0.8b, #3
+ ext v2.8b, v0.8b, v0.8b, #2
+ ext v1.8b, v0.8b, v0.8b, #1
+
+ str s3, [x0], #FDEC_STRIDE
+ str s2, [x0], #FDEC_STRIDE
+ str s1, [x0], #FDEC_STRIDE
+ str s0, [x0]
+ ret
+endfunc
+
+function x264_predict_4x4_ddl_neon, export=1
+ sub x0, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x0], x7
+ dup v3.8b, v0.b[7]
+ ext v1.8b, v0.8b, v0.8b, #1
+ ext v2.8b, v0.8b, v3.8b, #2
+ uhadd v0.8b, v0.8b, v2.8b
+ urhadd v0.8b, v0.8b, v1.8b
+ str s0, [x0], #FDEC_STRIDE
+ ext v1.8b, v0.8b, v0.8b, #1
+ ext v2.8b, v0.8b, v0.8b, #2
+ str s1, [x0], #FDEC_STRIDE
+ ext v3.8b, v0.8b, v0.8b, #3
+ str s2, [x0], #FDEC_STRIDE
+ str s3, [x0]
+ ret
+endfunc
+
+function x264_predict_8x8_dc_neon, export=1
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.16b}, [x1], #16
+ ld1 {v1.8b}, [x1]
+ ext v0.16b, v0.16b, v0.16b, #7
+ uaddlv h1, v1.8b
+ uaddlv h0, v0.8b
+ add v0.8h, v0.8h, v1.8h
+ dup v0.8h, v0.h[0]
+ rshrn v0.8b, v0.8h, #4
+.rept 8
+ st1 {v0.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8_h_neon, export=1
+ mov x7, #FDEC_STRIDE
+ ld1 {v16.16b}, [x1]
+ dup v0.8b, v16.b[14]
+ dup v1.8b, v16.b[13]
+ st1 {v0.8b}, [x0], x7
+ dup v2.8b, v16.b[12]
+ st1 {v1.8b}, [x0], x7
+ dup v3.8b, v16.b[11]
+ st1 {v2.8b}, [x0], x7
+ dup v4.8b, v16.b[10]
+ st1 {v3.8b}, [x0], x7
+ dup v5.8b, v16.b[9]
+ st1 {v4.8b}, [x0], x7
+ dup v6.8b, v16.b[8]
+ st1 {v5.8b}, [x0], x7
+ dup v7.8b, v16.b[7]
+ st1 {v6.8b}, [x0], x7
+ st1 {v7.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_v_neon, export=1
+ add x1, x1, #16
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x1]
+.rept 8
+ st1 {v0.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8_ddl_neon, export=1
+ add x1, x1, #16
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.16b}, [x1]
+ movi v3.16b, #0
+ dup v2.16b, v0.b[15]
+ ext v4.16b, v3.16b, v0.16b, #15
+ ext v2.16b, v0.16b, v2.16b, #1
+ uhadd v4.16b, v4.16b, v2.16b
+ urhadd v0.16b, v0.16b, v4.16b
+ ext v1.16b, v0.16b, v0.16b, #1
+ ext v2.16b, v0.16b, v0.16b, #2
+ st1 {v1.8b}, [x0], x7
+ ext v3.16b, v0.16b, v0.16b, #3
+ st1 {v2.8b}, [x0], x7
+ ext v4.16b, v0.16b, v0.16b, #4
+ st1 {v3.8b}, [x0], x7
+ ext v5.16b, v0.16b, v0.16b, #5
+ st1 {v4.8b}, [x0], x7
+ ext v6.16b, v0.16b, v0.16b, #6
+ st1 {v5.8b}, [x0], x7
+ ext v7.16b, v0.16b, v0.16b, #7
+ st1 {v6.8b}, [x0], x7
+ ext v0.16b, v0.16b, v0.16b, #8
+ st1 {v7.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_ddr_neon, export=1
+ ld1 {v0.16b,v1.16b}, [x1]
+ ext v2.16b, v0.16b, v1.16b, #7
+ ext v4.16b, v0.16b, v1.16b, #9
+ ext v3.16b, v0.16b, v1.16b, #8
+
+ uhadd v2.16b, v2.16b, v4.16b
+ urhadd v7.16b, v3.16b, v2.16b
+
+ add x0, x0, #7*FDEC_STRIDE
+ mov x7, #-1*FDEC_STRIDE
+
+ ext v6.16b, v7.16b, v7.16b, #1
+ st1 {v7.8b}, [x0], x7
+ ext v5.16b, v7.16b, v7.16b, #2
+ st1 {v6.8b}, [x0], x7
+ ext v4.16b, v7.16b, v7.16b, #3
+ st1 {v5.8b}, [x0], x7
+ ext v3.16b, v7.16b, v7.16b, #4
+ st1 {v4.8b}, [x0], x7
+ ext v2.16b, v7.16b, v7.16b, #5
+ st1 {v3.8b}, [x0], x7
+ ext v1.16b, v7.16b, v7.16b, #6
+ st1 {v2.8b}, [x0], x7
+ ext v0.16b, v7.16b, v7.16b, #7
+ st1 {v1.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_vl_neon, export=1
+ add x1, x1, #16
+ mov x7, #FDEC_STRIDE
+
+ ld1 {v0.16b}, [x1]
+ ext v1.16b, v1.16b, v0.16b, #15
+ ext v2.16b, v0.16b, v2.16b, #1
+
+ uhadd v1.16b, v1.16b, v2.16b
+ urhadd v3.16b, v0.16b, v2.16b
+
+ urhadd v0.16b, v0.16b, v1.16b
+
+ ext v4.16b, v0.16b, v0.16b, #1
+ st1 {v3.8b}, [x0], x7
+ ext v5.16b, v3.16b, v3.16b, #1
+ st1 {v4.8b}, [x0], x7
+ ext v6.16b, v0.16b, v0.16b, #2
+ st1 {v5.8b}, [x0], x7
+ ext v7.16b, v3.16b, v3.16b, #2
+ st1 {v6.8b}, [x0], x7
+ ext v4.16b, v0.16b, v0.16b, #3
+ st1 {v7.8b}, [x0], x7
+ ext v5.16b, v3.16b, v3.16b, #3
+ st1 {v4.8b}, [x0], x7
+ ext v6.16b, v0.16b, v0.16b, #4
+ st1 {v5.8b}, [x0], x7
+ st1 {v6.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_vr_neon, export=1
+ add x1, x1, #8
+ mov x7, #FDEC_STRIDE
+ ld1 {v2.16b}, [x1]
+
+ ext v1.16b, v2.16b, v2.16b, #14
+ ext v0.16b, v2.16b, v2.16b, #15
+
+ uhadd v3.16b, v2.16b, v1.16b
+ urhadd v2.16b, v2.16b, v0.16b
+ urhadd v0.16b, v0.16b, v3.16b
+
+ ext v1.16b, v2.16b, v2.16b, #8
+ uzp1 v2.8b, v0.8b, v0.8b
+ uzp2 v3.8b, v0.8b, v0.8b
+ ext v0.16b, v0.16b, v0.16b, #8
+
+ st1 {v1.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ext v4.8b, v3.8b, v1.8b, #7
+ ext v5.8b, v2.8b, v0.8b, #7
+ st1 {v4.8b}, [x0], x7
+ st1 {v5.8b}, [x0], x7
+ ext v6.8b, v3.8b, v1.8b, #6
+ ext v7.8b, v2.8b, v0.8b, #6
+ st1 {v6.8b}, [x0], x7
+ st1 {v7.8b}, [x0], x7
+ ext v1.8b, v3.8b, v1.8b, #5
+ ext v0.8b, v2.8b, v0.8b, #5
+ st1 {v1.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ ret
+endfunc
+
+function x264_predict_8x8_hd_neon, export=1
+ add x1, x1, #7
+ mov x7, #FDEC_STRIDE
+
+ ld1 {v1.16b}, [x1]
+ ext v3.16b, v1.16b, v1.16b, #1
+ ext v2.16b, v1.16b, v1.16b, #2
+
+ urhadd v4.16b, v1.16b, v3.16b
+
+ uhadd v1.16b, v1.16b, v2.16b
+ urhadd v0.16b, v1.16b, v3.16b
+
+ zip1 v16.8b, v4.8b, v0.8b
+ zip2 v17.8b, v4.8b, v0.8b
+ ext v7.16b, v0.16b, v0.16b, #8
+
+ ext v0.8b, v17.8b, v7.8b, #6
+ ext v1.8b, v17.8b, v7.8b, #4
+ st1 {v0.8b}, [x0], x7
+ ext v2.8b, v17.8b, v7.8b, #2
+ st1 {v1.8b}, [x0], x7
+ st1 {v2.8b}, [x0], x7
+ ext v3.8b, v16.8b, v17.8b, #6
+ st1 {v17.8b}, [x0], x7
+ ext v4.8b, v16.8b, v17.8b, #4
+ st1 {v3.8b}, [x0], x7
+ ext v5.8b, v16.8b, v17.8b, #2
+ st1 {v4.8b}, [x0], x7
+ st1 {v5.8b}, [x0], x7
+ st1 {v16.8b}, [x0], x7
+
+ ret
+endfunc
+
+function x264_predict_8x8_hu_neon, export=1
+ add x1, x1, #7
+ mov x7, #FDEC_STRIDE
+ ld1 {v7.8b}, [x1]
+ dup v6.8b, v7.b[0]
+ rev64 v7.8b, v7.8b
+
+ ext v4.8b, v7.8b, v6.8b, #2
+ ext v2.8b, v7.8b, v6.8b, #1
+
+ uhadd v5.8b, v7.8b, v4.8b
+ urhadd v0.8b, v2.8b, v7.8b
+ urhadd v1.8b, v5.8b, v2.8b
+
+ zip1 v16.8b, v0.8b, v1.8b
+ zip2 v17.8b, v0.8b, v1.8b
+
+ dup v18.4h, v17.h[3]
+
+ ext v0.8b, v16.8b, v17.8b, #2
+ ext v1.8b, v16.8b, v17.8b, #4
+ ext v2.8b, v16.8b, v17.8b, #6
+ st1 {v16.8b}, [x0], x7
+ st1 {v0.8b}, [x0], x7
+ st1 {v1.8b}, [x0], x7
+ st1 {v2.8b}, [x0], x7
+
+ ext v4.8b, v17.8b, v18.8b, #2
+ ext v5.8b, v17.8b, v18.8b, #4
+ ext v6.8b, v17.8b, v18.8b, #6
+ st1 {v17.8b}, [x0], x7
+ st1 {v4.8b}, [x0], x7
+ st1 {v5.8b}, [x0], x7
+ st1 {v6.8b}, [x0]
+ ret
+endfunc
+
+
+function x264_predict_8x8c_dc_top_neon, export=1
+ sub x2, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ ld1 {v0.8b}, [x2]
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ rshrn v0.8b, v0.8h, #2
+ dup v3.8b, v0.b[1]
+ dup v2.8b, v0.b[0]
+ transpose v0.2s, v1.2s, v2.2s, v3.2s
+ b pred8x8c_dc_end
+endfunc
+
+function x264_predict_8x8c_dc_left_neon, export=1
+ sub x2, x0, #1
+ mov x1, #FDEC_STRIDE
+ ldcol.8 v0, x2, x1
+ uaddlp v0.4h, v0.8b
+ addp v0.4h, v0.4h, v0.4h
+ rshrn v0.8b, v0.8h, #2
+ dup v1.8b, v0.b[1]
+ dup v0.8b, v0.b[0]
+ b pred8x8c_dc_end
+endfunc
+
+function x264_predict_8x8c_dc_neon, export=1
+ sub x2, x0, #FDEC_STRIDE
+ sub x3, x0, #1
+ mov x1, #FDEC_STRIDE
+ ld1 {v2.8b}, [x2]
+ ldcol.8 v3, x3, x1
+ transpose v0.2s, v1.2s, v2.2s, v3.2s
+ uaddlp v0.4h, v0.8b // s0, s2
+ uaddlp v1.4h, v1.8b // s1, s3
+ addp v0.4h, v0.4h, v1.4h // s0, s2, s1, s3
+ addp v1.4h, v0.4h, v0.4h
+ rshrn v2.8b, v0.8h, #2
+ rshrn v3.8b, v1.8h, #3
+ dup v5.8b, v2.b[2] // dc1
+ dup v6.8b, v3.b[1] // dc2
+ dup v4.8b, v3.b[0] // dc0
+ dup v7.8b, v2.b[3] // dc3
+ trn1 v0.2s, v4.2s, v5.2s
+ trn1 v1.2s, v7.2s, v6.2s
+pred8x8c_dc_end:
+ add x2, x0, x1, lsl #2
+.rept 4
+ st1 {v0.8b}, [x0], x1
+ st1 {v1.8b}, [x2], x1
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8c_h_neon, export=1
+ sub x1, x0, #1
+ mov x7, #FDEC_STRIDE
+.rept 4
+ ld1r {v0.8b}, [x1], x7
+ ld1r {v1.8b}, [x1], x7
+ st1 {v0.8b}, [x0], x7
+ st1 {v1.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_8x8c_v_neon, export=1
+ sub x0, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.8b}, [x0], x7
+.rept 8
+ st1 {v0.8b}, [x0], x7
+.endr
+ ret
+endfunc
+
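+// chroma plane prediction:
+// pred[y][x] = clip((a + b*(x-3) + c*(y-3) + 16) >> 5), with a, b, c
+// derived from the top and left borders as in the H.264 spec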
+function x264_predict_8x8c_p_neon, export=1
+ sub x3, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ add x2, x3, #4
+ sub x3, x3, #1
+ ld1 {v0.s}[0], [x3]
+ ld1 {v2.s}[0], [x2], x1
+ ldcol.8 v0, x3, x1, 4, hi=1
+ add x3, x3, x1
+ ldcol.8 v3, x3, x1, 4
+ movrel x4, p8weight
+ movrel x5, p16weight
+ uaddl v4.8h, v2.8b, v3.8b
+ rev32 v0.8b, v0.8b
+ trn1 v2.2s, v2.2s, v3.2s
+ ld1 {v7.8h}, [x4]
+ usubl v2.8h, v2.8b, v0.8b
+ mul v2.8h, v2.8h, v7.8h
+ ld1 {v0.8h}, [x5]
+ saddlp v2.4s, v2.8h
+ addp v2.4s, v2.4s, v2.4s
+ shl v3.2s, v2.2s, #4
+ add v2.2s, v2.2s, v3.2s
+ rshrn v5.4h, v2.4s, #5 // b, c, x, x
+ addp v2.4h, v5.4h, v5.4h
+ shl v3.4h, v2.4h, #2
+ sub v3.4h, v3.4h, v2.4h // 3 * (b + c)
+ rev64 v4.4h, v4.4h
+ add v4.4h, v4.4h, v0.4h
+ shl v2.4h, v4.4h, #4 // a
+ sub v2.4h, v2.4h, v3.4h // a - 3 * (b + c) + 16
+ ext v0.16b, v0.16b, v0.16b, #14
+ sub v6.4h, v5.4h, v3.4h
+ mov v0.h[0], wzr
+ mul v0.8h, v0.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
+ dup v1.8h, v2.h[0] // pix
+ dup v2.8h, v5.h[1] // c
+ add v1.8h, v1.8h, v0.8h // pix + x*b
+ mov x3, #8
+1:
+ subs x3, x3, #1
+ sqshrun v0.8b, v1.8h, #5
+ add v1.8h, v1.8h, v2.8h
+ st1 {v0.8b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
+
+
+function x264_predict_16x16_dc_top_neon, export=1
+ sub x2, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ ld1 {v0.16b}, [x2]
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ b pred16x16_dc_end
+endfunc
+
+function x264_predict_16x16_dc_left_neon, export=1
+ sub x2, x0, #1
+ mov x1, #FDEC_STRIDE
+ ldcol.16 v0, x2, x1
+ uaddlv h0, v0.16b
+ rshrn v0.8b, v0.8h, #4
+ dup v0.16b, v0.b[0]
+ b pred16x16_dc_end
+endfunc
+
+function x264_predict_16x16_dc_neon, export=1
+ sub x3, x0, #FDEC_STRIDE
+ sub x2, x0, #1
+ mov x1, #FDEC_STRIDE
+ ld1 {v0.16b}, [x3]
+ ldcol.16 v1, x2, x1
+ uaddlv h0, v0.16b
+ uaddlv h1, v1.16b
+ add v0.4h, v0.4h, v1.4h
+ rshrn v0.8b, v0.8h, #5
+ dup v0.16b, v0.b[0]
+pred16x16_dc_end:
+.rept 16
+ st1 {v0.16b}, [x0], x1
+.endr
+ ret
+endfunc
+
+function x264_predict_16x16_h_neon, export=1
+ sub x1, x0, #1
+ mov x7, #FDEC_STRIDE
+.rept 8
+ ld1r {v0.16b}, [x1], x7
+ ld1r {v1.16b}, [x1], x7
+ st1 {v0.16b}, [x0], x7
+ st1 {v1.16b}, [x0], x7
+.endr
+ ret
+endfunc
+
+function x264_predict_16x16_v_neon, export=1
+ sub x0, x0, #FDEC_STRIDE
+ mov x7, #FDEC_STRIDE
+ ld1 {v0.16b}, [x0], x7
+.rept 16
+ st1 {v0.16b}, [x0], x7
+.endr
+ ret
+endfunc
+
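+// luma plane prediction:
+// pred[y][x] = clip((a + b*(x-7) + c*(y-7) + 16) >> 5)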
+function x264_predict_16x16_p_neon, export=1
+ sub x3, x0, #FDEC_STRIDE
+ mov x1, #FDEC_STRIDE
+ add x2, x3, #8
+ sub x3, x3, #1
+ ld1 {v0.8b}, [x3]
+ ld1 {v2.8b}, [x2], x1
+ ldcol.8 v1, x3, x1
+ add x3, x3, x1
+ ldcol.8 v3, x3, x1
+ rev64 v0.8b, v0.8b
+ rev64 v1.8b, v1.8b
+ movrel x4, p16weight
+ uaddl v4.8h, v2.8b, v3.8b
+ ld1 {v7.8h}, [x4]
+ usubl v2.8h, v2.8b, v0.8b
+ usubl v3.8h, v3.8b, v1.8b
+ mul v2.8h, v2.8h, v7.8h
+ mul v3.8h, v3.8h, v7.8h
+ saddlp v2.4s, v2.8h
+ saddlp v3.4s, v3.8h
+ addp v2.4s, v2.4s, v3.4s
+ addp v2.4s, v2.4s, v2.4s
+ shl v3.2s, v2.2s, #2
+ add v2.2s, v2.2s, v3.2s
+ rshrn v5.4h, v2.4s, #6 // b, c, x, x
+ addp v2.4h, v5.4h, v5.4h
+ shl v3.4h, v2.4h, #3
+ sub v3.4h, v3.4h, v2.4h // 7 * (b + c)
+ ext v4.16b, v4.16b, v4.16b, #14
+ add v4.4h, v4.4h, v7.4h
+ shl v2.4h, v4.4h, #4 // a
+ sub v2.4h, v2.4h, v3.4h // a - 7 * (b + c) + 16
+ ext v7.16b, v7.16b, v7.16b, #14
+ mov v7.h[0], wzr
+ dup v3.8h, v5.h[0]
+ mul v0.8h, v7.8h, v5.h[0] // 0,1,2,3,4,5,6,7 * b
+ dup v1.8h, v2.h[0] // pix
+ dup v2.8h, v5.h[1] // c
+ shl v3.8h, v3.8h, #3
+ add v1.8h, v1.8h, v0.8h // pix + x*b
+ add v3.8h, v3.8h, v1.8h // pix + x{8-15}*b
+ mov x3, #16
+1:
+ subs x3, x3, #1
+ sqshrun v0.8b, v1.8h, #5
+ add v1.8h, v1.8h, v2.8h
+ sqshrun2 v0.16b, v3.8h, #5
+ add v3.8h, v3.8h, v2.8h
+ st1 {v0.16b}, [x0], x1
+ b.ne 1b
+ ret
+endfunc
diff --git a/common/aarch64/predict-c.c b/common/aarch64/predict-c.c
new file mode 100644
index 0000000..3803b57
--- /dev/null
+++ b/common/aarch64/predict-c.c
@@ -0,0 +1,114 @@
+/*****************************************************************************
+ * predict.c: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "predict.h"
+#include "pixel.h"
+
+void x264_predict_4x4_dc_top_neon( uint8_t *src );
+void x264_predict_4x4_ddr_neon( uint8_t *src );
+void x264_predict_4x4_ddl_neon( uint8_t *src );
+
+void x264_predict_8x8c_dc_top_neon( uint8_t *src );
+void x264_predict_8x8c_dc_left_neon( uint8_t *src );
+void x264_predict_8x8c_p_neon( uint8_t *src );
+
+void x264_predict_8x8_ddl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_ddr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vl_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_vr_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hd_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_hu_neon( uint8_t *src, uint8_t edge[36] );
+
+void x264_predict_16x16_dc_top_neon( uint8_t *src );
+void x264_predict_16x16_dc_left_neon( uint8_t *src );
+void x264_predict_16x16_p_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] )
+{
+#if !HIGH_BIT_DEPTH
+ if (cpu&X264_CPU_ARMV8)
+ {
+ pf[I_PRED_4x4_H] = x264_predict_4x4_h_aarch64;
+ pf[I_PRED_4x4_V] = x264_predict_4x4_v_aarch64;
+ }
+
+ if (cpu&X264_CPU_NEON)
+ {
+ pf[I_PRED_4x4_DC] = x264_predict_4x4_dc_neon;
+ pf[I_PRED_4x4_DC_TOP] = x264_predict_4x4_dc_top_neon;
+ pf[I_PRED_4x4_DDL] = x264_predict_4x4_ddl_neon;
+ pf[I_PRED_4x4_DDR] = x264_predict_4x4_ddr_neon;
+ }
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_CHROMA_DC] = x264_predict_8x8c_dc_neon;
+ pf[I_PRED_CHROMA_DC_TOP] = x264_predict_8x8c_dc_top_neon;
+ pf[I_PRED_CHROMA_DC_LEFT] = x264_predict_8x8c_dc_left_neon;
+ pf[I_PRED_CHROMA_H] = x264_predict_8x8c_h_neon;
+ pf[I_PRED_CHROMA_V] = x264_predict_8x8c_v_neon;
+ pf[I_PRED_CHROMA_P] = x264_predict_8x8c_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_8x8_DDL] = x264_predict_8x8_ddl_neon;
+ pf[I_PRED_8x8_DDR] = x264_predict_8x8_ddr_neon;
+ pf[I_PRED_8x8_VL] = x264_predict_8x8_vl_neon;
+ pf[I_PRED_8x8_VR] = x264_predict_8x8_vr_neon;
+ pf[I_PRED_8x8_DC] = x264_predict_8x8_dc_neon;
+ pf[I_PRED_8x8_H] = x264_predict_8x8_h_neon;
+ pf[I_PRED_8x8_HD] = x264_predict_8x8_hd_neon;
+ pf[I_PRED_8x8_HU] = x264_predict_8x8_hu_neon;
+ pf[I_PRED_8x8_V] = x264_predict_8x8_v_neon;
+#endif // !HIGH_BIT_DEPTH
+}
+
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] )
+{
+ if (!(cpu&X264_CPU_NEON))
+ return;
+
+#if !HIGH_BIT_DEPTH
+ pf[I_PRED_16x16_DC ] = x264_predict_16x16_dc_neon;
+ pf[I_PRED_16x16_DC_TOP] = x264_predict_16x16_dc_top_neon;
+ pf[I_PRED_16x16_DC_LEFT]= x264_predict_16x16_dc_left_neon;
+ pf[I_PRED_16x16_H ] = x264_predict_16x16_h_neon;
+ pf[I_PRED_16x16_V ] = x264_predict_16x16_v_neon;
+ pf[I_PRED_16x16_P ] = x264_predict_16x16_p_neon;
+#endif // !HIGH_BIT_DEPTH
+}
diff --git a/common/aarch64/predict.h b/common/aarch64/predict.h
new file mode 100644
index 0000000..2d26a05
--- /dev/null
+++ b/common/aarch64/predict.h
@@ -0,0 +1,52 @@
+/*****************************************************************************
+ * predict.h: aarch64 intra prediction
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_PREDICT_H
+#define X264_AARCH64_PREDICT_H
+
+void x264_predict_4x4_h_aarch64( uint8_t *src );
+void x264_predict_4x4_v_aarch64( uint8_t *src );
+
+// for the merged 4x4 intra sad/satd functions, which expect a unified suffix
+#define x264_predict_4x4_h_neon x264_predict_4x4_h_aarch64
+#define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
+
+void x264_predict_4x4_dc_neon( uint8_t *src );
+void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8_dc_neon( uint8_t *src, uint8_t edge[36] );
+void x264_predict_8x8c_dc_neon( uint8_t *src );
+void x264_predict_8x8c_h_neon( uint8_t *src );
+void x264_predict_8x8c_v_neon( uint8_t *src );
+void x264_predict_16x16_v_neon( uint8_t *src );
+void x264_predict_16x16_h_neon( uint8_t *src );
+void x264_predict_16x16_dc_neon( uint8_t *src );
+
+void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );
+void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
+void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] );
+void x264_predict_16x16_init_aarch64( int cpu, x264_predict_t pf[7] );
+
+#endif /* X264_AARCH64_PREDICT_H */
diff --git a/common/pixel.c b/common/pixel.c
index 3a8333d..9f603f2 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -40,6 +40,7 @@
#endif
#if ARCH_AARCH64
# include "aarch64/pixel.h"
+# include "aarch64/predict.h"
#endif
@@ -523,14 +524,10 @@ INTRA_MBCMP_8x8(sa8d,, _c )
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
-#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+#if !HIGH_BIT_DEPTH && (HAVE_ARMV6 || ARCH_AARCH64)
INTRA_MBCMP_8x8( sad, _neon, _neon )
INTRA_MBCMP_8x8(sa8d, _neon, _neon )
#endif
-#if !HIGH_BIT_DEPTH && ARCH_AARCH64
-INTRA_MBCMP_8x8( sad, _neon, _c )
-INTRA_MBCMP_8x8(sa8d, _neon, _c )
-#endif
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
@@ -597,14 +594,14 @@ INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
#if !HIGH_BIT_DEPTH && ARCH_AARCH64
-INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c )
-INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c )
-INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _c )
-INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _c )
+INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _neon )
+INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _neon )
+INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon )
+INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon )
INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c )
-INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _c )
-INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _c )
+INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
+INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
#endif
// No C implementation of intra_satd_x9. See checkasm for its behavior,
diff --git a/common/predict.c b/common/predict.c
index cbc018d..f9c4615 100644
--- a/common/predict.c
+++ b/common/predict.c
@@ -40,6 +40,9 @@
#if ARCH_ARM
# include "arm/predict.h"
#endif
+#if ARCH_AARCH64
+# include "aarch64/predict.h"
+#endif
/****************************************************************************
* 16x16 prediction for intra luma block
@@ -899,6 +902,10 @@ void x264_predict_16x16_init( int cpu, x264_predict_t pf[7] )
#if HAVE_ARMV6
x264_predict_16x16_init_arm( cpu, pf );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_16x16_init_aarch64( cpu, pf );
+#endif
}
void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
@@ -923,6 +930,10 @@ void x264_predict_8x8c_init( int cpu, x264_predict_t pf[7] )
#if HAVE_ARMV6
x264_predict_8x8c_init_arm( cpu, pf );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_8x8c_init_aarch64( cpu, pf );
+#endif
}
void x264_predict_8x16c_init( int cpu, x264_predict_t pf[7] )
@@ -963,6 +974,10 @@ void x264_predict_8x8_init( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_
#if HAVE_ARMV6
x264_predict_8x8_init_arm( cpu, pf, predict_filter );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_8x8_init_aarch64( cpu, pf, predict_filter );
+#endif
}
void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
@@ -987,5 +1002,9 @@ void x264_predict_4x4_init( int cpu, x264_predict_t pf[12] )
#if HAVE_ARMV6
x264_predict_4x4_init_arm( cpu, pf );
#endif
+
+#if ARCH_AARCH64
+ x264_predict_4x4_init_aarch64( cpu, pf );
+#endif
}
--
2.0.0