[x264-devel] [PATCH 23/29] Templatize AARCH64 assembly code

Vittorio Giovara vittorio.giovara at gmail.com
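Thu Feb 2 10:05:35 CET 2017

Summary of the changes below:
- asm.S: EXTERN_ASM now has BIT_DEPTH pasted into it, so X(sym) expands to
  x264_<BIT_DEPTH>_sym (with a leading underscore when PREFIX is defined).
  A new X264(sym) macro expands to the plain x264_sym name without the
  bit-depth component. GLUE/JOIN move up so they are defined before use.
- cabac-a.S, quant-a.S: references to cabac_range_lps, cabac_transition,
  decimate_table4 and decimate_table8 switch from X() to X264(), i.e. they
  keep using the non-bit-depth-suffixed symbol names.
- dct.h, mc-c.c, mc.h, pixel.h, predict.h, quant.h: each x264_*_neon /
  x264_*_aarch64 function name is #defined to x264_template(<name>) ahead of
  its prototype.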

---
 common/aarch64/asm.S     | 15 ++++++----
 common/aarch64/cabac-a.S |  4 +--
 common/aarch64/dct.h     | 27 ++++++++++++++++++
 common/aarch64/mc-c.c    | 55 +++++++++++++++++++++++++++++++++++++
 common/aarch64/mc.h      |  1 +
 common/aarch64/pixel.h   | 55 +++++++++++++++++++++++++++++++++++++
 common/aarch64/predict.h | 71 ++++++++++++++++++++++++++++++++++++++++++++++++
 common/aarch64/quant-a.S |  4 +--
 common/aarch64/quant.h   | 21 ++++++++++++++
 9 files changed, 243 insertions(+), 10 deletions(-)

diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index 658a1dd..07f5719 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -27,12 +27,19 @@
 
 #include "config.h"
 
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+
 #ifdef PREFIX
-#   define EXTERN_ASM _x264_
+#   define BASE _x264_
 #else
-#   define EXTERN_ASM x264_
+#   define BASE x264_
 #endif
 
+#define EXTERN_ASM JOIN(JOIN(BASE, BIT_DEPTH), _)
+#define X(s) JOIN(EXTERN_ASM, s)
+#define X264(s) JOIN(BASE, s)
+
 #ifdef __ELF__
 #   define ELF
 #else
@@ -98,10 +105,6 @@ MACH    .const_data
 #endif
 .endm
 
-#define GLUE(a, b) a ## b
-#define JOIN(a, b) GLUE(a, b)
-#define X(s) JOIN(EXTERN_ASM, s)
-
 #define FDEC_STRIDE 32
 #define FENC_STRIDE 16
 
diff --git a/common/aarch64/cabac-a.S b/common/aarch64/cabac-a.S
index 7f8fa84..c05f963 100644
--- a/common/aarch64/cabac-a.S
+++ b/common/aarch64/cabac-a.S
@@ -30,8 +30,8 @@
 // w12 holds x264_cabac_t.i_range
 
 function cabac_encode_decision_asm, export=1
-    movrel      x8,  X(cabac_range_lps)
-    movrel      x9,  X(cabac_transition)
+    movrel      x8,  X264(cabac_range_lps)
+    movrel      x9,  X264(cabac_transition)
     add         w10, w1, #CABAC_STATE
     ldrb        w3,  [x0,  x10]         // i_state
     ldr         w12, [x0,  #CABAC_I_RANGE]
diff --git a/common/aarch64/dct.h b/common/aarch64/dct.h
index 095f4ab..25395cc 100644
--- a/common/aarch64/dct.h
+++ b/common/aarch64/dct.h
@@ -27,13 +27,29 @@
 #ifndef X264_AARCH64_DCT_H
 #define X264_AARCH64_DCT_H
 
+#define x264_dct4x4dc_neon x264_template(dct4x4dc_neon)
 void x264_dct4x4dc_neon( int16_t d[16] );
+#define x264_idct4x4dc_neon x264_template(idct4x4dc_neon)
 void x264_idct4x4dc_neon( int16_t d[16] );
 
+#define x264_sub16x16_dct8_neon x264_template(sub16x16_dct8_neon)
+#define x264_sub16x16_dct_neon x264_template(sub16x16_dct_neon)
+#define x264_sub4x4_dct_neon x264_template(sub4x4_dct_neon)
+#define x264_sub8x16_dct_dc_neon x264_template(sub8x16_dct_dc_neon)
+#define x264_sub8x8_dct8_neon x264_template(sub8x8_dct8_neon)
+#define x264_sub8x8_dct_dc_neon x264_template(sub8x8_dct_dc_neon)
+#define x264_sub8x8_dct_neon x264_template(sub8x8_dct_neon)
 void x264_sub4x4_dct_neon( int16_t dct[16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub8x8_dct_neon( int16_t dct[4][16], uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_neon( int16_t dct[16][16], uint8_t *pix1, uint8_t *pix2 );
 
+#define x264_add16x16_idct8_neon x264_template(add16x16_idct8_neon)
+#define x264_add16x16_idct_dc_neon x264_template(add16x16_idct_dc_neon)
+#define x264_add16x16_idct_neon x264_template(add16x16_idct_neon)
+#define x264_add4x4_idct_neon x264_template(add4x4_idct_neon)
+#define x264_add8x8_idct8_neon x264_template(add8x8_idct8_neon)
+#define x264_add8x8_idct_dc_neon x264_template(add8x8_idct_dc_neon)
+#define x264_add8x8_idct_neon x264_template(add8x8_idct_neon)
 void x264_add4x4_idct_neon( uint8_t *p_dst, int16_t dct[16] );
 void x264_add8x8_idct_neon( uint8_t *p_dst, int16_t dct[4][16] );
 void x264_add16x16_idct_neon( uint8_t *p_dst, int16_t dct[16][16] );
@@ -49,6 +65,17 @@ void x264_sub16x16_dct8_neon( int16_t dct[4][64], uint8_t *pix1, uint8_t *pix2 )
 void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
 void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
 
+#define x264_zigzag_interleave_8x8_cavlc_neon x264_template(zigzag_interleave_8x8_cavlc_neon)
+#define x264_zigzag_scan_4x4_field_neon x264_template(zigzag_scan_4x4_field_neon)
+#define x264_zigzag_scan_4x4_frame_neon x264_template(zigzag_scan_4x4_frame_neon)
+#define x264_zigzag_scan_8x8_field_neon x264_template(zigzag_scan_8x8_field_neon)
+#define x264_zigzag_scan_8x8_frame_neon x264_template(zigzag_scan_8x8_frame_neon)
+#define x264_zigzag_sub_4x4_field_neon x264_template(zigzag_sub_4x4_field_neon)
+#define x264_zigzag_sub_4x4_frame_neon x264_template(zigzag_sub_4x4_frame_neon)
+#define x264_zigzag_sub_4x4ac_field_neon x264_template(zigzag_sub_4x4ac_field_neon)
+#define x264_zigzag_sub_4x4ac_frame_neon x264_template(zigzag_sub_4x4ac_frame_neon)
+#define x264_zigzag_sub_8x8_field_neon x264_template(zigzag_sub_8x8_field_neon)
+#define x264_zigzag_sub_8x8_frame_neon x264_template(zigzag_sub_8x8_frame_neon)
 void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
 void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 2cd548a..d73682a 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -27,13 +27,31 @@
 #include "common/common.h"
 #include "mc.h"
 
+#define x264_prefetch_fenc_420_aarch64 x264_template(prefetch_fenc_420_aarch64)
+#define x264_prefetch_fenc_422_aarch64 x264_template(prefetch_fenc_422_aarch64)
+#define x264_prefetch_ref_aarch64 x264_template(prefetch_ref_aarch64)
 void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
 void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_prefetch_fenc_422_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 
+#define x264_memcpy_aligned_neon x264_template(memcpy_aligned_neon)
+#define x264_memzero_aligned_neon x264_template(memzero_aligned_neon)
 void *x264_memcpy_aligned_neon( void *dst, const void *src, size_t n );
 void x264_memzero_aligned_neon( void *dst, size_t n );
 
+#define x264_pixel_avg2_w16_neon x264_template(pixel_avg2_w16_neon)
+#define x264_pixel_avg2_w20_neon x264_template(pixel_avg2_w20_neon)
+#define x264_pixel_avg2_w4_neon x264_template(pixel_avg2_w4_neon)
+#define x264_pixel_avg2_w8_neon x264_template(pixel_avg2_w8_neon)
+#define x264_pixel_avg_16x16_neon x264_template(pixel_avg_16x16_neon)
+#define x264_pixel_avg_16x8_neon x264_template(pixel_avg_16x8_neon)
+#define x264_pixel_avg_4x16_neon x264_template(pixel_avg_4x16_neon)
+#define x264_pixel_avg_4x2_neon x264_template(pixel_avg_4x2_neon)
+#define x264_pixel_avg_4x4_neon x264_template(pixel_avg_4x4_neon)
+#define x264_pixel_avg_4x8_neon x264_template(pixel_avg_4x8_neon)
+#define x264_pixel_avg_8x16_neon x264_template(pixel_avg_8x16_neon)
+#define x264_pixel_avg_8x4_neon x264_template(pixel_avg_8x4_neon)
+#define x264_pixel_avg_8x8_neon x264_template(pixel_avg_8x8_neon)
 void x264_pixel_avg_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_16x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_pixel_avg_8x16_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, intptr_t, int );
@@ -49,6 +67,11 @@ void x264_pixel_avg2_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t
 void x264_pixel_avg2_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 void x264_pixel_avg2_w20_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, uint8_t *, int );
 
+#define x264_plane_copy_core_neon x264_template(plane_copy_core_neon)
+#define x264_plane_copy_deinterleave_neon x264_template(plane_copy_deinterleave_neon)
+#define x264_plane_copy_deinterleave_rgb_neon x264_template(plane_copy_deinterleave_rgb_neon)
+#define x264_plane_copy_interleave_core_neon x264_template(plane_copy_interleave_core_neon)
+#define x264_plane_copy_swap_core_neon x264_template(plane_copy_swap_core_neon)
 void x264_plane_copy_core_neon( pixel *dst, intptr_t i_dst,
                                 pixel *src, intptr_t i_src, int w, int h );
 void x264_plane_copy_swap_core_neon( pixel *dst, intptr_t i_dst,
@@ -64,10 +87,29 @@ void x264_plane_copy_interleave_core_neon( pixel *dst,  intptr_t i_dst,
                                            pixel *srcu, intptr_t i_srcu,
                                            pixel *srcv, intptr_t i_srcv, int w, int h );
 
+#define x264_store_interleave_chroma_neon x264_template(store_interleave_chroma_neon)
 void x264_store_interleave_chroma_neon( pixel *dst, intptr_t i_dst, pixel *srcu, pixel *srcv, int height );
+#define x264_load_deinterleave_chroma_fdec_neon x264_template(load_deinterleave_chroma_fdec_neon)
 void x264_load_deinterleave_chroma_fdec_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
+#define x264_load_deinterleave_chroma_fenc_neon x264_template(load_deinterleave_chroma_fenc_neon)
 void x264_load_deinterleave_chroma_fenc_neon( pixel *dst, pixel *src, intptr_t i_src, int height );
 
+#define x264_mc_weight_w16_neon x264_template(mc_weight_w16_neon)
+#define x264_mc_weight_w16_nodenom_neon x264_template(mc_weight_w16_nodenom_neon)
+#define x264_mc_weight_w16_offsetadd_neon x264_template(mc_weight_w16_offsetadd_neon)
+#define x264_mc_weight_w16_offsetsub_neon x264_template(mc_weight_w16_offsetsub_neon)
+#define x264_mc_weight_w20_neon x264_template(mc_weight_w20_neon)
+#define x264_mc_weight_w20_nodenom_neon x264_template(mc_weight_w20_nodenom_neon)
+#define x264_mc_weight_w20_offsetadd_neon x264_template(mc_weight_w20_offsetadd_neon)
+#define x264_mc_weight_w20_offsetsub_neon x264_template(mc_weight_w20_offsetsub_neon)
+#define x264_mc_weight_w4_neon x264_template(mc_weight_w4_neon)
+#define x264_mc_weight_w4_nodenom_neon x264_template(mc_weight_w4_nodenom_neon)
+#define x264_mc_weight_w4_offsetadd_neon x264_template(mc_weight_w4_offsetadd_neon)
+#define x264_mc_weight_w4_offsetsub_neon x264_template(mc_weight_w4_offsetsub_neon)
+#define x264_mc_weight_w8_neon x264_template(mc_weight_w8_neon)
+#define x264_mc_weight_w8_nodenom_neon x264_template(mc_weight_w8_nodenom_neon)
+#define x264_mc_weight_w8_offsetadd_neon x264_template(mc_weight_w8_offsetadd_neon)
+#define x264_mc_weight_w8_offsetsub_neon x264_template(mc_weight_w8_offsetsub_neon)
 #define MC_WEIGHT(func)\
 void x264_mc_weight_w20##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
 void x264_mc_weight_w16##func##_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, const x264_weight_t *, int );\
@@ -89,20 +131,32 @@ MC_WEIGHT(_nodenom)
 MC_WEIGHT(_offsetadd)
 MC_WEIGHT(_offsetsub)
 
+#define x264_mc_copy_w16_neon x264_template(mc_copy_w16_neon)
+#define x264_mc_copy_w4_neon x264_template(mc_copy_w4_neon)
+#define x264_mc_copy_w8_neon x264_template(mc_copy_w8_neon)
 void x264_mc_copy_w4_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
 
+#define x264_mc_chroma_neon x264_template(mc_chroma_neon)
 void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+#define x264_integral_init4h_neon x264_template(integral_init4h_neon)
+#define x264_integral_init4v_neon x264_template(integral_init4v_neon)
+#define x264_integral_init8h_neon x264_template(integral_init8h_neon)
+#define x264_integral_init8v_neon x264_template(integral_init8v_neon)
 void x264_integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
 void x264_integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
 void x264_integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
 void x264_integral_init8v_neon( uint16_t *, intptr_t );
+#define x264_frame_init_lowres_core_neon x264_template(frame_init_lowres_core_neon)
 void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
 
+#define x264_mbtree_propagate_cost_neon x264_template(mbtree_propagate_cost_neon)
 void x264_mbtree_propagate_cost_neon( int16_t *, uint16_t *, uint16_t *, uint16_t *, uint16_t *, float *, int );
 
+#define x264_mbtree_fix8_pack_neon x264_template(mbtree_fix8_pack_neon)
 void x264_mbtree_fix8_pack_neon( uint16_t *dst, float *src, int count );
+#define x264_mbtree_fix8_unpack_neon x264_template(mbtree_fix8_unpack_neon)
 void x264_mbtree_fix8_unpack_neon( float *dst, uint16_t *src, int count );
 
 #if !HIGH_BIT_DEPTH
@@ -205,6 +259,7 @@ static uint8_t *get_ref_neon( uint8_t *dst,   intptr_t *i_dst_stride,
     }
 }
 
+#define x264_hpel_filter_neon x264_template(hpel_filter_neon)
 void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             uint8_t *src, intptr_t stride, int width,
                             int height, int16_t *buf );
diff --git a/common/aarch64/mc.h b/common/aarch64/mc.h
index 33c311e..cda7379 100644
--- a/common/aarch64/mc.h
+++ b/common/aarch64/mc.h
@@ -26,6 +26,7 @@
 #ifndef X264_AARCH64_MC_H
 #define X264_AARCH64_MC_H
 
+#define x264_mc_init_aarch64 x264_template(mc_init_aarch64)
 void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf );
 
 #endif
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e..f224f7e 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -27,6 +27,60 @@
 #ifndef X264_AARCH64_PIXEL_H
 #define X264_AARCH64_PIXEL_H
 
+#define x264_pixel_hadamard_ac_16x16_neon x264_template(pixel_hadamard_ac_16x16_neon)
+#define x264_pixel_hadamard_ac_16x8_neon x264_template(pixel_hadamard_ac_16x8_neon)
+#define x264_pixel_hadamard_ac_8x16_neon x264_template(pixel_hadamard_ac_8x16_neon)
+#define x264_pixel_hadamard_ac_8x8_neon x264_template(pixel_hadamard_ac_8x8_neon)
+#define x264_pixel_sa8d_16x16_neon x264_template(pixel_sa8d_16x16_neon)
+#define x264_pixel_sa8d_8x8_neon x264_template(pixel_sa8d_8x8_neon)
+#define x264_pixel_sa8d_satd_16x16_neon x264_template(pixel_sa8d_satd_16x16_neon)
+#define x264_pixel_sad_16x16_neon x264_template(pixel_sad_16x16_neon)
+#define x264_pixel_sad_16x8_neon x264_template(pixel_sad_16x8_neon)
+#define x264_pixel_sad_4x16_neon x264_template(pixel_sad_4x16_neon)
+#define x264_pixel_sad_4x4_neon x264_template(pixel_sad_4x4_neon)
+#define x264_pixel_sad_4x8_neon x264_template(pixel_sad_4x8_neon)
+#define x264_pixel_sad_8x16_neon x264_template(pixel_sad_8x16_neon)
+#define x264_pixel_sad_8x4_neon x264_template(pixel_sad_8x4_neon)
+#define x264_pixel_sad_8x8_neon x264_template(pixel_sad_8x8_neon)
+#define x264_pixel_sad_x3_16x16_neon x264_template(pixel_sad_x3_16x16_neon)
+#define x264_pixel_sad_x3_16x8_neon x264_template(pixel_sad_x3_16x8_neon)
+#define x264_pixel_sad_x3_4x4_neon x264_template(pixel_sad_x3_4x4_neon)
+#define x264_pixel_sad_x3_4x8_neon x264_template(pixel_sad_x3_4x8_neon)
+#define x264_pixel_sad_x3_8x16_neon x264_template(pixel_sad_x3_8x16_neon)
+#define x264_pixel_sad_x3_8x4_neon x264_template(pixel_sad_x3_8x4_neon)
+#define x264_pixel_sad_x3_8x8_neon x264_template(pixel_sad_x3_8x8_neon)
+#define x264_pixel_sad_x4_16x16_neon x264_template(pixel_sad_x4_16x16_neon)
+#define x264_pixel_sad_x4_16x8_neon x264_template(pixel_sad_x4_16x8_neon)
+#define x264_pixel_sad_x4_4x4_neon x264_template(pixel_sad_x4_4x4_neon)
+#define x264_pixel_sad_x4_4x8_neon x264_template(pixel_sad_x4_4x8_neon)
+#define x264_pixel_sad_x4_8x16_neon x264_template(pixel_sad_x4_8x16_neon)
+#define x264_pixel_sad_x4_8x4_neon x264_template(pixel_sad_x4_8x4_neon)
+#define x264_pixel_sad_x4_8x8_neon x264_template(pixel_sad_x4_8x8_neon)
+#define x264_pixel_satd_16x16_neon x264_template(pixel_satd_16x16_neon)
+#define x264_pixel_satd_16x8_neon x264_template(pixel_satd_16x8_neon)
+#define x264_pixel_satd_4x16_neon x264_template(pixel_satd_4x16_neon)
+#define x264_pixel_satd_4x4_neon x264_template(pixel_satd_4x4_neon)
+#define x264_pixel_satd_4x8_neon x264_template(pixel_satd_4x8_neon)
+#define x264_pixel_satd_8x16_neon x264_template(pixel_satd_8x16_neon)
+#define x264_pixel_satd_8x4_neon x264_template(pixel_satd_8x4_neon)
+#define x264_pixel_satd_8x8_neon x264_template(pixel_satd_8x8_neon)
+#define x264_pixel_ssd_16x16_neon x264_template(pixel_ssd_16x16_neon)
+#define x264_pixel_ssd_16x8_neon x264_template(pixel_ssd_16x8_neon)
+#define x264_pixel_ssd_4x16_neon x264_template(pixel_ssd_4x16_neon)
+#define x264_pixel_ssd_4x4_neon x264_template(pixel_ssd_4x4_neon)
+#define x264_pixel_ssd_4x8_neon x264_template(pixel_ssd_4x8_neon)
+#define x264_pixel_ssd_8x16_neon x264_template(pixel_ssd_8x16_neon)
+#define x264_pixel_ssd_8x4_neon x264_template(pixel_ssd_8x4_neon)
+#define x264_pixel_ssd_8x8_neon x264_template(pixel_ssd_8x8_neon)
+#define x264_pixel_ssd_nv12_core_neon x264_template(pixel_ssd_nv12_core_neon)
+#define x264_pixel_ssim_4x4x2_core_neon x264_template(pixel_ssim_4x4x2_core_neon)
+#define x264_pixel_ssim_end4_neon x264_template(pixel_ssim_end4_neon)
+#define x264_pixel_var2_8x16_neon x264_template(pixel_var2_8x16_neon)
+#define x264_pixel_var2_8x8_neon x264_template(pixel_var2_8x8_neon)
+#define x264_pixel_var_16x16_neon x264_template(pixel_var_16x16_neon)
+#define x264_pixel_var_8x16_neon x264_template(pixel_var_8x16_neon)
+#define x264_pixel_var_8x8_neon x264_template(pixel_var_8x8_neon)
+#define x264_pixel_vsad_neon x264_template(pixel_vsad_neon)
 #define DECL_PIXELS( ret, name, suffix, args ) \
     ret x264_pixel_##name##_16x16_##suffix args;\
     ret x264_pixel_##name##_16x8_##suffix args;\
@@ -74,6 +128,7 @@ void x264_pixel_ssim_4x4x2_core_neon( const uint8_t *, intptr_t,
                                       int sums[2][4] );
 float x264_pixel_ssim_end4_neon( int sum0[5][4], int sum1[5][4], int width );
 
+#define x264_pixel_asd8_neon x264_template(pixel_asd8_neon)
 int x264_pixel_asd8_neon( uint8_t *, intptr_t,  uint8_t *, intptr_t, int );
 
 #endif
diff --git a/common/aarch64/predict.h b/common/aarch64/predict.h
index a8beada..d7866d3 100644
--- a/common/aarch64/predict.h
+++ b/common/aarch64/predict.h
@@ -27,6 +27,39 @@
 #ifndef X264_AARCH64_PREDICT_H
 #define X264_AARCH64_PREDICT_H
 
+#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
+#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
+#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
+#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
+#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
+#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
+#define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
+#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
+#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
+#define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
+#define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
+#define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
+#define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
+#define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
+#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
+#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
+#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
+#define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
+#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
+#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
+#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
+#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
+#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
+#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
+#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
+#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
+#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
+#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
+#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
+#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
+#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
+#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
+#define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
 void x264_predict_4x4_h_aarch64( uint8_t *src );
 void x264_predict_4x4_v_aarch64( uint8_t *src );
 void x264_predict_8x8c_v_aarch64( uint8_t *src );
@@ -36,6 +69,39 @@ void x264_predict_8x8c_v_aarch64( uint8_t *src );
 #define x264_predict_4x4_v_neon x264_predict_4x4_v_aarch64
 #define x264_predict_8x8c_v_neon x264_predict_8x8c_v_aarch64
 
+#define x264_predict_16x16_dc_left_neon x264_template(predict_16x16_dc_left_neon)
+#define x264_predict_16x16_dc_neon x264_template(predict_16x16_dc_neon)
+#define x264_predict_16x16_dc_top_neon x264_template(predict_16x16_dc_top_neon)
+#define x264_predict_16x16_h_neon x264_template(predict_16x16_h_neon)
+#define x264_predict_16x16_p_neon x264_template(predict_16x16_p_neon)
+#define x264_predict_16x16_v_neon x264_template(predict_16x16_v_neon)
+#define x264_predict_4x4_dc_neon x264_template(predict_4x4_dc_neon)
+#define x264_predict_4x4_dc_top_neon x264_template(predict_4x4_dc_top_neon)
+#define x264_predict_4x4_ddl_neon x264_template(predict_4x4_ddl_neon)
+#define x264_predict_4x4_ddr_neon x264_template(predict_4x4_ddr_neon)
+#define x264_predict_4x4_h_aarch64 x264_template(predict_4x4_h_aarch64)
+#define x264_predict_4x4_v_aarch64 x264_template(predict_4x4_v_aarch64)
+#define x264_predict_8x16c_dc_left_neon x264_template(predict_8x16c_dc_left_neon)
+#define x264_predict_8x16c_dc_neon x264_template(predict_8x16c_dc_neon)
+#define x264_predict_8x16c_dc_top_neon x264_template(predict_8x16c_dc_top_neon)
+#define x264_predict_8x16c_h_neon x264_template(predict_8x16c_h_neon)
+#define x264_predict_8x16c_p_neon x264_template(predict_8x16c_p_neon)
+#define x264_predict_8x16c_v_neon x264_template(predict_8x16c_v_neon)
+#define x264_predict_8x8_dc_neon x264_template(predict_8x8_dc_neon)
+#define x264_predict_8x8_ddl_neon x264_template(predict_8x8_ddl_neon)
+#define x264_predict_8x8_ddr_neon x264_template(predict_8x8_ddr_neon)
+#define x264_predict_8x8_h_neon x264_template(predict_8x8_h_neon)
+#define x264_predict_8x8_hd_neon x264_template(predict_8x8_hd_neon)
+#define x264_predict_8x8_hu_neon x264_template(predict_8x8_hu_neon)
+#define x264_predict_8x8_v_neon x264_template(predict_8x8_v_neon)
+#define x264_predict_8x8_vl_neon x264_template(predict_8x8_vl_neon)
+#define x264_predict_8x8_vr_neon x264_template(predict_8x8_vr_neon)
+#define x264_predict_8x8c_dc_left_neon x264_template(predict_8x8c_dc_left_neon)
+#define x264_predict_8x8c_dc_neon x264_template(predict_8x8c_dc_neon)
+#define x264_predict_8x8c_dc_top_neon x264_template(predict_8x8c_dc_top_neon)
+#define x264_predict_8x8c_h_neon x264_template(predict_8x8c_h_neon)
+#define x264_predict_8x8c_p_neon x264_template(predict_8x8c_p_neon)
+#define x264_predict_8x8c_v_aarch64 x264_template(predict_8x8c_v_aarch64)
 void x264_predict_4x4_dc_neon( uint8_t *src );
 void x264_predict_8x8_v_neon( uint8_t *src, uint8_t edge[36] );
 void x264_predict_8x8_h_neon( uint8_t *src, uint8_t edge[36] );
@@ -49,6 +115,11 @@ void x264_predict_16x16_v_neon( uint8_t *src );
 void x264_predict_16x16_h_neon( uint8_t *src );
 void x264_predict_16x16_dc_neon( uint8_t *src );
 
+#define x264_predict_4x4_init_aarch64 x264_template(predict_4x4_init_aarch64)
+#define x264_predict_8x8_init_aarch64 x264_template(predict_8x8_init_aarch64)
+#define x264_predict_8x8c_init_aarch64 x264_template(predict_8x8c_init_aarch64)
+#define x264_predict_8x16c_init_aarch64 x264_template(predict_8x16c_init_aarch64)
+#define x264_predict_16x16_init_aarch64 x264_template(predict_16x16_init_aarch64)
 void x264_predict_4x4_init_aarch64( int cpu, x264_predict_t pf[12] );
 void x264_predict_8x8_init_aarch64( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
 void x264_predict_8x8c_init_aarch64( int cpu, x264_predict_t pf[7] );
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
index 865e896..f94a0a0 100644
--- a/common/aarch64/quant-a.S
+++ b/common/aarch64/quant-a.S
@@ -305,7 +305,7 @@ endfunc
 .macro decimate_score_1x size
 function decimate_score\size\()_neon, export=1
     ld1        {v0.8h,v1.8h}, [x0]
-    movrel      x5,  X(decimate_table4)
+    movrel      x5,  X264(decimate_table4)
     movi        v3.16b, #0x01
     sqxtn       v0.8b,  v0.8h
     sqxtn2      v0.16b, v1.8h
@@ -391,7 +391,7 @@ function decimate_score64_neon, export=1
     mvn         x1,  x1
     mov         w0,  #0
     cbz         x1,  0f
-    movrel      x5,  X(decimate_table8)
+    movrel      x5,  X264(decimate_table8)
 1:
     clz         x3,  x1
     lsl         x1,  x1,  x3
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
index e0133e7..d885344 100644
--- a/common/aarch64/quant.h
+++ b/common/aarch64/quant.h
@@ -29,31 +29,52 @@
 
 int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
 
+#define x264_quant_2x2_dc_neon x264_template(quant_2x2_dc_neon)
+#define x264_quant_4x4_dc_neon x264_template(quant_4x4_dc_neon)
+#define x264_quant_4x4_neon x264_template(quant_4x4_neon)
+#define x264_quant_4x4x4_neon x264_template(quant_4x4x4_neon)
+#define x264_quant_8x8_neon x264_template(quant_8x8_neon)
 int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
 int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
 int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
 int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
 
+#define x264_dequant_4x4_dc_neon x264_template(dequant_4x4_dc_neon)
+#define x264_dequant_4x4_neon x264_template(dequant_4x4_neon)
+#define x264_dequant_8x8_neon x264_template(dequant_8x8_neon)
 void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
 void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
 
+#define x264_decimate_score15_neon x264_template(decimate_score15_neon)
+#define x264_decimate_score16_neon x264_template(decimate_score16_neon)
+#define x264_decimate_score64_neon x264_template(decimate_score64_neon)
 int x264_decimate_score15_neon( int16_t * );
 int x264_decimate_score16_neon( int16_t * );
 int x264_decimate_score64_neon( int16_t * );
 
+#define x264_coeff_last15_neon x264_template(coeff_last15_neon)
+#define x264_coeff_last16_neon x264_template(coeff_last16_neon)
+#define x264_coeff_last4_aarch64 x264_template(coeff_last4_aarch64)
+#define x264_coeff_last64_neon x264_template(coeff_last64_neon)
+#define x264_coeff_last8_aarch64 x264_template(coeff_last8_aarch64)
 int x264_coeff_last4_aarch64( int16_t * );
 int x264_coeff_last8_aarch64( int16_t * );
 int x264_coeff_last15_neon( int16_t * );
 int x264_coeff_last16_neon( int16_t * );
 int x264_coeff_last64_neon( int16_t * );
 
+#define x264_coeff_level_run15_neon x264_template(coeff_level_run15_neon)
+#define x264_coeff_level_run16_neon x264_template(coeff_level_run16_neon)
+#define x264_coeff_level_run4_aarch64 x264_template(coeff_level_run4_aarch64)
+#define x264_coeff_level_run8_neon x264_template(coeff_level_run8_neon)
 int x264_coeff_level_run4_aarch64( int16_t *, x264_run_level_t * );
 int x264_coeff_level_run8_neon( int16_t *, x264_run_level_t * );
 int x264_coeff_level_run15_neon( int16_t *, x264_run_level_t * );
 int x264_coeff_level_run16_neon( int16_t *, x264_run_level_t * );
 
+#define x264_denoise_dct_neon x264_template(denoise_dct_neon)
 void x264_denoise_dct_neon( dctcoef *, uint32_t *, udctcoef *, int );
 
 #endif
-- 
2.10.0



More information about the x264-devel mailing list