[x264-devel] [PATCH 25/32] Enable assembly templating for arm/aarch64 architectures

Vittorio Giovara vittorio.giovara at gmail.com
Fri Jan 20 15:20:50 CET 2017


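Use the EXTERN_ASM prefix to rename all internal asm symbols: the prefix
now embeds BIT_DEPTH (e.g. x264_8_ / x264_10_), and every arm/aarch64 .S
source is assembled once per bit depth into 8bit/ and 10bit/ objects.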
---
 Makefile               | 24 ++++++++++++++++++++----
 common/aarch64/asm.S   | 13 +++++++------
 common/arm/asm.S       | 13 +++++++------
 tools/asm.list         | 31 +++++++++++++++++++++++++++++++
 tools/duplicate-asm.sh |  3 ++-
 5 files changed, 67 insertions(+), 17 deletions(-)

diff --git a/Makefile b/Makefile
index 4ce4c60..908599c 100644
--- a/Makefile
+++ b/Makefile
@@ -128,7 +128,10 @@ ASMSRC += common/arm/cpu-a.S common/arm/pixel-a.S common/arm/mc-a.S \
           common/arm/dct-a.S common/arm/quant-a.S common/arm/deblock-a.S \
           common/arm/predict-a.S common/arm/bitstream-a.S
 SRCS   += common/arm/mc-c.c common/arm/predict-c.c
-OBJASM  = $(ASMSRC:%.S=%.o)
+
+OBJASM += $(ASMSRC:%.S=8bit/%.o)
+OBJASM += $(ASMSRC:%.S=10bit/%.o)
+
 OBJCHK += tools/checkasm-arm.o
 endif
 endif
@@ -147,7 +150,10 @@ ASMSRC += common/aarch64/bitstream-a.S \
 SRCS   += common/aarch64/asm-offsets.c \
           common/aarch64/mc-c.c        \
           common/aarch64/predict-c.c
-OBJASM  = $(ASMSRC:%.S=%.o)
+
+OBJASM += $(ASMSRC:%.S=8bit/%.o)
+OBJASM += $(ASMSRC:%.S=10bit/%.o)
+
 OBJCHK += tools/checkasm-aarch64.o
 endif
 endif
@@ -241,12 +247,12 @@ $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
 	$(AS) $(ASFLAGS) -o $@ $<
 	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
 
-8bit/%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
+8bit/%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm common/bitdepth.h common/bitdepth-asm.h
 	@mkdir -p $(dir $@)
 	$(AS) $(ASFLAGS) -o $@ $< -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8 -Dprivate_prefix=x264_8
 	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
 
-10bit/%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm
+10bit/%.o: %.asm common/x86/x86inc.asm common/x86/x86util.asm common/bitdepth.h common/bitdepth-asm.h
 	@mkdir -p $(dir $@)
 	$(AS) $(ASFLAGS) -o $@ $< -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10 -Dprivate_prefix=x264_10
 	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
@@ -255,6 +261,16 @@ $(OBJS) $(OBJASM) $(OBJSO) $(OBJCLI) $(OBJCHK) $(OBJEXAMPLE): .depend
 	$(AS) $(ASFLAGS) -o $@ $<
 	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
 
+8bit/%.o: %.S
+	@mkdir -p $(dir $@)
+	$(AS) $(ASFLAGS) -o $@ $< -DHIGH_BIT_DEPTH=0 -DBIT_DEPTH=8
+	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
+
+10bit/%.o: %.S
+	@mkdir -p $(dir $@)
+	$(AS) $(ASFLAGS) -o $@ $< -DHIGH_BIT_DEPTH=1 -DBIT_DEPTH=10
+	-@ $(if $(STRIP), $(STRIP) -x $@) # delete local/anonymous symbols, so they don't show up in oprofile
+
 %.dll.o: %.rc x264.h
 	$(RC) $(RCFLAGS)$@ -DDLL $<
 
diff --git a/common/aarch64/asm.S b/common/aarch64/asm.S
index fff572a..6dd5a3a 100644
--- a/common/aarch64/asm.S
+++ b/common/aarch64/asm.S
@@ -27,12 +27,17 @@
 
 #include "config.h"
 
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+
 #ifdef PREFIX
-#   define EXTERN_ASM _x264_
+#   define EXTERN_ASM JOIN(JOIN(_x264_, BIT_DEPTH), _)
 #else
-#   define EXTERN_ASM x264_
+#   define EXTERN_ASM JOIN(JOIN(x264_, BIT_DEPTH), _)
 #endif
 
+#define X(s) JOIN(EXTERN_ASM, s)
+
 #ifdef __ELF__
 #   define ELF
 #else
@@ -94,10 +99,6 @@ MACH    .const_data
 #endif
 .endm
 
-#define GLUE(a, b) a ## b
-#define JOIN(a, b) GLUE(a, b)
-#define X(s) JOIN(EXTERN_ASM, s)
-
 #define FDEC_STRIDE 32
 #define FENC_STRIDE 16
 
diff --git a/common/arm/asm.S b/common/arm/asm.S
index dbb3168..7e486b8 100644
--- a/common/arm/asm.S
+++ b/common/arm/asm.S
@@ -38,12 +38,17 @@
 
 .fpu neon
 
+#define GLUE(a, b) a ## b
+#define JOIN(a, b) GLUE(a, b)
+
 #ifdef PREFIX
-#   define EXTERN_ASM _x264_
+#   define EXTERN_ASM JOIN(JOIN(_x264_, BIT_DEPTH), _)
 #else
-#   define EXTERN_ASM x264_
+#   define EXTERN_ASM JOIN(JOIN(x264_, BIT_DEPTH), _)
 #endif
 
+#define X(s) JOIN(EXTERN_ASM, s)
+
 #ifdef __ELF__
 #   define ELF
 #else
@@ -164,10 +169,6 @@ ELF     .size   \name, . - \name
 #endif
 .endm
 
-#define GLUE(a, b) a ## b
-#define JOIN(a, b) GLUE(a, b)
-#define X(s) JOIN(EXTERN_ASM, s)
-
 #define FENC_STRIDE 16
 #define FDEC_STRIDE 32
 
diff --git a/tools/asm.list b/tools/asm.list
index b6959a5..7787bdd 100644
--- a/tools/asm.list
+++ b/tools/asm.list
@@ -40,6 +40,7 @@ decimate_score16
 decimate_score64
 denoise_dct
 dequant_4x4
+dequant_4x4_dc
 dequant_4x4_flat16
 dequant_4x4dc
 dequant_8x8
@@ -94,9 +95,21 @@ mc_offsetsub_w4
 mc_offsetsub_w8
 mc_weight_w12
 mc_weight_w16
+mc_weight_w16_nodenom
+mc_weight_w16_offsetadd
+mc_weight_w16_offsetsub
 mc_weight_w20
+mc_weight_w20_nodenom
+mc_weight_w20_offsetadd
+mc_weight_w20_offsetsub
 mc_weight_w4
+mc_weight_w4_nodenom
+mc_weight_w4_offsetadd
+mc_weight_w4_offsetsub
 mc_weight_w8
+mc_weight_w8_nodenom
+mc_weight_w8_offsetadd
+mc_weight_w8_offsetsub
 memcpy_aligned
 memzero_aligned
 nal_escape
@@ -137,6 +150,14 @@ pixel_sad_4x8
 pixel_sad_8x16
 pixel_sad_8x4
 pixel_sad_8x8
+pixel_sad_aligned
+pixel_sad_aligned_16x16
+pixel_sad_aligned_16x8
+pixel_sad_aligned_4x4
+pixel_sad_aligned_4x8
+pixel_sad_aligned_8x16
+pixel_sad_aligned_8x4
+pixel_sad_aligned_8x8
 pixel_sad_x3_16x16
 pixel_sad_x3_16x8
 pixel_sad_x3_4x4
@@ -187,21 +208,26 @@ predict_16x16_dc_left
 predict_16x16_dc_top
 predict_16x16_h
 predict_16x16_init
+predict_16x16_p
 predict_16x16_p_core
 predict_16x16_v
 predict_4x4_dc
+predict_4x4_dc_top
 predict_4x4_ddl
 predict_4x4_ddr
 predict_4x4_h
 predict_4x4_hd
 predict_4x4_hu
 predict_4x4_init
+predict_4x4_v
 predict_4x4_vl
 predict_4x4_vr
 predict_8x16c_dc
+predict_8x16c_dc_left
 predict_8x16c_dc_top
 predict_8x16c_h
 predict_8x16c_init
+predict_8x16c_p
 predict_8x16c_p_core
 predict_8x16c_v
 predict_8x8_dc
@@ -218,11 +244,14 @@ predict_8x8_v
 predict_8x8_vl
 predict_8x8_vr
 predict_8x8c_dc
+predict_8x8c_dc_left
 predict_8x8c_dc_top
 predict_8x8c_h
 predict_8x8c_init
+predict_8x8c_p
 predict_8x8c_p_core
 predict_8x8c_v
+prefetch_fenc
 prefetch_fenc_420
 prefetch_fenc_422
 prefetch_ref
@@ -254,3 +283,5 @@ zigzag_sub_4x4_field
 zigzag_sub_4x4_frame
 zigzag_sub_4x4ac_field
 zigzag_sub_4x4ac_frame
+zigzag_sub_8x8_field
+zigzag_sub_8x8_frame
diff --git a/tools/duplicate-asm.sh b/tools/duplicate-asm.sh
index 81c3ed6..60b3ac9 100755
--- a/tools/duplicate-asm.sh
+++ b/tools/duplicate-asm.sh
@@ -13,7 +13,8 @@ ARCH_LIST="sse2_amd sse2_aligned sse2_lzcnt sse2slow sse2 \
            ssse3_atom ssse3_aligned ssse3_cache64 ssse3_lzcnt ssse3 sse3 sse4 sse \
            avx2_bmi2 avx2_lzcnt avx2 avx \
            mmx2_lzcnt mmx2 mmx \
-           asm atom fma4 xop"
+           asm atom fma4 xop \
+           neon_dual neon armv6 arm aarch64"
 
 for var in $API_LIST; do
     for arch in $ARCH_LIST; do
-- 
2.10.0



More information about the x264-devel mailing list