[x265] [PATCH 01/12] AArch64: Fix costCoeffNxN test on Apple Silicon

Hari Limaye hari.limaye at arm.com
Thu May 9 14:01:20 UTC 2024


Hi Santhoshini,

Thank you for clarifying - I am attaching a patch file as requested.

This patch file contains all 12 patches that have been submitted to the mailing list thus far. It can be applied using `git am aarch64_initial_refactoring.patch` to preserve the commit history.

Many thanks,

Hari
-------------- next part --------------
From a7522afc35dbf173b6fe7def18c73918c4d7df87 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 2 Oct 2023 20:54:57 +0100
Subject: [PATCH 01/12] AArch64: Fix costCoeffNxN test on Apple Silicon

The assembly routine x265_costCoeffNxN_neon produces an incorrect result on
Apple Silicon, causing the pixel testbench to fail on these platforms.

x265_costCoeffNxN_neon assumes that the parameter `int subPosBase`, the
second parameter of type `int` passed on the stack, is at position `sp + 8`.
This assumption is consistent with AAPCS64, as stack arguments smaller than
8 bytes are widened to 8 bytes (AAPCS64 6.8.2, rule C.16). However, Apple's
arm64 ABI diverges from AAPCS64 here: 'Function arguments may consume slots
on the stack that are not multiples of 8 bytes', so on Apple platforms
`subPosBase` is actually at `sp + 4`. Fix this by introducing a
STACK_ARG_OFFSET macro that computes stack argument offsets from the
platform's stack argument alignment.
---
 source/common/aarch64/asm.S        | 12 +++++++++++-
 source/common/aarch64/pixel-util.S |  4 ++--
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/source/common/aarch64/asm.S b/source/common/aarch64/asm.S
index ce0668103..742978631 100644
--- a/source/common/aarch64/asm.S
+++ b/source/common/aarch64/asm.S
@@ -72,6 +72,16 @@
 
 #define PFX_C(name)        JOIN(JOIN(JOIN(EXTERN_ASM, X265_NS), _), name)
 
+// Alignment of stack arguments of size less than 8 bytes.
+#ifdef __APPLE__
+#define STACK_ARG_ALIGNMENT 4
+#else
+#define STACK_ARG_ALIGNMENT 8
+#endif
+
+// Get offset from SP of stack argument at index `idx`.
+#define STACK_ARG_OFFSET(idx) (idx * STACK_ARG_ALIGNMENT)
+
 #ifdef __APPLE__
 .macro endfunc
 ELF .size \name, . - \name
@@ -184,4 +194,4 @@ ELF     .size   \name, . - \name
     vtrn            \t3, \t4, \s3, \s4
 .endm
 
-#endif
\ No newline at end of file
+#endif
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 9b3c11504..378c6891c 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -2311,7 +2311,7 @@ endfunc
 //    uint8_t *baseCtx,      // x6
 //    int offset,            // x7
 //    int scanPosSigOff,     // sp
-//    int subPosBase)        // sp + 8
+//    int subPosBase)        // sp + 8, or sp + 4 on APPLE
 function PFX(costCoeffNxN_neon)
     // abs(coeff)
     add             x2, x2, x2
@@ -2410,7 +2410,7 @@ function PFX(costCoeffNxN_neon)
     add             x4, x4, x15
     str             h2, [x13]              // absCoeff[numNonZero] = tmpCoeff[blkPos]
 
-    ldr             x9, [sp, #8]           // subPosBase
+    ldr             x9, [sp, #STACK_ARG_OFFSET(1)]           // subPosBase
     uxth            w9, w9
     cmp             w9, #0
     cset            x2, eq
-- 
2.42.1


From dc1416f3e2647ee1d8797c223d1ef89c4098dd70 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 25 Oct 2023 11:50:36 +0100
Subject: [PATCH 02/12] Arm: Use local labels for assembly routine loops

Rename loop labels in the Arm and AArch64 assembly files to start with
`.Loop` instead of `.loop`, since the GNU assembler only treats symbols
beginning with `.L` as local labels and keeps them out of the symbol table.
This improves the output of tools like perf and gdb, as code under loop
labels is now correctly attributed to its containing routine instead of
showing up under a separate `.loop_*` symbol.
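A minimal before/after sketch (the function and label names here are
illustrative, not taken from the patch):

    // Before: .loop_foo is an ordinary symbol, so profilers can attribute
    // the loop body to a separate .loop_foo entry.
    function PFX(foo_neon)
        mov             w12, #4
    .loop_foo:
        sub             w12, w12, #1
        cbnz            w12, .loop_foo
        ret
    endfunc

    // After: .Loop_foo starts with `.L`, so GNU as treats it as a local
    // label; samples inside the loop are attributed to x265_foo_neon.
    function PFX(foo_neon)
        mov             w12, #4
    .Loop_foo:
        sub             w12, w12, #1
        cbnz            w12, .Loop_foo
        ret
    endfunc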
---
 source/common/aarch64/blockcopy8-sve.S  |  60 +++++------
 source/common/aarch64/blockcopy8.S      | 116 ++++++++++-----------
 source/common/aarch64/ipfilter-common.S | 104 +++++++++----------
 source/common/aarch64/ipfilter-sve2.S   |  56 +++++-----
 source/common/aarch64/ipfilter.S        |  12 +--
 source/common/aarch64/mc-a-sve2.S       |  88 ++++++++--------
 source/common/aarch64/mc-a.S            |  32 +++---
 source/common/aarch64/p2s-sve.S         |  12 +--
 source/common/aarch64/p2s.S             |  12 +--
 source/common/aarch64/pixel-util-sve.S  |   4 +-
 source/common/aarch64/pixel-util-sve2.S |  56 +++++-----
 source/common/aarch64/pixel-util.S      | 132 ++++++++++++------------
 source/common/aarch64/sad-a-sve2.S      |  16 +--
 source/common/aarch64/sad-a.S           |   8 +-
 source/common/aarch64/ssd-a-sve2.S      |  16 +--
 source/common/aarch64/ssd-a.S           |  32 +++---
 source/common/arm/blockcopy8.S          |   4 +-
 source/common/arm/dct-a.S               |   8 +-
 source/common/arm/ipfilter8.S           | 108 +++++++++----------
 source/common/arm/mc-a.S                |   8 +-
 source/common/arm/pixel-util.S          |  28 ++---
 source/common/arm/sad-a.S               |  36 +++----
 source/common/arm/ssd-a.S               |  28 ++---
 23 files changed, 488 insertions(+), 488 deletions(-)

diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 846927909..d5664af58 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -112,7 +112,7 @@ function PFX(blockcopy_sp_32x32_sve)
     lsl             x3, x3, #1
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp32_sve:
+.Loop_csp32_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], x3
@@ -124,7 +124,7 @@ function PFX(blockcopy_sp_32x32_sve)
     st1             {v0.16b-v1.16b}, [x0], x1
     st1             {v2.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp32_sve
+    cbnz            w12, .Loop_csp32_sve
     ret
 .vl_gt_16_blockcopy_sp_32_32:
     cmp             x9, #48
@@ -199,7 +199,7 @@ function PFX(blockcopy_ps_32x32_sve)
     bgt             .vl_gt_16_blockcopy_ps_32_32
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_cps32_sve:
+.Loop_cps32_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -215,7 +215,7 @@ function PFX(blockcopy_ps_32x32_sve)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32_sve
+    cbnz            w12, .Loop_cps32_sve
     ret
 .vl_gt_16_blockcopy_ps_32_32:
     cmp             x9, #48
@@ -248,7 +248,7 @@ function PFX(blockcopy_ps_64x64_sve)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_cps64_sve:
+.Loop_cps64_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x2], x3
@@ -263,7 +263,7 @@ function PFX(blockcopy_ps_64x64_sve)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps64_sve
+    cbnz            w12, .Loop_cps64_sve
     ret
 .vl_gt_16_blockcopy_ps_64_64:
     cmp             x9, #48
@@ -338,13 +338,13 @@ function PFX(blockcopy_ss_32x32_sve)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #4
-.loop_css32_sve:
+.Loop_css32_sve:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32_sve
+    cbnz            w12, .Loop_css32_sve
     ret
 .vl_gt_16_blockcopy_ss_32_32:
     cmp             x9, #48
@@ -379,7 +379,7 @@ function PFX(blockcopy_ss_64x64_sve)
     lsl             x3, x3, #1
     sub             x3, x3, #64
     mov             w12, #8
-.loop_css64_sve:
+.Loop_css64_sve:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], #64
@@ -387,7 +387,7 @@ function PFX(blockcopy_ss_64x64_sve)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css64_sve
+    cbnz            w12, .Loop_css64_sve
     ret
 .vl_gt_16_blockcopy_ss_64_64:
     cmp             x9, #48
@@ -474,13 +474,13 @@ function PFX(blockcopy_ss_32x64_sve)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #8
-.loop_css32x64_sve:
+.Loop_css32x64_sve:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32x64_sve
+    cbnz            w12, .Loop_css32x64_sve
     ret
 .vl_gt_16_blockcopy_ss_32_64:
     cmp             x9, #48
@@ -570,7 +570,7 @@ function PFX(blockcopy_ps_32x64_sve)
     bgt             .vl_gt_16_blockcopy_ps_32_64
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_cps32x64_sve:
+.Loop_cps32x64_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -586,7 +586,7 @@ function PFX(blockcopy_ps_32x64_sve)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32x64_sve
+    cbnz            w12, .Loop_cps32x64_sve
     ret
 .vl_gt_16_blockcopy_ps_32_64:
     cmp             x9, #48
@@ -730,13 +730,13 @@ function PFX(blockcopy_pp_32x\h\()_sve)
     rdvl            x9, #1
     cmp             x9, #16
     bgt             .vl_gt_16_blockcopy_pp_32xN_\h
-.loop_sve_32x\h\():
+.Loop_sve_32x\h\():
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b-v1.16b}, [x2], x3
     st1             {v0.16b-v1.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_sve_32x\h
+    cbnz            w12, .Loop_sve_32x\h
     ret
 .vl_gt_16_blockcopy_pp_32xN_\h:
     ptrue           p0.b, vl32
@@ -765,13 +765,13 @@ function PFX(blockcopy_pp_64x\h\()_sve)
     rdvl            x9, #1
     cmp             x9, #16
     bgt             .vl_gt_16_blockcopy_pp_64xN_\h
-.loop_sve_64x\h\():
+.Loop_sve_64x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x3
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_sve_64x\h
+    cbnz            w12, .Loop_sve_64x\h
     ret
 .vl_gt_16_blockcopy_pp_64xN_\h:
     cmp             x9, #48
@@ -856,7 +856,7 @@ function PFX(cpy2Dto1D_shl_16x16_sve)
     bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
     cpy2Dto1D_shl_start_sve
     mov             w12, #4
-.loop_cpy2Dto1D_shl_16_sve:
+.Loop_cpy2Dto1D_shl_16_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.16b-v3.16b}, [x1], x2
@@ -864,7 +864,7 @@ function PFX(cpy2Dto1D_shl_16x16_sve)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.16b-v3.16b}, [x0], #32
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_16_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shl_16_sve
     ret
 .vl_gt_16_cpy2Dto1D_shl_16x16:
     ptrue           p0.h, vl16
@@ -885,7 +885,7 @@ function PFX(cpy2Dto1D_shl_32x32_sve)
     bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
     cpy2Dto1D_shl_start_sve
     mov             w12, #16
-.loop_cpy2Dto1D_shl_32_sve:
+.Loop_cpy2Dto1D_shl_32_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], x2
@@ -895,7 +895,7 @@ function PFX(cpy2Dto1D_shl_32x32_sve)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_32_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shl_32_sve
     ret
 .vl_gt_16_cpy2Dto1D_shl_32x32:
     cmp             x9, #48
@@ -931,7 +931,7 @@ function PFX(cpy2Dto1D_shl_64x64_sve)
     cpy2Dto1D_shl_start_sve
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy2Dto1D_shl_64_sve:
+.Loop_cpy2Dto1D_shl_64_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -947,7 +947,7 @@ function PFX(cpy2Dto1D_shl_64x64_sve)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_64_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shl_64_sve
     ret
 .vl_gt_16_cpy2Dto1D_shl_64x64:
     dup             z0.h, w3
@@ -1055,7 +1055,7 @@ function PFX(cpy2Dto1D_shr_32x32_sve)
     bgt             .vl_gt_16_cpy2Dto1D_shr_32x32
     cpy2Dto1D_shr_start
     mov             w12, #16
-.loop_cpy2Dto1D_shr_32_sve:
+.Loop_cpy2Dto1D_shr_32_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.8h-v5.8h}, [x1], x2
@@ -1069,7 +1069,7 @@ function PFX(cpy2Dto1D_shr_32x32_sve)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.8h-v5.8h}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shr_32_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shr_32_sve
     ret
 .vl_gt_16_cpy2Dto1D_shr_32x32:
     dup             z0.h, w3
@@ -1218,7 +1218,7 @@ function PFX(cpy1Dto2D_shr_16x16_sve)
     bgt             .vl_gt_16_cpy1Dto2D_shr_16x16
     cpy1Dto2D_shr_start
     mov             w12, #4
-.loop_cpy1Dto2D_shr_16:
+.Loop_cpy1Dto2D_shr_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.8h-v3.8h}, [x1], #32
@@ -1228,7 +1228,7 @@ function PFX(cpy1Dto2D_shr_16x16_sve)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.8h-v3.8h}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_16
+    cbnz            w12, .Loop_cpy1Dto2D_shr_16
     ret
 .vl_gt_16_cpy1Dto2D_shr_16x16:
     dup             z0.h, w3
@@ -1254,7 +1254,7 @@ function PFX(cpy1Dto2D_shr_32x32_sve)
     bgt             .vl_gt_16_cpy1Dto2D_shr_32x32
     cpy1Dto2D_shr_start
     mov             w12, #16
-.loop_cpy1Dto2D_shr_32_sve:
+.Loop_cpy1Dto2D_shr_32_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1268,7 +1268,7 @@ function PFX(cpy1Dto2D_shr_32x32_sve)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_32_sve
+    cbnz            w12, .Loop_cpy1Dto2D_shr_32_sve
     ret
 .vl_gt_16_cpy1Dto2D_shr_32x32:
     dup             z0.h, w3
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 495ee7ea2..1ad371c57 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -86,7 +86,7 @@ function PFX(blockcopy_sp_32x32_neon)
     lsl             x3, x3, #1
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp32:
+.Loop_csp32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], x3
@@ -98,7 +98,7 @@ function PFX(blockcopy_sp_32x32_neon)
     st1             {v0.16b-v1.16b}, [x0], x1
     st1             {v2.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp32
+    cbnz            w12, .Loop_csp32
     ret
 endfunc
 
@@ -108,7 +108,7 @@ function PFX(blockcopy_sp_64x64_neon)
     sub             x3, x3, #64
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp64:
+.Loop_csp64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], #64
@@ -119,7 +119,7 @@ function PFX(blockcopy_sp_64x64_neon)
     tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp64
+    cbnz            w12, .Loop_csp64
     ret
 endfunc
 
@@ -168,7 +168,7 @@ endfunc
 function PFX(blockcopy_ps_32x32_neon)
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_cps32:
+.Loop_cps32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -184,7 +184,7 @@ function PFX(blockcopy_ps_32x32_neon)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32
+    cbnz            w12, .Loop_cps32
     ret
 endfunc
 
@@ -192,7 +192,7 @@ function PFX(blockcopy_ps_64x64_neon)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_cps64:
+.Loop_cps64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x2], x3
@@ -207,7 +207,7 @@ function PFX(blockcopy_ps_64x64_neon)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps64
+    cbnz            w12, .Loop_cps64
     ret
 endfunc
 
@@ -252,13 +252,13 @@ function PFX(blockcopy_ss_32x32_neon)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #4
-.loop_css32:
+.Loop_css32:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32
+    cbnz            w12, .Loop_css32
     ret
 endfunc
 
@@ -268,7 +268,7 @@ function PFX(blockcopy_ss_64x64_neon)
     lsl             x3, x3, #1
     sub             x3, x3, #64
     mov             w12, #8
-.loop_css64:
+.Loop_css64:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], #64
@@ -276,7 +276,7 @@ function PFX(blockcopy_ss_64x64_neon)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css64
+    cbnz            w12, .Loop_css64
     ret
 endfunc
 
@@ -321,13 +321,13 @@ function PFX(blockcopy_ss_32x64_neon)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #8
-.loop_css32x64:
+.Loop_css32x64:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32x64
+    cbnz            w12, .Loop_css32x64
     ret
 endfunc
 
@@ -376,7 +376,7 @@ endfunc
 function PFX(blockcopy_ps_32x64_neon)
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_cps32x64:
+.Loop_cps32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -392,7 +392,7 @@ function PFX(blockcopy_ps_32x64_neon)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32x64
+    cbnz            w12, .Loop_cps32x64
     ret
 endfunc
 
@@ -443,7 +443,7 @@ function PFX(blockcopy_sp_32x64_neon)
     lsl             x3, x3, #1
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp32x64:
+.Loop_csp32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], x3
@@ -455,7 +455,7 @@ function PFX(blockcopy_sp_32x64_neon)
     st1             {v0.16b-v1.16b}, [x0], x1
     st1             {v2.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp32x64
+    cbnz            w12, .Loop_csp32x64
     ret
 endfunc
 
@@ -595,13 +595,13 @@ blockcopy_pp_8xN_neon 32
 
 function PFX(blockcopy_pp_8x64_neon)
     mov             w12, #4
-.loop_pp_8x64:
+.Loop_pp_8x64:
     sub             w12, w12, #1
 .rept 16
     ld1             {v0.4h}, [x2], x3
     st1             {v0.4h}, [x0], x1
 .endr
-    cbnz            w12, .loop_pp_8x64
+    cbnz            w12, .Loop_pp_8x64
     ret
 endfunc
 
@@ -623,13 +623,13 @@ blockcopy_pp_16xN_neon 16
 .macro blockcopy_pp_16xN1_neon h
 function PFX(blockcopy_pp_16x\h\()_neon)
     mov             w12, #\h / 8
-.loop_16x\h\():
+.Loop_16x\h\():
 .rept 8
     ld1             {v0.8h}, [x2], x3
     st1             {v0.8h}, [x0], x1
 .endr
     sub             w12, w12, #1
-    cbnz            w12, .loop_16x\h
+    cbnz            w12, .Loop_16x\h
     ret
 endfunc
 .endm
@@ -651,38 +651,38 @@ endfunc
 function PFX(blockcopy_pp_12x32_neon)
     sub             x1, x1, #8
     mov             w12, #4
-.loop_pp_12x32:
+.Loop_pp_12x32:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b}, [x2], x3
     str             d0, [x0], #8
     st1             {v0.s}[2], [x0], x1
 .endr
-    cbnz            w12, .loop_pp_12x32
+    cbnz            w12, .Loop_pp_12x32
     ret
 endfunc
 
 function PFX(blockcopy_pp_24x32_neon)
     mov             w12, #4
-.loop_24x32:
+.Loop_24x32:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8b-v2.8b}, [x2], x3
     st1             {v0.8b-v2.8b}, [x0], x1
 .endr
-    cbnz            w12, .loop_24x32
+    cbnz            w12, .Loop_24x32
     ret
 endfunc
 
 function PFX(blockcopy_pp_24x64_neon)
     mov             w12, #4
-.loop_24x64:
+.Loop_24x64:
     sub             w12, w12, #1
 .rept 16
     ld1             {v0.8b-v2.8b}, [x2], x3
     st1             {v0.8b-v2.8b}, [x0], x1
 .endr
-    cbnz            w12, .loop_24x64
+    cbnz            w12, .Loop_24x64
     ret
 endfunc
 
@@ -697,13 +697,13 @@ endfunc
 .macro blockcopy_pp_32xN_neon h
 function PFX(blockcopy_pp_32x\h\()_neon)
     mov             w12, #\h / 8
-.loop_32x\h\():
+.Loop_32x\h\():
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b-v1.16b}, [x2], x3
     st1             {v0.16b-v1.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_32x\h
+    cbnz            w12, .Loop_32x\h
     ret
 endfunc
 .endm
@@ -716,26 +716,26 @@ blockcopy_pp_32xN_neon 48
 
 function PFX(blockcopy_pp_48x64_neon)
     mov             w12, #8
-.loop_48x64:
+.Loop_48x64:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b-v2.16b}, [x2], x3
     st1             {v0.16b-v2.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_48x64
+    cbnz            w12, .Loop_48x64
     ret
 endfunc
 
 .macro blockcopy_pp_64xN_neon h
 function PFX(blockcopy_pp_64x\h\()_neon)
     mov             w12, #\h / 4
-.loop_64x\h\():
+.Loop_64x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x3
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_64x\h
+    cbnz            w12, .Loop_64x\h
     ret
 endfunc
 .endm
@@ -950,11 +950,11 @@ function PFX(count_nonzero_32_neon)
     trn1            v16.16b, v16.16b, v17.16b
     movi            v18.16b, #0
     mov             w12, #16
-.loop_count_nonzero_32:
+.Loop_count_nonzero_32:
     sub             w12, w12, #1
     COUNT_NONZERO_8
     add             v18.16b, v18.16b, v0.16b
-    cbnz            w12, .loop_count_nonzero_32
+    cbnz            w12, .Loop_count_nonzero_32
 
     uaddlv          s0, v18.8h
     fmov            w0, s0
@@ -994,7 +994,7 @@ endfunc
 function PFX(cpy2Dto1D_shl_16x16_neon)
     cpy2Dto1D_shl_start
     mov             w12, #4
-.loop_cpy2Dto1D_shl_16:
+.Loop_cpy2Dto1D_shl_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.16b-v3.16b}, [x1], x2
@@ -1002,14 +1002,14 @@ function PFX(cpy2Dto1D_shl_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.16b-v3.16b}, [x0], #32
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_16
+    cbnz            w12, .Loop_cpy2Dto1D_shl_16
     ret
 endfunc
 
 function PFX(cpy2Dto1D_shl_32x32_neon)
     cpy2Dto1D_shl_start
     mov             w12, #16
-.loop_cpy2Dto1D_shl_32:
+.Loop_cpy2Dto1D_shl_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], x2
@@ -1019,7 +1019,7 @@ function PFX(cpy2Dto1D_shl_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_32
+    cbnz            w12, .Loop_cpy2Dto1D_shl_32
     ret
 endfunc
 
@@ -1027,7 +1027,7 @@ function PFX(cpy2Dto1D_shl_64x64_neon)
     cpy2Dto1D_shl_start
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy2Dto1D_shl_64:
+.Loop_cpy2Dto1D_shl_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1043,7 +1043,7 @@ function PFX(cpy2Dto1D_shl_64x64_neon)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_64
+    cbnz            w12, .Loop_cpy2Dto1D_shl_64
     ret
 endfunc
 
@@ -1079,7 +1079,7 @@ endfunc
 function PFX(cpy2Dto1D_shr_16x16_neon)
     cpy2Dto1D_shr_start
     mov             w12, #4
-.loop_cpy2Dto1D_shr_16:
+.Loop_cpy2Dto1D_shr_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.8h-v3.8h}, [x1], x2
@@ -1089,14 +1089,14 @@ function PFX(cpy2Dto1D_shr_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.8h-v3.8h}, [x0], #32
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shr_16
+    cbnz            w12, .Loop_cpy2Dto1D_shr_16
     ret
 endfunc
 
 function PFX(cpy2Dto1D_shr_32x32_neon)
     cpy2Dto1D_shr_start
     mov             w12, #16
-.loop_cpy2Dto1D_shr_32:
+.Loop_cpy2Dto1D_shr_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.8h-v5.8h}, [x1], x2
@@ -1110,7 +1110,7 @@ function PFX(cpy2Dto1D_shr_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.8h-v5.8h}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shr_32
+    cbnz            w12, .Loop_cpy2Dto1D_shr_32
     ret
 endfunc
 
@@ -1147,7 +1147,7 @@ endfunc
 function PFX(cpy1Dto2D_shl_16x16_neon)
     cpy1Dto2D_shl_start
     mov             w12, #4
-.loop_cpy1Dto2D_shl_16:
+.Loop_cpy1Dto2D_shl_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.16b-v3.16b}, [x1], #32
@@ -1155,14 +1155,14 @@ function PFX(cpy1Dto2D_shl_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.16b-v3.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shl_16
+    cbnz            w12, .Loop_cpy1Dto2D_shl_16
     ret
 endfunc
 
 function PFX(cpy1Dto2D_shl_32x32_neon)
     cpy1Dto2D_shl_start
     mov             w12, #16
-.loop_cpy1Dto2D_shl_32:
+.Loop_cpy1Dto2D_shl_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1172,7 +1172,7 @@ function PFX(cpy1Dto2D_shl_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shl_32
+    cbnz            w12, .Loop_cpy1Dto2D_shl_32
     ret
 endfunc
 
@@ -1180,7 +1180,7 @@ function PFX(cpy1Dto2D_shl_64x64_neon)
     cpy1Dto2D_shl_start
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy1Dto2D_shl_64:
+.Loop_cpy1Dto2D_shl_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1196,7 +1196,7 @@ function PFX(cpy1Dto2D_shl_64x64_neon)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shl_64
+    cbnz            w12, .Loop_cpy1Dto2D_shl_64
     ret
 endfunc
 
@@ -1231,7 +1231,7 @@ endfunc
 function PFX(cpy1Dto2D_shr_16x16_neon)
     cpy1Dto2D_shr_start
     mov             w12, #4
-.loop_cpy1Dto2D_shr_16:
+.Loop_cpy1Dto2D_shr_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.8h-v3.8h}, [x1], #32
@@ -1241,14 +1241,14 @@ function PFX(cpy1Dto2D_shr_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.8h-v3.8h}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_16
+    cbnz            w12, .Loop_cpy1Dto2D_shr_16
     ret
 endfunc
 
 function PFX(cpy1Dto2D_shr_32x32_neon)
     cpy1Dto2D_shr_start
     mov             w12, #16
-.loop_cpy1Dto2D_shr_32:
+.Loop_cpy1Dto2D_shr_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1262,7 +1262,7 @@ function PFX(cpy1Dto2D_shr_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_32
+    cbnz            w12, .Loop_cpy1Dto2D_shr_32
     ret
 endfunc
 
@@ -1270,7 +1270,7 @@ function PFX(cpy1Dto2D_shr_64x64_neon)
     cpy1Dto2D_shr_start
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy1Dto2D_shr_64:
+.Loop_cpy1Dto2D_shr_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1294,6 +1294,6 @@ function PFX(cpy1Dto2D_shr_64x64_neon)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_64
+    cbnz            w12, .Loop_cpy1Dto2D_shr_64
     ret
 endfunc
diff --git a/source/common/aarch64/ipfilter-common.S b/source/common/aarch64/ipfilter-common.S
index b7c61ee64..a08c3c165 100644
--- a/source/common/aarch64/ipfilter-common.S
+++ b/source/common/aarch64/ipfilter-common.S
@@ -800,10 +800,10 @@
     mov             w12, #32
     dup             v31.8h, w12
     qpel_start_\v
-.loop_luma_vpp_\v\()_\w\()x\h:
+.Loop_luma_vpp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vpp_w8_\v\()_\w\()x\h:
+.Loop_luma_vpp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -833,11 +833,11 @@
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_luma_vpp_w8_\v\()_\w\()x\h
+    blt             .Loop_luma_vpp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vpp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vpp_\v\()_\w\()x\h
     ret
 .endm
 
@@ -854,10 +854,10 @@
     mov             w12, #8192
     dup             v31.8h, w12
     qpel_start_\v
-.loop_ps_\v\()_\w\()x\h:
+.Loop_ps_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_ps_w8_\v\()_\w\()x\h:
+.Loop_ps_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -885,11 +885,11 @@
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_ps_w8_\v\()_\w\()x\h
+    blt             .Loop_ps_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_ps_\v\()_\w\()x\h
+    cbnz            x5, .Loop_ps_\v\()_\w\()x\h
     ret
 .endm
 
@@ -914,10 +914,10 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_\v\()_1
-.loop_luma_vsp_\v\()_\w\()x\h:
+.Loop_luma_vsp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vsp_w8_\v\()_\w\()x\h:
+.Loop_luma_vsp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_load_64b \v
     qpel_filter_\v\()_32b_1
@@ -933,11 +933,11 @@
     add             x9, x9, #8
 .endif
     cmp             x9, x12
-    blt             .loop_luma_vsp_w8_\v\()_\w\()x\h
+    blt             .Loop_luma_vsp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vsp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vsp_\v\()_\w\()x\h
     ret
 .endm
 
@@ -957,10 +957,10 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_\v\()_1
-.loop_luma_vss_\v\()_\w\()x\h:
+.Loop_luma_vss_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vss_w8_\v\()_\w\()x\h:
+.Loop_luma_vss_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_load_64b \v
     qpel_filter_\v\()_32b_1
@@ -981,11 +981,11 @@
 .endif
 .endif
     cmp             x9, x12
-    blt             .loop_luma_vss_w8_\v\()_\w\()x\h
+    blt             .Loop_luma_vss_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vss_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vss_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1013,11 +1013,11 @@
 .endr
     ret
 .else
-.loop1_hpp_\v\()_\w\()x\h:
+.Loop1_hpp_\v\()_\w\()x\h:
     mov             x7, #\w
     mov             x11, x0
     sub             x11, x11, #4
-.loop2_hpp_\v\()_\w\()x\h:
+.Loop2_hpp_\v\()_\w\()x\h:
     vextin8 \v
     qpel_filter_\v\()_32b
     hpp_end
@@ -1031,11 +1031,11 @@
     str             s17, [x2], #4
     sub             x7, x7, #4
 .endif
-    cbnz            x7, .loop2_hpp_\v\()_\w\()x\h
+    cbnz            x7, .Loop2_hpp_\v\()_\w\()x\h
     sub             x6, x6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            x6, .loop1_hpp_\v\()_\w\()x\h
+    cbnz            x6, .Loop1_hpp_\v\()_\w\()x\h
     ret
 .endif
 .endm
@@ -1051,7 +1051,7 @@
     dup             v31.8h, w12
     qpel_start_\v
 .if \w == 4
-.loop_hps_\v\()_\w\()x\h\():
+.Loop_hps_\v\()_\w\()x\h\():
     mov             x11, x0
     sub             x11, x11, #4
     vextin8 \v
@@ -1061,14 +1061,14 @@
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop_hps_\v\()_\w\()x\h
+    cbnz            w6, .Loop_hps_\v\()_\w\()x\h
     ret
 .else
-.loop1_hps_\v\()_\w\()x\h\():
+.Loop1_hps_\v\()_\w\()x\h\():
     mov             w7, #\w
     mov             x11, x0
     sub             x11, x11, #4
-.loop2_hps_\v\()_\w\()x\h\():
+.Loop2_hps_\v\()_\w\()x\h\():
 .if \w == 8 || \w == 12 || \w == 24
     vextin8 \v
     qpel_filter_\v\()_32b
@@ -1092,11 +1092,11 @@
     sub             w7, w7, #16
     sub             x11, x11, #16
 .endif
-    cbnz            w7, .loop2_hps_\v\()_\w\()x\h
+    cbnz            w7, .Loop2_hps_\v\()_\w\()x\h
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop1_hps_\v\()_\w\()x\h
+    cbnz            w6, .Loop1_hps_\v\()_\w\()x\h
     ret
 .endif
 .endm
@@ -1107,10 +1107,10 @@
     dup             v31.8h, w12
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_chroma_vpp_\v\()_\w\()x\h:
+.Loop_chroma_vpp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_chroma_vpp_w8_\v\()_\w\()x\h:
+.Loop_chroma_vpp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b \v
     qpel_filter_chroma_\v\()_32b
@@ -1137,11 +1137,11 @@
     str             d17, [x7], #8
 .endif
     cmp             x9, #\w
-    blt             .loop_chroma_vpp_w8_\v\()_\w\()x\h
+    blt             .Loop_chroma_vpp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_chroma_vpp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_chroma_vpp_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1152,10 +1152,10 @@
     lsl             x3, x3, #1
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_vps_\v\()_\w\()x\h:
+.Loop_vps_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_vps_w8_\v\()_\w\()x\h:
+.Loop_vps_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b \v
     qpel_filter_chroma_\v\()_32b
@@ -1180,12 +1180,12 @@
     str             q17, [x7], #16
 .endif
     cmp             x9, #\w
-    blt             .loop_vps_w8_\v\()_\w\()x\h
+    blt             .Loop_vps_w8_\v\()_\w\()x\h
 
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vps_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vps_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1200,10 +1200,10 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_chroma_\v\()_1
-.loop_vsp_\v\()_\w\()x\h:
+.Loop_vsp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_vsp_w8_\v\()_\w\()x\h:
+.Loop_vsp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_64b \v
     qpel_filter_chroma_\v\()_32b_1
@@ -1223,11 +1223,11 @@
     str             d17, [x7], #8
 .endif
     cmp             x9, x12
-    blt             .loop_vsp_w8_\v\()_\w\()x\h
+    blt             .Loop_vsp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vsp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vsp_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1239,7 +1239,7 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_chroma_\v\()_1
-.loop_vss_\v\()_\w\()x\h:
+.Loop_vss_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
 .if \w == 4
@@ -1252,7 +1252,7 @@
     add             x9, x9, #4
 .endr
 .else
-.loop_vss_w8_\v\()_\w\()x\h:
+.Loop_vss_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_64b \v
     qpel_filter_chroma_\v\()_32b_1
@@ -1268,12 +1268,12 @@
     add             x9, x9, #8
 .endif
     cmp             x9, x12
-    blt             .loop_vss_w8_\v\()_\w\()x\h
+    blt             .Loop_vss_w8_\v\()_\w\()x\h
 .endif
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vss_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vss_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1284,7 +1284,7 @@
     mov             w6, #\h
     sub             x3, x3, #\w
 .if \w == 2 || \w == 4 || \w == 6 || \w == 12
-.loop4_chroma_hpp_\v\()_\w\()x\h:
+.Loop4_chroma_hpp_\v\()_\w\()x\h:
     mov             x11, x0
     sub             x11, x11, #2
     vextin8_chroma \v
@@ -1310,15 +1310,15 @@
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop4_chroma_hpp_\v\()_\w\()x\h
+    cbnz            w6, .Loop4_chroma_hpp_\v\()_\w\()x\h
     ret
 .else
-.loop2_chroma_hpp_\v\()_\w\()x\h:
+.Loop2_chroma_hpp_\v\()_\w\()x\h:
     mov             x7, #\w
     lsr             x7, x7, #3
     mov             x11, x0
     sub             x11, x11, #2
-.loop3_chroma_hpp_\v\()_\w\()x\h:
+.Loop3_chroma_hpp_\v\()_\w\()x\h:
 .if \w == 8 || \w == 24
     vextin8_chroma \v
     qpel_filter_chroma_\v\()_32b
@@ -1336,11 +1336,11 @@
     sub             x7, x7, #2
     sub             x11, x11, #16
 .endif
-    cbnz            x7, .loop3_chroma_hpp_\v\()_\w\()x\h
+    cbnz            x7, .Loop3_chroma_hpp_\v\()_\w\()x\h
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop2_chroma_hpp_\v\()_\w\()x\h
+    cbnz            w6, .Loop2_chroma_hpp_\v\()_\w\()x\h
     ret
 .endif
 .endm
@@ -1397,12 +1397,12 @@
     add             w10, w10, #3
 9:
     mov             w6, w10
-.loop1_chroma_hps_\v\()_\w\()x\h\():
+.Loop1_chroma_hps_\v\()_\w\()x\h\():
     mov             x7, #\w
     lsr             x7, x7, #3
     mov             x11, x0
     sub             x11, x11, #2
-.loop2_chroma_hps_\v\()_\w\()x\h\():
+.Loop2_chroma_hps_\v\()_\w\()x\h\():
 .if \w == 8 || \w == 24
     vextin8_chroma \v
     qpel_filter_chroma_\v\()_32b
@@ -1419,11 +1419,11 @@
     sub             x7, x7, #2
     sub             x11, x11, #16
 .endif
-    cbnz            x7, .loop2_chroma_hps_\v\()_\w\()x\h\()
+    cbnz            x7, .Loop2_chroma_hps_\v\()_\w\()x\h\()
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop1_chroma_hps_\v\()_\w\()x\h\()
+    cbnz            w6, .Loop1_chroma_hps_\v\()_\w\()x\h\()
     ret
 .endif
 .endm
diff --git a/source/common/aarch64/ipfilter-sve2.S b/source/common/aarch64/ipfilter-sve2.S
index 95657db55..525ed1172 100644
--- a/source/common/aarch64/ipfilter-sve2.S
+++ b/source/common/aarch64/ipfilter-sve2.S
@@ -370,10 +370,10 @@
     cmp             x9, #16
     bgt             .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h
     qpel_start_\v
-.loop_luma_vpp_sve2_\v\()_\w\()x\h:
+.Loop_luma_vpp_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
+.Loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -403,11 +403,11 @@
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vpp_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vpp_sve2_\v\()_\w\()x\h
     ret
 .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h:
     ptrue           p0.h, vl8
@@ -522,7 +522,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_sve2
     ld1rd           {z22.d}, p0/z, [x12, #48]
     ld1rd           {z23.d}, p0/z, [x12, #56]
 
-.loop_vps_sve2_4x\h:
+.Loop_vps_sve2_4x\h:
     mov             x6, x0
 
     ld1b            {z0.s}, p0/z, [x6]
@@ -557,7 +557,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_sve2
 
     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vps_sve2_4x\h
+    cbnz            x4, .Loop_vps_sve2_4x\h
     ret
 endfunc
 .endm
@@ -593,7 +593,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_sve2
     ld1rd           {z22.d}, p0/z, [x12, #48]
     ld1rd           {z23.d}, p0/z, [x12, #56]
 
-.loop_vsp_sve2_4x\h:
+.Loop_vsp_sve2_4x\h:
     mov             x6, x0
 
     ld1             {v0.8b}, [x6], x1
@@ -630,7 +630,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_sve2
 
     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vsp_sve2_4x\h
+    cbnz            x4, .Loop_vsp_sve2_4x\h
     ret
 endfunc
 .endm
@@ -654,10 +654,10 @@ LUMA_VSP_4xN_SVE2 16
     cmp             x14, #16
     bgt             .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h
     qpel_start_\v
-.loop_ps_sve2_\v\()_\w\()x\h:
+.Loop_ps_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_ps_w8_sve2_\v\()_\w\()x\h:
+.Loop_ps_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -685,11 +685,11 @@ LUMA_VSP_4xN_SVE2 16
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_ps_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_ps_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_ps_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_ps_sve2_\v\()_\w\()x\h
     ret
 .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h:
     ptrue           p0.h, vl8
@@ -796,10 +796,10 @@ LUMA_VPS_SVE2 64, 48
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_\v\()_1
-.loop_luma_vss_sve2_\v\()_\w\()x\h:
+.Loop_luma_vss_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
+.Loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_load_64b \v
     qpel_filter_\v\()_32b_1
@@ -820,11 +820,11 @@ LUMA_VPS_SVE2 64, 48
 .endif
 .endif
     cmp             x9, x12
-    blt             .loop_luma_vss_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_luma_vss_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vss_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vss_sve2_\v\()_\w\()x\h
     ret
 .endm
 
@@ -884,10 +884,10 @@ LUMA_VSS_SVE2 48, 64
     mov             z31.h, #32
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_chroma_vpp_sve2_\v\()_\w\()x\h:
+.Loop_chroma_vpp_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
+.Loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b_sve2 \v
     qpel_filter_chroma_sve2_\v\()_32b
@@ -914,11 +914,11 @@ LUMA_VSS_SVE2 48, 64
     str             d17, [x7], #8
 .endif
     cmp             x9, #\w
-    blt             .loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_chroma_vpp_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_chroma_vpp_sve2_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1008,10 +1008,10 @@ CHROMA_VPP_SVE2 48, 64
     lsl             x3, x3, #1
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_vps_sve2_\v\()_\w\()x\h:
+.Loop_vps_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_vps_w8_sve2_\v\()_\w\()x\h:
+.Loop_vps_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b_sve2 \v
     qpel_filter_chroma_sve2_\v\()_32b
@@ -1036,12 +1036,12 @@ CHROMA_VPP_SVE2 48, 64
     str             q17, [x7], #16
 .endif
     cmp             x9, #\w
-    blt             .loop_vps_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_vps_w8_sve2_\v\()_\w\()x\h
 
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vps_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vps_sve2_\v\()_\w\()x\h
     ret
 .endm
 
@@ -1170,7 +1170,7 @@ CHROMA_VPS_SVE2 48, 64
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_chroma_sve2_\v\()_1
-.loop_vss_sve2_\v\()_\w\()x\h:
+.Loop_vss_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
 .if \w == 4
@@ -1183,7 +1183,7 @@ CHROMA_VPS_SVE2 48, 64
     add             x9, x9, #4
 .endr
 .else
-.loop_vss_w8_sve2_\v\()_\w\()x\h:
+.Loop_vss_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_64b \v
     qpel_filter_chroma_\v\()_32b_1
@@ -1199,12 +1199,12 @@ CHROMA_VPS_SVE2 48, 64
     add             x9, x9, #8
 .endif
     cmp             x9, x12
-    blt             .loop_vss_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_vss_w8_sve2_\v\()_\w\()x\h
 .endif
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vss_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vss_sve2_\v\()_\w\()x\h
     ret
 .endm
 
diff --git a/source/common/aarch64/ipfilter.S b/source/common/aarch64/ipfilter.S
index 80624862d..228ffae29 100644
--- a/source/common/aarch64/ipfilter.S
+++ b/source/common/aarch64/ipfilter.S
@@ -85,7 +85,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     ushll           v3.8h, v3.8b, #0
 
     mov             x9, #\h
-.loop_4x\h:
+.Loop_4x\h:
     ld1             {v4.s}[0], [x0], x1
     ld1             {v4.s}[1], [x0], x1
     ushll           v4.8h, v4.8b, #0
@@ -124,7 +124,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     st1             {v16.s}[1], [x2], x3
 
     sub             x9, x9, #2
-    cbnz            x9, .loop_4x\h
+    cbnz            x9, .Loop_4x\h
     ret
 endfunc
 .endm
@@ -202,7 +202,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon
     ld1r            {v22.2d}, [x12], #8
     ld1r            {v23.2d}, [x12], #8
 
-.loop_vps_4x\h:
+.Loop_vps_4x\h:
     mov             x6, x0
 
     ld1             {v0.s}[0], [x6], x1
@@ -252,7 +252,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon
 
     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vps_4x\h
+    cbnz            x4, .Loop_vps_4x\h
     ret
 endfunc
 .endm
@@ -331,7 +331,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon
     ld1r            {v21.2d}, [x12], #8
     ld1r            {v22.2d}, [x12], #8
     ld1r            {v23.2d}, [x12], #8
-.loop_vsp_4x\h:
+.Loop_vsp_4x\h:
     mov             x6, x0
 
     ld1             {v0.8b}, [x6], x1
@@ -368,7 +368,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon
 
     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vsp_4x\h
+    cbnz            x4, .Loop_vsp_4x\h
     ret
 endfunc
 .endm
diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index 704bdaed0..e4540ce9b 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -219,7 +219,7 @@ function PFX(pixel_avg_pp_48x64_sve2)
     mov             x11, #0
     whilelt         p0.b, x11, x10
     mov             w12, #8
-.loop_gt_32_pixel_avg_pp_48x64:
+.Loop_gt_32_pixel_avg_pp_48x64:
     sub             w12, w12, #1
 .rept 8
     ld1b            {z0.b}, p0/z, [x2]
@@ -230,7 +230,7 @@ function PFX(pixel_avg_pp_48x64_sve2)
     st1b            {z0.b}, p0, [x0]
     add             x0, x0, x1
 .endr
-    cbnz            w12, .loop_gt_32_pixel_avg_pp_48x64
+    cbnz            w12, .Loop_gt_32_pixel_avg_pp_48x64
     ret
 endfunc
 
@@ -339,7 +339,7 @@ function PFX(addAvg_6x\h\()_sve2)
     mov             w12, #\h / 2
     ptrue           p0.b, vl16
     ptrue           p2.h, vl6
-.loop_sve2_addavg_6x\h\():
+.Loop_sve2_addavg_6x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -359,7 +359,7 @@ function PFX(addAvg_6x\h\()_sve2)
     add             x2, x2, x5
     st1b            {z2.h}, p2, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_addavg_6x\h
+    cbnz            w12, .Loop_sve2_addavg_6x\h
     ret
 endfunc
 .endm
@@ -398,7 +398,7 @@ endfunc
 function PFX(addAvg_8x\h\()_sve2)
     mov             w12, #\h / 2
     ptrue           p0.b, vl16
-.loop_sve2_addavg_8x\h\():
+.Loop_sve2_addavg_8x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -418,7 +418,7 @@ function PFX(addAvg_8x\h\()_sve2)
     add             x2, x2, x5
     st1b            {z2.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_addavg_8x\h
+    cbnz            w12, .Loop_sve2_addavg_8x\h
     ret
 endfunc
 .endm
@@ -440,7 +440,7 @@ function PFX(addAvg_12x\h\()_sve2)
     bgt             .vl_gt_16_addAvg_12x\h
     ptrue           p0.b, vl16
     ptrue           p1.b, vl8
-.loop_sve2_addavg_12x\h\():
+.Loop_sve2_addavg_12x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -457,13 +457,13 @@ function PFX(addAvg_12x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z2.h}, p1, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_addavg_12x\h
+    cbnz            w12, .Loop_sve2_addavg_12x\h
     ret
 .vl_gt_16_addAvg_12x\h\():
     mov             x10, #24
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_sve2_gt_16_addavg_12x\h\():
+.Loop_sve2_gt_16_addavg_12x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -476,7 +476,7 @@ function PFX(addAvg_12x\h\()_sve2)
     add             z2.b, z2.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_gt_16_addavg_12x\h
+    cbnz            w12, .Loop_sve2_gt_16_addavg_12x\h
     ret
 endfunc
 .endm
@@ -491,7 +491,7 @@ function PFX(addAvg_16x\h\()_sve2)
     cmp             x9, #16
     bgt             .vl_gt_16_addAvg_16x\h
     ptrue           p0.b, vl16
-.loop_eq_16_sve2_addavg_16x\h\():
+.Loop_eq_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -508,13 +508,13 @@ function PFX(addAvg_16x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z2.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_16x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
     ret
 .vl_gt_16_addAvg_16x\h\():
     cmp             x9, #32
     bgt             .vl_gt_32_addAvg_16x\h
     ptrue           p0.b, vl32
-.loop_gt_16_sve2_addavg_16x\h\():
+.Loop_gt_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -525,13 +525,13 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p1, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_16_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
     ret
 .vl_gt_32_addAvg_16x\h\():
     mov             x10, #48
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_gt_32_sve2_addavg_16x\h\():
+.Loop_gt_32_sve2_addavg_16x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     add             x0, x0, x3, lsl #1
@@ -541,7 +541,7 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_32_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
     ret
 endfunc
 .endm
@@ -561,7 +561,7 @@ function PFX(addAvg_24x\h\()_sve2)
     cmp             x9, #16
     bgt             .vl_gt_16_addAvg_24x\h
     addAvg_start
-.loop_eq_16_sve2_addavg_24x\h\():
+.Loop_eq_16_sve2_addavg_24x\h\():
     sub             w12, w12, #1
     ld1             {v0.16b-v2.16b}, [x0], x3
     ld1             {v3.16b-v5.16b}, [x1], x4
@@ -572,14 +572,14 @@ function PFX(addAvg_24x\h\()_sve2)
     sqxtun          v1.8b, v1.8h
     sqxtun          v2.8b, v2.8h
     st1             {v0.8b-v2.8b}, [x2], x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_24x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_24x\h
     ret
 .vl_gt_16_addAvg_24x\h\():
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_24x\h
     ptrue           p0.b, vl32
     ptrue           p1.b, vl16
-.loop_gt_16_sve2_addavg_24x\h\():
+.Loop_gt_16_sve2_addavg_24x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
@@ -596,13 +596,13 @@ function PFX(addAvg_24x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p1, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_16_sve2_addavg_24x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_24x\h
     ret
 .vl_gt_48_addAvg_24x\h\():
     mov             x10, #48
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_gt_48_sve2_addavg_24x\h\():
+.Loop_gt_48_sve2_addavg_24x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z2.b}, p0/z, [x1]
@@ -613,7 +613,7 @@ function PFX(addAvg_24x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_48_sve2_addavg_24x\h
+    cbnz            w12, .Loop_gt_48_sve2_addavg_24x\h
     ret
 endfunc
 .endm
@@ -628,7 +628,7 @@ function PFX(addAvg_32x\h\()_sve2)
     cmp             x9, #16
     bgt             .vl_gt_16_addAvg_32x\h
     ptrue           p0.b, vl16
-.loop_eq_16_sve2_addavg_32x\h\():
+.Loop_eq_16_sve2_addavg_32x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -657,13 +657,13 @@ function PFX(addAvg_32x\h\()_sve2)
     st1b            {z2.h}, p0, [x2, #2, mul vl]
     st1b            {z3.h}, p0, [x2, #3, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_32x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_32x\h
     ret
 .vl_gt_16_addAvg_32x\h\():
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_32x\h
     ptrue           p0.b, vl32
-.loop_gt_eq_32_sve2_addavg_32x\h\():
+.Loop_gt_eq_32_sve2_addavg_32x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -680,11 +680,11 @@ function PFX(addAvg_32x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_32_sve2_addavg_32x\h
+    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_32x\h
     ret
 .vl_gt_48_addAvg_32x\h\():
     ptrue           p0.b, vl64
-.loop_eq_64_sve2_addavg_32x\h\():
+.Loop_eq_64_sve2_addavg_32x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -695,7 +695,7 @@ function PFX(addAvg_32x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_eq_64_sve2_addavg_32x\h
+    cbnz            w12, .Loop_eq_64_sve2_addavg_32x\h
     ret
 endfunc
 .endm
@@ -715,7 +715,7 @@ function PFX(addAvg_48x64_sve2)
     addAvg_start
     sub             x3, x3, #64
     sub             x4, x4, #64
-.loop_eq_16_sve2_addavg_48x64:
+.Loop_eq_16_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -734,13 +734,13 @@ function PFX(addAvg_48x64_sve2)
     sqxtun          v2.8b, v20.8h
     sqxtun2         v2.16b, v21.8h
     st1             {v0.16b-v2.16b}, [x2], x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_48x64
+    cbnz            w12, .Loop_eq_16_sve2_addavg_48x64
     ret
 .vl_gt_16_addAvg_48x64:
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_48x64
     ptrue           p0.b, vl32
-.loop_gt_eq_32_sve2_addavg_48x64:
+.Loop_gt_eq_32_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -763,14 +763,14 @@ function PFX(addAvg_48x64_sve2)
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     st1b            {z2.h}, p0, [x2, #2, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_32_sve2_addavg_48x64
+    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_48x64
     ret
 .vl_gt_48_addAvg_48x64:
     cmp             x9, #112
     bgt             .vl_gt_112_addAvg_48x64
     ptrue           p0.b, vl64
     ptrue           p1.b, vl32
-.loop_gt_48_sve2_addavg_48x64:
+.Loop_gt_48_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
@@ -787,13 +787,13 @@ function PFX(addAvg_48x64_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p1, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_48_sve2_addavg_48x64
+    cbnz            w12, .Loop_gt_48_sve2_addavg_48x64
     ret
 .vl_gt_112_addAvg_48x64:
     mov             x10, #96
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_gt_112_sve2_addavg_48x64:
+.Loop_gt_112_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z4.b}, p0/z, [x1]
@@ -804,7 +804,7 @@ function PFX(addAvg_48x64_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_112_sve2_addavg_48x64
+    cbnz            w12, .Loop_gt_112_sve2_addavg_48x64
     ret
 endfunc
 
@@ -817,7 +817,7 @@ function PFX(addAvg_64x\h\()_sve2)
     addAvg_start
     sub             x3, x3, #64
     sub             x4, x4, #64
-.loop_eq_16_sve2_addavg_64x\h\():
+.Loop_eq_16_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -840,13 +840,13 @@ function PFX(addAvg_64x\h\()_sve2)
     sqxtun          v3.8b, v22.8h
     sqxtun2         v3.16b, v23.8h
     st1             {v0.16b-v3.16b}, [x2], x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_64x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_64x\h
     ret
 .vl_gt_16_addAvg_64x\h\():
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_64x\h
     ptrue           p0.b, vl32
-.loop_gt_eq_32_sve2_addavg_64x\h\():
+.Loop_gt_eq_32_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -875,13 +875,13 @@ function PFX(addAvg_64x\h\()_sve2)
     st1b            {z2.h}, p0, [x2, #2, mul vl]
     st1b            {z3.h}, p0, [x2, #3, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_32_sve2_addavg_64x\h
+    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_64x\h
     ret
 .vl_gt_48_addAvg_64x\h\():
     cmp             x9, #112
     bgt             .vl_gt_112_addAvg_64x\h
     ptrue           p0.b, vl64
-.loop_gt_eq_48_sve2_addavg_64x\h\():
+.Loop_gt_eq_48_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -898,11 +898,11 @@ function PFX(addAvg_64x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_48_sve2_addavg_64x\h
+    cbnz            w12, .Loop_gt_eq_48_sve2_addavg_64x\h
     ret
 .vl_gt_112_addAvg_64x\h\():
     ptrue           p0.b, vl128
-.loop_gt_eq_128_sve2_addavg_64x\h\():
+.Loop_gt_eq_128_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z4.b}, p0/z, [x1]
@@ -913,7 +913,7 @@ function PFX(addAvg_64x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_128_sve2_addavg_64x\h
+    cbnz            w12, .Loop_gt_eq_128_sve2_addavg_64x\h
     ret
 endfunc
 .endm
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index d122b8bb3..8c2878b3e 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -283,7 +283,7 @@ function PFX(addAvg_6x\h\()_neon)
     addAvg_start
     mov             w12, #\h / 2
     sub             x5, x5, #4
-.loop_addavg_6x\h:
+.Loop_addavg_6x\h:
     sub             w12, w12, #1
     ld1             {v0.16b}, [x0], x3
     ld1             {v1.16b}, [x1], x4
@@ -305,7 +305,7 @@ function PFX(addAvg_6x\h\()_neon)
     st1             {v0.h}[2], [x2], x5
     str             s1, [x2], #4
     st1             {v1.h}[2], [x2], x5
-    cbnz            w12, .loop_addavg_6x\h
+    cbnz            w12, .Loop_addavg_6x\h
     ret
 endfunc
 .endm
@@ -344,7 +344,7 @@ endfunc
 function PFX(addAvg_8x\h\()_neon)
     addAvg_start
     mov             w12, #\h / 2
-.loop_addavg_8x\h:
+.Loop_addavg_8x\h:
     sub             w12, w12, #1
     ld1             {v0.16b}, [x0], x3
     ld1             {v1.16b}, [x1], x4
@@ -364,7 +364,7 @@ function PFX(addAvg_8x\h\()_neon)
     sqxtun          v1.8b, v1.8h
     st1             {v0.8b}, [x2], x5
     st1             {v1.8b}, [x2], x5
-    cbnz            w12, .loop_addavg_8x\h
+    cbnz            w12, .Loop_addavg_8x\h
     ret
 endfunc
 .endm
@@ -385,7 +385,7 @@ function PFX(addAvg_12x\h\()_neon)
     sub             x4, x4, #16
     sub             x5, x5, #8
     mov             w12, #\h
-.loop_addAvg_12X\h\():
+.Loop_addAvg_12X\h\():
     sub             w12, w12, #1
     ld1             {v0.16b}, [x0], #16
     ld1             {v1.16b}, [x1], #16
@@ -403,7 +403,7 @@ function PFX(addAvg_12x\h\()_neon)
     sqxtun          v1.8b, v1.8h
     st1             {v0.8b}, [x2], #8
     st1             {v1.s}[0], [x2], x5
-    cbnz            w12, .loop_addAvg_12X\h
+    cbnz            w12, .Loop_addAvg_12X\h
     ret
 endfunc
 .endm
@@ -415,7 +415,7 @@ addAvg_12xN 32
 function PFX(addAvg_16x\h\()_neon)
     addAvg_start
     mov             w12, #\h
-.loop_addavg_16x\h:
+.Loop_addavg_16x\h:
     sub             w12, w12, #1
     ld1             {v0.8h-v1.8h}, [x0], x3
     ld1             {v2.8h-v3.8h}, [x1], x4
@@ -424,7 +424,7 @@ function PFX(addAvg_16x\h\()_neon)
     sqxtun          v0.8b, v0.8h
     sqxtun2         v0.16b, v1.8h
     st1             {v0.16b}, [x2], x5
-    cbnz            w12, .loop_addavg_16x\h
+    cbnz            w12, .Loop_addavg_16x\h
     ret
 endfunc
 .endm
@@ -441,7 +441,7 @@ addAvg_16xN 64
 function PFX(addAvg_24x\h\()_neon)
     addAvg_start
     mov             w12, #\h
-.loop_addavg_24x\h\():
+.Loop_addavg_24x\h\():
     sub             w12, w12, #1
     ld1             {v0.16b-v2.16b}, [x0], x3
     ld1             {v3.16b-v5.16b}, [x1], x4
@@ -452,7 +452,7 @@ function PFX(addAvg_24x\h\()_neon)
     sqxtun          v1.8b, v1.8h
     sqxtun          v2.8b, v2.8h
     st1             {v0.8b-v2.8b}, [x2], x5
-    cbnz            w12, .loop_addavg_24x\h
+    cbnz            w12, .Loop_addavg_24x\h
     ret
 endfunc
 .endm
@@ -464,7 +464,7 @@ addAvg_24xN 64
 function PFX(addAvg_32x\h\()_neon)
     addAvg_start
     mov             w12, #\h
-.loop_addavg_32x\h\():
+.Loop_addavg_32x\h\():
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], x3
     ld1             {v4.8h-v7.8h}, [x1], x4
@@ -477,7 +477,7 @@ function PFX(addAvg_32x\h\()_neon)
     sqxtun          v2.8b, v2.8h
     sqxtun          v3.8b, v3.8h
     st1             {v0.8b-v3.8b}, [x2], x5
-    cbnz            w12, .loop_addavg_32x\h
+    cbnz            w12, .Loop_addavg_32x\h
     ret
 endfunc
 .endm
@@ -494,7 +494,7 @@ function PFX(addAvg_48x64_neon)
     sub             x3, x3, #64
     sub             x4, x4, #64
     mov             w12, #64
-.loop_addavg_48x64:
+.Loop_addavg_48x64:
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -513,7 +513,7 @@ function PFX(addAvg_48x64_neon)
     sqxtun          v2.8b, v20.8h
     sqxtun2         v2.16b, v21.8h
     st1             {v0.16b-v2.16b}, [x2], x5
-    cbnz            w12, .loop_addavg_48x64
+    cbnz            w12, .Loop_addavg_48x64
     ret
 endfunc
 
@@ -523,7 +523,7 @@ function PFX(addAvg_64x\h\()_neon)
     mov             w12, #\h
     sub             x3, x3, #64
     sub             x4, x4, #64
-.loop_addavg_64x\h\():
+.Loop_addavg_64x\h\():
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -546,7 +546,7 @@ function PFX(addAvg_64x\h\()_neon)
     sqxtun          v3.8b, v22.8h
     sqxtun2         v3.16b, v23.8h
     st1             {v0.16b-v3.16b}, [x2], x5
-    cbnz            w12, .loop_addavg_64x\h
+    cbnz            w12, .Loop_addavg_64x\h
     ret
 endfunc
 .endm
diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index dc32df2e6..85bb14b3d 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -204,7 +204,7 @@ function PFX(filterPixelToShort_32x\h\()_sve)
 #else
     p2s_start
     mov             x9, #\h
-.loop_filter_sve_P2S_32x\h:
+.Loop_filter_sve_P2S_32x\h:
     sub             x9, x9, #1
     ld1             {v0.16b-v1.16b}, [x0], x1
     ushll           v22.8h, v0.8b,  #P2S_SHIFT
@@ -216,7 +216,7 @@ function PFX(filterPixelToShort_32x\h\()_sve)
     add             v24.8h, v24.8h, v31.8h
     add             v25.8h, v25.8h, v31.8h
     st1             {v22.16b-v25.16b}, [x2], x3
-    cbnz            x9, .loop_filter_sve_P2S_32x\h
+    cbnz            x9, .Loop_filter_sve_P2S_32x\h
     ret
 #endif
 endfunc
@@ -331,7 +331,7 @@ function PFX(filterPixelToShort_64x\h\()_sve)
     p2s_start
     sub             x3, x3, #64
     mov             x9, #\h
-.loop_filter_sve_P2S_64x\h:
+.Loop_filter_sve_P2S_64x\h:
     sub             x9, x9, #1
     ld1             {v0.16b-v3.16b}, [x0], x1
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
@@ -352,7 +352,7 @@ function PFX(filterPixelToShort_64x\h\()_sve)
     add             v23.8h, v23.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v23.16b}, [x2], x3
-    cbnz            x9, .loop_filter_sve_P2S_64x\h
+    cbnz            x9, .Loop_filter_sve_P2S_64x\h
     ret
 #endif
 endfunc
@@ -422,7 +422,7 @@ function PFX(filterPixelToShort_48x64_sve)
     p2s_start
     sub             x3, x3, #64
     mov             x9, #64
-.loop_filterP2S_sve_48x64:
+.Loop_filterP2S_sve_48x64:
     sub            x9, x9, #1
     ld1             {v0.16b-v2.16b}, [x0], x1
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
@@ -439,7 +439,7 @@ function PFX(filterPixelToShort_48x64_sve)
     add             v21.8h, v21.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v21.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_sve_48x64
+    cbnz            x9, .Loop_filterP2S_sve_48x64
     ret
 #endif
 endfunc
diff --git a/source/common/aarch64/p2s.S b/source/common/aarch64/p2s.S
index 58301c9bf..b15835b34 100644
--- a/source/common/aarch64/p2s.S
+++ b/source/common/aarch64/p2s.S
@@ -262,7 +262,7 @@ p2s_24xN 64
 function PFX(filterPixelToShort_32x\h\()_neon)
     p2s_start
     mov             x9, #\h
-.loop_filterP2S_32x\h:
+.Loop_filterP2S_32x\h:
     sub             x9, x9, #1
 #if HIGH_BIT_DEPTH
     ld1             {v0.16b-v3.16b}, [x0], x1
@@ -282,7 +282,7 @@ function PFX(filterPixelToShort_32x\h\()_neon)
     add             v24.8h, v24.8h, v31.8h
     add             v25.8h, v25.8h, v31.8h
     st1             {v22.16b-v25.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_32x\h
+    cbnz            x9, .Loop_filterP2S_32x\h
     ret
 endfunc
 .endm
@@ -302,7 +302,7 @@ function PFX(filterPixelToShort_64x\h\()_neon)
 #endif
     sub             x3, x3, #64
     mov             x9, #\h
-.loop_filterP2S_64x\h:
+.Loop_filterP2S_64x\h:
     sub             x9, x9, #1
 #if HIGH_BIT_DEPTH
     ld1             {v0.16b-v3.16b}, [x0], #64
@@ -336,7 +336,7 @@ function PFX(filterPixelToShort_64x\h\()_neon)
     add             v23.8h, v23.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v23.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_64x\h
+    cbnz            x9, .Loop_filterP2S_64x\h
     ret
 endfunc
 .endm
@@ -353,7 +353,7 @@ function PFX(filterPixelToShort_48x64_neon)
 #endif
     sub             x3, x3, #64
     mov             x9, #64
-.loop_filterP2S_48x64:
+.Loop_filterP2S_48x64:
     sub            x9, x9, #1
 #if HIGH_BIT_DEPTH
     ld1             {v0.16b-v3.16b}, [x0], #64
@@ -381,6 +381,6 @@ function PFX(filterPixelToShort_48x64_neon)
     add             v21.8h, v21.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v21.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_48x64
+    cbnz            x9, .Loop_filterP2S_48x64
     ret
 endfunc
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 715fcc1cb..c1d6b4129 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -333,7 +333,7 @@ function PFX(quant_sve)
     eor             w10, w10, w10
     eor             z17.d, z17.d, z17.d
 
-.loop_quant_sve:
+.Loop_quant_sve:
     ld1             {v18.4h}, [x0], #8
     ld1             {v7.4s}, [x1], #16
     sxtl            v6.4s, v18.4h
@@ -364,7 +364,7 @@ function PFX(quant_sve)
     st1             {v5.4h}, [x3], #8
 
     subs            w6, w6, #1
-    b.ne             .loop_quant_sve
+    b.ne             .Loop_quant_sve
 
     addv            s4, v4.4s
     mov             w9, v4.s[0]
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index dbd138f62..2af5d63c1 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -64,11 +64,11 @@ function PFX(pixel_var_16x16_sve2)
     bgt             .vl_gt_16_pixel_var_16x16
     pixel_var_start
     mov             w12, #16
-.loop_var_16_sve2:
+.Loop_var_16_sve2:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     pixel_var_1 v4
-    cbnz            w12, .loop_var_16_sve2
+    cbnz            w12, .Loop_var_16_sve2
     pixel_var_end
     ret
 .vl_gt_16_pixel_var_16x16:
@@ -95,12 +95,12 @@ function PFX(pixel_var_32x32_sve2)
     bgt             .vl_gt_16_pixel_var_32x32
     pixel_var_start
     mov             w12, #32
-.loop_var_32_sve2:
+.Loop_var_32_sve2:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
-    cbnz            w12, .loop_var_32_sve2
+    cbnz            w12, .Loop_var_32_sve2
     pixel_var_end
     ret
 .vl_gt_16_pixel_var_32x32:
@@ -150,14 +150,14 @@ function PFX(pixel_var_64x64_sve2)
     bgt             .vl_gt_16_pixel_var_64x64
     pixel_var_start
     mov             w12, #64
-.loop_var_64_sve2:
+.Loop_var_64_sve2:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
     pixel_var_1 v6
     pixel_var_1 v7
-    cbnz            w12, .loop_var_64_sve2
+    cbnz            w12, .Loop_var_64_sve2
     pixel_var_end
     ret
 .vl_gt_16_pixel_var_64x64:
@@ -268,7 +268,7 @@ function PFX(getResidual32_sve2)
     bgt             .vl_gt_16_getResidual32
     lsl             x4, x3, #1
     mov             w12, #4
-.loop_residual_32:
+.Loop_residual_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x0], x3
@@ -286,7 +286,7 @@ function PFX(getResidual32_sve2)
     st1             {v16.8h-v19.8h}, [x2], x4
     st1             {v20.8h-v23.8h}, [x2], x4
 .endr
-    cbnz            w12, .loop_residual_32
+    cbnz            w12, .Loop_residual_32
     ret
 .vl_gt_16_getResidual32:
     cmp             x9, #48
@@ -323,7 +323,7 @@ function PFX(pixel_sub_ps_32x32_sve2)
     bgt             .vl_gt_16_pixel_sub_ps_32x32
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_sub_ps_32_sve2:
+.Loop_sub_ps_32_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -341,7 +341,7 @@ function PFX(pixel_sub_ps_32x32_sve2)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32_sve2
+    cbnz            w12, .Loop_sub_ps_32_sve2
     ret
 .vl_gt_16_pixel_sub_ps_32x32:
     cmp             x9, #48
@@ -387,7 +387,7 @@ function PFX(pixel_sub_ps_64x64_sve2)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_sub_ps_64_sve2:
+.Loop_sub_ps_64_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x4
@@ -403,7 +403,7 @@ function PFX(pixel_sub_ps_64x64_sve2)
     st1             {v16.8h-v19.8h}, [x0], #64
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_64_sve2
+    cbnz            w12, .Loop_sub_ps_64_sve2
     ret
 .vl_gt_16_pixel_sub_ps_64x64:
     rdvl            x9, #1
@@ -473,7 +473,7 @@ function PFX(pixel_sub_ps_32x64_sve2)
     bgt             .vl_gt_16_pixel_sub_ps_32x64
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_sub_ps_32x64_sve2:
+.Loop_sub_ps_32x64_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -491,7 +491,7 @@ function PFX(pixel_sub_ps_32x64_sve2)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32x64_sve2
+    cbnz            w12, .Loop_sub_ps_32x64_sve2
     ret
 .vl_gt_16_pixel_sub_ps_32x64:
     cmp             x9, #48
@@ -609,7 +609,7 @@ pixel_add_ps_16xN_sve2 32
     bgt             .vl_gt_16_pixel_add_ps_32x\h
     lsl             x5, x5, #1
     mov             w12, #\h / 4
-.loop_add_ps__sve2_32x\h\():
+.Loop_add_ps__sve2_32x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -628,7 +628,7 @@ pixel_add_ps_16xN_sve2 32
     sqxtun2         v5.16b, v27.8h
     st1             {v4.16b-v5.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps__sve2_32x\h
+    cbnz            w12, .Loop_add_ps__sve2_32x\h
     ret
 .vl_gt_16_pixel_add_ps_32x\h\():
     cmp             x9, #48
@@ -1157,7 +1157,7 @@ function PFX(ssimDist16_sve2)
     bgt             .vl_gt_16_ssimDist16
     ssimDist_start
     ptrue           p0.s, vl4
-.loop_ssimDist16_sve2:
+.Loop_ssimDist16_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1171,7 +1171,7 @@ function PFX(ssimDist16_sve2)
     add             x2, x2, x3
     ssimDist_1_sve2 z4, z5, z8, z9
     ssimDist_1_sve2 z6, z7, z10, z11
-    cbnz            w12, .loop_ssimDist16_sve2
+    cbnz            w12, .Loop_ssimDist16_sve2
     ssimDist_end
     ret
 .vl_gt_16_ssimDist16:
@@ -1217,7 +1217,7 @@ function PFX(ssimDist32_sve2)
     bgt             .vl_gt_16_ssimDist32
     ssimDist_start
     ptrue           p0.s, vl4
-.loop_ssimDist32_sve2:
+.Loop_ssimDist32_sve2:
     sub             w12, w12, #1
     ld1b            {z2.s}, p0/z, [x0]
     ld1b            {z3.s}, p0/z, [x0, #1, mul vl]
@@ -1241,7 +1241,7 @@ function PFX(ssimDist32_sve2)
     ssimDist_1_sve2 z4, z5, z12, z13
     ssimDist_1_sve2 z6, z7, z14, z15
     ssimDist_1_sve2 z8, z9, z30, z31
-    cbnz            w12, .loop_ssimDist32_sve2
+    cbnz            w12, .Loop_ssimDist32_sve2
     ssimDist_end
     ret
 .vl_gt_16_ssimDist32:
@@ -1309,7 +1309,7 @@ function PFX(ssimDist64_sve2)
     bgt             .vl_gt_16_ssimDist64
     ssimDist_start
     ptrue           p0.s, vl4
-.loop_ssimDist64_sve2:
+.Loop_ssimDist64_sve2:
     sub             w12, w12, #1
     ld1b            {z2.s}, p0/z, [x0]
     ld1b            {z3.s}, p0/z, [x0, #1, mul vl]
@@ -1357,7 +1357,7 @@ function PFX(ssimDist64_sve2)
     ssimDist_1_sve2 z8, z9, z29, z30
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w12, .loop_ssimDist64_sve2
+    cbnz            w12, .Loop_ssimDist64_sve2
     ssimDist_end
     ret
 .vl_gt_16_ssimDist64:
@@ -1482,7 +1482,7 @@ function PFX(normFact16_sve2)
     bgt             .vl_gt_16_normFact16
     normFact_start
     ptrue           p0.s, vl4
-.loop_normFact16_sve2:
+.Loop_normFact16_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1491,7 +1491,7 @@ function PFX(normFact16_sve2)
     add             x0, x0, x1
     normFact_1_sve2 z4, z5
     normFact_1_sve2 z6, z7
-    cbnz            w12, .loop_normFact16_sve2
+    cbnz            w12, .Loop_normFact16_sve2
     normFact_end
     ret
 .vl_gt_16_normFact16:
@@ -1529,7 +1529,7 @@ function PFX(normFact32_sve2)
     bgt             .vl_gt_16_normFact32
     normFact_start
     ptrue           p0.s, vl4
-.loop_normFact32_sve2:
+.Loop_normFact32_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1544,7 +1544,7 @@ function PFX(normFact32_sve2)
     normFact_1_sve2 z6, z7
     normFact_1_sve2 z8, z9
     normFact_1_sve2 z10, z11
-    cbnz            w12, .loop_normFact32_sve2
+    cbnz            w12, .Loop_normFact32_sve2
     normFact_end
     ret
 .vl_gt_16_normFact32:
@@ -1599,7 +1599,7 @@ function PFX(normFact64_sve2)
     bgt             .vl_gt_16_normFact64
     normFact_start
     ptrue           p0.s, vl4
-.loop_normFact64_sve2:
+.Loop_normFact64_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1628,7 +1628,7 @@ function PFX(normFact64_sve2)
     normFact_1_sve2 z8, z9
     normFact_1_sve2 z10, z11
     add             x0, x0, x1
-    cbnz            w12, .loop_normFact64_sve2
+    cbnz            w12, .Loop_normFact64_sve2
     normFact_end
     ret
 .vl_gt_16_normFact64:
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 378c6891c..1df49ba6e 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -60,11 +60,11 @@ endfunc
 function PFX(pixel_var_16x16_neon)
     pixel_var_start
     mov             w12, #16
-.loop_var_16:
+.Loop_var_16:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     pixel_var_1 v4
-    cbnz            w12, .loop_var_16
+    cbnz            w12, .Loop_var_16
     pixel_var_end
     ret
 endfunc
@@ -72,12 +72,12 @@ endfunc
 function PFX(pixel_var_32x32_neon)
     pixel_var_start
     mov             w12, #32
-.loop_var_32:
+.Loop_var_32:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
-    cbnz            w12, .loop_var_32
+    cbnz            w12, .Loop_var_32
     pixel_var_end
     ret
 endfunc
@@ -85,14 +85,14 @@ endfunc
 function PFX(pixel_var_64x64_neon)
     pixel_var_start
     mov             w12, #64
-.loop_var_64:
+.Loop_var_64:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
     pixel_var_1 v6
     pixel_var_1 v7
-    cbnz            w12, .loop_var_64
+    cbnz            w12, .Loop_var_64
     pixel_var_end
     ret
 endfunc
@@ -148,7 +148,7 @@ endfunc
 function PFX(getResidual32_neon)
     lsl             x4, x3, #1
     mov             w12, #4
-.loop_residual_32:
+.Loop_residual_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x0], x3
@@ -166,7 +166,7 @@ function PFX(getResidual32_neon)
     st1             {v16.8h-v19.8h}, [x2], x4
     st1             {v20.8h-v23.8h}, [x2], x4
 .endr
-    cbnz            w12, .loop_residual_32
+    cbnz            w12, .Loop_residual_32
     ret
 endfunc
 
@@ -221,7 +221,7 @@ endfunc
 function PFX(pixel_sub_ps_32x32_neon)
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_sub_ps_32:
+.Loop_sub_ps_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -239,7 +239,7 @@ function PFX(pixel_sub_ps_32x32_neon)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32
+    cbnz            w12, .Loop_sub_ps_32
     ret
 endfunc
 
@@ -247,7 +247,7 @@ function PFX(pixel_sub_ps_64x64_neon)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_sub_ps_64:
+.Loop_sub_ps_64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x4
@@ -263,7 +263,7 @@ function PFX(pixel_sub_ps_64x64_neon)
     st1             {v16.8h-v19.8h}, [x0], #64
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_64
+    cbnz            w12, .Loop_sub_ps_64
     ret
 endfunc
 
@@ -318,7 +318,7 @@ endfunc
 function PFX(pixel_sub_ps_32x64_neon)
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_sub_ps_32x64:
+.Loop_sub_ps_32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -336,7 +336,7 @@ function PFX(pixel_sub_ps_32x64_neon)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32x64
+    cbnz            w12, .Loop_sub_ps_32x64
     ret
 endfunc
 
@@ -383,7 +383,7 @@ endfunc
 function PFX(pixel_add_ps_16x\h\()_neon)
     lsl             x5, x5, #1
     mov             w12, #\h / 8
-.loop_add_ps_16x\h\():
+.Loop_add_ps_16x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b}, [x2], x4
@@ -405,7 +405,7 @@ function PFX(pixel_add_ps_16x\h\()_neon)
     st1             {v4.16b}, [x0], x1
     st1             {v5.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps_16x\h
+    cbnz            w12, .Loop_add_ps_16x\h
     ret
 endfunc
 .endm
@@ -417,7 +417,7 @@ pixel_add_ps_16xN_neon 32
  function PFX(pixel_add_ps_32x\h\()_neon)
     lsl             x5, x5, #1
     mov             w12, #\h / 4
-.loop_add_ps_32x\h\():
+.Loop_add_ps_32x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -436,7 +436,7 @@ pixel_add_ps_16xN_neon 32
     sqxtun2         v5.16b, v27.8h
     st1             {v4.16b-v5.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps_32x\h
+    cbnz            w12, .Loop_add_ps_32x\h
     ret
 endfunc
 .endm
@@ -448,7 +448,7 @@ function PFX(pixel_add_ps_64x64_neon)
     lsl             x5, x5, #1
     sub             x5, x5, #64
     mov             w12, #32
-.loop_add_ps_64x64:
+.Loop_add_ps_64x64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v0.16b-v3.16b}, [x2], x4
@@ -480,7 +480,7 @@ function PFX(pixel_add_ps_64x64_neon)
     sqxtun2         v3.16b, v7.8h
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps_64x64
+    cbnz            w12, .Loop_add_ps_64x64
     ret
 endfunc
 
@@ -548,7 +548,7 @@ endfunc
 // void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
 function PFX(scale2D_64to32_neon)
     mov             w12, #32
-.loop_scale2D:
+.Loop_scale2D:
     ld1             {v0.16b-v3.16b}, [x1], x2
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x1], x2
@@ -561,7 +561,7 @@ function PFX(scale2D_64to32_neon)
     uqrshrn         v1.8b, v2.8h, #2
     uqrshrn2        v1.16b, v3.8h, #2
     st1             {v0.16b-v1.16b}, [x0], #32
-    cbnz            w12, .loop_scale2D
+    cbnz            w12, .Loop_scale2D
     ret
 endfunc
 
@@ -569,33 +569,33 @@ endfunc
 function PFX(pixel_planecopy_cp_neon)
     dup             v2.16b, w6
     sub             x5, x5, #1
-.loop_h:
+.Loop_h:
     mov             x6, x0
     mov             x12, x2
     mov             x7, #0
-.loop_w:
+.Loop_w:
     ldr             q0, [x6], #16
     ushl            v0.16b, v0.16b, v2.16b
     str             q0, [x12], #16
     add             x7, x7, #16
     cmp             x7, x4
-    blt             .loop_w
+    blt             .Loop_w
 
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_h
+    cbnz            x5, .Loop_h
 
 // handle last row
     mov             x5, x4
     lsr             x5, x5, #3
-.loopW8:
+.LoopW8:
     ldr             d0, [x0], #8
     ushl            v0.8b, v0.8b, v2.8b
     str             d0, [x2], #8
     sub             x4, x4, #8
     sub             x5, x5, #1
-    cbnz            x5, .loopW8
+    cbnz            x5, .LoopW8
 
     mov             x5, #8
     sub             x5, x5, x4
@@ -1508,7 +1508,7 @@ function PFX(pixel_sa8d_32x64_neon)
     mov             x10, x30
     mov             w11, #4
     mov             w9, #0
-.loop_sa8d_32:
+.Loop_sa8d_32:
     sub             w11, w11, #1
     sa8d_16x16      w4
     sub             x0, x0, x1, lsl #4
@@ -1520,7 +1520,7 @@ function PFX(pixel_sa8d_32x64_neon)
     add             w9, w9, w4
     sub             x0, x0, #24
     sub             x2, x2, #24
-    cbnz            w11, .loop_sa8d_32
+    cbnz            w11, .Loop_sa8d_32
     mov             w0, w9
     ret             x10
 endfunc
@@ -1529,7 +1529,7 @@ function PFX(pixel_sa8d_64x64_neon)
     mov             x10, x30
     mov             w11, #4
     mov             w9, #0
-.loop_sa8d_64:
+.Loop_sa8d_64:
     sub             w11, w11, #1
     sa8d_16x16      w4
     sub             x0, x0, x1, lsl #4
@@ -1554,7 +1554,7 @@ function PFX(pixel_sa8d_64x64_neon)
 
     sub             x0, x0, #56
     sub             x2, x2, #56
-    cbnz            w11, .loop_sa8d_64
+    cbnz            w11, .Loop_sa8d_64
     mov             w0, w9
     ret             x10
 endfunc
@@ -1807,7 +1807,7 @@ function PFX(quant_neon)
     eor             w10, w10, w10
     eor             v17.16b, v17.16b, v17.16b
 
-.loop_quant:
+.Loop_quant:
 
     ld1             {v18.4h}, [x0], #8
     ld1             {v7.4s}, [x1], #16
@@ -1839,7 +1839,7 @@ function PFX(quant_neon)
     st1             {v5.4h}, [x3], #8
 
     subs            w6, w6, #1
-    b.ne             .loop_quant
+    b.ne             .Loop_quant
 
     addv            s4, v4.4s
     mov             w9, v4.s[0]
@@ -1858,7 +1858,7 @@ function PFX(nquant_neon)
     mov             x4, #0
     movi            v22.4s, #0
 
-.loop_nquant:
+.Loop_nquant:
     ld1             {v16.4h}, [x0], #8
     sub             w5, w5, #1
     sxtl            v19.4s, v16.4h         // v19 = coef[blockpos]
@@ -1883,7 +1883,7 @@ function PFX(nquant_neon)
     abs             v17.4h, v16.4h
     st1             {v17.4h}, [x2], #8
 
-    cbnz            w5, .loop_nquant
+    cbnz            w5, .Loop_nquant
 
     uaddlv          d4, v4.4s
     fmov            x12, d4
@@ -1937,7 +1937,7 @@ endfunc
 function PFX(ssimDist16_neon)
     mov w12, #16
     ssimDist_start
-.loop_ssimDist16:
+.Loop_ssimDist16:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     ld1             {v5.16b}, [x2], x3
@@ -1947,7 +1947,7 @@ function PFX(ssimDist16_neon)
     uxtl2           v5.8h, v5.16b
     ssimDist_1      v6, v7
     ssimDist_1      v4, v5
-    cbnz            w12, .loop_ssimDist16
+    cbnz            w12, .Loop_ssimDist16
     ssimDist_end
     ret
 endfunc
@@ -1955,7 +1955,7 @@ endfunc
 function PFX(ssimDist32_neon)
     mov w12, #32
     ssimDist_start
-.loop_ssimDist32:
+.Loop_ssimDist32:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     ld1             {v6.16b-v7.16b}, [x2], x3
@@ -1971,7 +1971,7 @@ function PFX(ssimDist32_neon)
     ssimDist_1      v23, v24
     ssimDist_1      v25, v26
     ssimDist_1      v27, v28
-    cbnz            w12, .loop_ssimDist32
+    cbnz            w12, .Loop_ssimDist32
     ssimDist_end
     ret
 endfunc
@@ -1979,7 +1979,7 @@ endfunc
 function PFX(ssimDist64_neon)
     mov w12, #64
     ssimDist_start
-.loop_ssimDist64:
+.Loop_ssimDist64:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     ld1             {v16.16b-v19.16b}, [x2], x3
@@ -2007,7 +2007,7 @@ function PFX(ssimDist64_neon)
     ssimDist_1      v23, v24
     ssimDist_1      v25, v26
     ssimDist_1      v27, v28
-    cbnz            w12, .loop_ssimDist64
+    cbnz            w12, .Loop_ssimDist64
     ssimDist_end
     ret
 endfunc
@@ -2035,14 +2035,14 @@ endfunc
 function PFX(normFact16_neon)
     mov w12, #16
     normFact_start
-.loop_normFact16:
+.Loop_normFact16:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     uxtl            v5.8h, v4.8b
     uxtl2           v4.8h, v4.16b
     normFact_1      v5
     normFact_1      v4
-    cbnz            w12, .loop_normFact16
+    cbnz            w12, .Loop_normFact16
     normFact_end
     ret
 endfunc
@@ -2050,7 +2050,7 @@ endfunc
 function PFX(normFact32_neon)
     mov w12, #32
     normFact_start
-.loop_normFact32:
+.Loop_normFact32:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     uxtl            v6.8h, v4.8b
@@ -2061,7 +2061,7 @@ function PFX(normFact32_neon)
     normFact_1      v5
     normFact_1      v6
     normFact_1      v7
-    cbnz            w12, .loop_normFact32
+    cbnz            w12, .Loop_normFact32
     normFact_end
     ret
 endfunc
@@ -2069,7 +2069,7 @@ endfunc
 function PFX(normFact64_neon)
     mov w12, #64
     normFact_start
-.loop_normFact64:
+.Loop_normFact64:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     uxtl            v26.8h, v4.8b
@@ -2088,7 +2088,7 @@ function PFX(normFact64_neon)
     normFact_1      v25
     normFact_1      v26
     normFact_1      v27
-    cbnz            w12, .loop_normFact64
+    cbnz            w12, .Loop_normFact64
     normFact_end
     ret
 endfunc
@@ -2120,9 +2120,9 @@ function PFX(weight_pp_neon)
     cbnz            w11, .widenTo32Bit
 
     // 16-bit arithmetic is enough.
-.loopHpp:
+.LoopHpp:
     mov             x12, x3
-.loopWpp:
+.LoopWpp:
     ldr             q0, [x0], #16
     sub             x12, x12, #16
     umull           v1.8h, v0.8b, v25.8b  // val *= w0 << correction >> shift
@@ -2132,18 +2132,18 @@ function PFX(weight_pp_neon)
     sqxtun          v0.8b, v1.8h          // val = x265_clip(val)
     sqxtun2         v0.16b, v2.8h
     str             q0, [x1], #16
-    cbnz            x12, .loopWpp
+    cbnz            x12, .LoopWpp
     add             x1, x1, x2
     add             x0, x0, x2
     sub             x4, x4, #1
-    cbnz            x4, .loopHpp
+    cbnz            x4, .LoopHpp
     ret
 
     // 32-bit arithmetic is needed.
 .widenTo32Bit:
-.loopHpp32:
+.LoopHpp32:
     mov             x12, x3
-.loopWpp32:
+.LoopWpp32:
     ldr             d0, [x0], #8
     sub             x12, x12, #8
     uxtl            v0.8h, v0.8b
@@ -2155,11 +2155,11 @@ function PFX(weight_pp_neon)
     sqxtn2          v0.8h, v2.4s
     sqxtun          v0.8b, v0.8h
     str             d0, [x1], #8
-    cbnz            x12, .loopWpp32
+    cbnz            x12, .LoopWpp32
     add             x1, x1, x2
     add             x0, x0, x2
     sub             x4, x4, #1
-    cbnz            x4, .loopHpp32
+    cbnz            x4, .LoopHpp32
     ret
 
     // The shift right cannot be moved out of the loop.
@@ -2169,9 +2169,9 @@ function PFX(weight_pp_neon)
     neg             w7, w7                // -shift
     dup             v27.4s, w7
     dup             v29.4s, w9            // offset
-.loopHppUS:
+.LoopHppUS:
     mov             x12, x3
-.loopWppUS:
+.LoopWppUS:
     ldr             d0, [x0], #8
     sub             x12, x12, #8
     uxtl            v0.8h, v0.8b
@@ -2187,11 +2187,11 @@ function PFX(weight_pp_neon)
     sqxtn2          v0.8h, v2.4s
     sqxtun          v0.8b, v0.8h
     str             d0, [x1], #8
-    cbnz            x12, .loopWppUS
+    cbnz            x12, .LoopWppUS
     add             x1, x1, x2
     add             x0, x0, x2
     sub             x4, x4, #1
-    cbnz            x4, .loopHppUS
+    cbnz            x4, .LoopHppUS
     ret
 endfunc
 
@@ -2220,7 +2220,7 @@ function PFX(scanPosLast_neon)
     add             x11, x10, x7    // 3*x7
     add             x9, x4, #1      // CG count
 
-.loop_spl:
+.Loop_spl:
     // position of current CG
     ldrh            w6, [x0], #32
     add             x6, x1, x6, lsl #1
@@ -2267,14 +2267,14 @@ function PFX(scanPosLast_neon)
     // accelerate by preparing w13 = w13 & w15
     and             w13, w13, w15
     mov             x14, xzr
-.loop_spl_1:
+.Loop_spl_1:
     cbz             w15, .pext_end
     clz             w6, w15
     lsl             w13, w13, w6
     lsl             w15, w15, w6
     extr            w14, w14, w13, #31
     bfm             w15, wzr, #1, #0
-    b               .loop_spl_1
+    b               .Loop_spl_1
 .pext_end:
     strh            w14, [x2], #2
 
@@ -2285,7 +2285,7 @@ function PFX(scanPosLast_neon)
     sub             x5, x5, x6
     strb            w6, [x4], #1
 
-    cbnz            x5, .loop_spl
+    cbnz            x5, .Loop_spl
 
     // count trailing zeros
     rbit            w13, w12
@@ -2364,7 +2364,7 @@ function PFX(costCoeffNxN_neon)
     mov             x11, #0
     movi            v31.16b, #0
     cbz             x2, .idx_zero
-.loop_ccnn:
+.Loop_ccnn:
 //   {
 //        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
 //        ctxSig = cnt & posZeroMask;
@@ -2403,7 +2403,7 @@ function PFX(costCoeffNxN_neon)
     cmp             w9, #1
     csel            w10, w11, w10, eq
     strb            w10, [x6, x14]
-    cbnz            x2, .loop_ccnn
+    cbnz            x2, .Loop_ccnn
 .idx_zero:
 
     add             x13, x3, x4, lsl #1
diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
index 9c86d84b6..599a3719a 100644
--- a/source/common/aarch64/sad-a-sve2.S
+++ b/source/common/aarch64/sad-a-sve2.S
@@ -217,12 +217,12 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
     SAD_START_\w
 
     mov             w9, #\h/8
-.loop_sve2_\w\()x\h:
+.Loop_sve2_\w\()x\h:
     sub             w9, w9, #1
 .rept 4
     SAD_\w
 .endr
-    cbnz            w9, .loop_sve2_\w\()x\h
+    cbnz            w9, .Loop_sve2_\w\()x\h
 
     SAD_END_\w
 
@@ -231,12 +231,12 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
     SAD_START_\w
 
     mov             w9, #\h/8
-.loop_sve2_loop_\w\()x\h:
+.Loop_sve2_loop_\w\()x\h:
     sub             w9, w9, #1
 .rept 4
     SAD_\w
 .endr
-    cbnz            w9, .loop_sve2_loop_\w\()x\h
+    cbnz            w9, .Loop_sve2_loop_\w\()x\h
 
     SAD_END_\w
 .else
@@ -402,7 +402,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
     bgt             .vl_gt_16_sad_x_loop_\x\()_\w\()x\h
     SAD_X_START_\w \x
     mov             w12, #\h/4
-.loop_sad_sve2_x\x\()_\w\()x\h:
+.Loop_sad_sve2_x\x\()_\w\()x\h:
     sub             w12, w12, #1
  .rept 4
   .if \w == 24
@@ -422,7 +422,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
     SAD_X_\w x4, v19, v23
   .endif
  .endr
-    cbnz            w12, .loop_sad_sve2_x\x\()_\w\()x\h
+    cbnz            w12, .Loop_sad_sve2_x\x\()_\w\()x\h
     SAD_X_END_\w \x
 .vl_gt_16_sad_x_loop_\x\()_\w\()x\h\():
 .if \w == 24 || \w == 32
@@ -431,7 +431,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
 .else
     SAD_X_START_\w \x
     mov             w12, #\h/4
-.loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
+.Loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
     sub             w12, w12, #1
  .rept 4
   .if \w == 24
@@ -451,7 +451,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
     SAD_X_\w x4, v19, v23
   .endif
  .endr
-    cbnz            w12, .loop_sad_sve2_gt_16_x\x\()_\w\()x\h
+    cbnz            w12, .Loop_sad_sve2_gt_16_x\x\()_\w\()x\h
     SAD_X_END_\w \x
 .endif
 endfunc
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 20d7cac7c..7460825f1 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -55,12 +55,12 @@ function PFX(pixel_sad_\w\()x\h\()_neon)
     SAD_START_\w
 
     mov             w9, #\h/8
-.loop_\w\()x\h:
+.Loop_\w\()x\h:
     sub             w9, w9, #1
 .rept 4
     SAD_\w
 .endr
-    cbnz            w9, .loop_\w\()x\h
+    cbnz            w9, .Loop_\w\()x\h
 
     SAD_END_\w
 endfunc
@@ -129,7 +129,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_neon)
 .endif
     SAD_X_START_\w \x
     mov             w12, #\h/4
-.loop_sad_x\x\()_\w\()x\h:
+.Loop_sad_x\x\()_\w\()x\h:
     sub             w12, w12, #1
  .rept 4
   .if \w == 24
@@ -149,7 +149,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_neon)
     SAD_X_\w x4, v19, v23
   .endif
  .endr
-    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
+    cbnz            w12, .Loop_sad_x\x\()_\w\()x\h
     SAD_X_END_\w \x
 endfunc
 .endm
diff --git a/source/common/aarch64/ssd-a-sve2.S b/source/common/aarch64/ssd-a-sve2.S
index de2603850..8077bd93c 100644
--- a/source/common/aarch64/ssd-a-sve2.S
+++ b/source/common/aarch64/ssd-a-sve2.S
@@ -43,7 +43,7 @@ function PFX(pixel_sse_pp_32x32_sve2)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_pp_32_sve2:
+.Loop_sse_pp_32_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
@@ -61,7 +61,7 @@ function PFX(pixel_sse_pp_32x32_sve2)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_32_sve2
+    cbnz            w12, .Loop_sse_pp_32_sve2
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_sse_pp_32x32:
@@ -182,7 +182,7 @@ function PFX(pixel_sse_pp_64x64_sve2)
     movi            v0.16b, #0
     movi            v1.16b, #0
 
-.loop_sse_pp_64_sve2:
+.Loop_sse_pp_64_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x0], x1
@@ -214,7 +214,7 @@ function PFX(pixel_sse_pp_64x64_sve2)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_64_sve2
+    cbnz            w12, .Loop_sse_pp_64_sve2
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_sse_pp_64x64:
@@ -788,7 +788,7 @@ function PFX(pixel_ssd_s_16x16_sve2)
     mov             w12, #4
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_16_sve2:
+.Loop_ssd_s_16_sve2:
     sub             w12, w12, #1
 .rept 2
     ld1             {v4.16b,v5.16b}, [x0], x1
@@ -802,7 +802,7 @@ function PFX(pixel_ssd_s_16x16_sve2)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_16_sve2
+    cbnz            w12, .Loop_ssd_s_16_sve2
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_ssd_s_16x16:
@@ -830,7 +830,7 @@ function PFX(pixel_ssd_s_32x32_sve2)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_32:
+.Loop_ssd_s_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v4.16b-v7.16b}, [x0], x1
@@ -843,7 +843,7 @@ function PFX(pixel_ssd_s_32x32_sve2)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_32
+    cbnz            w12, .Loop_ssd_s_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_ssd_s_32x32:
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
index 7c778b4fe..f4b79304a 100644
--- a/source/common/aarch64/ssd-a.S
+++ b/source/common/aarch64/ssd-a.S
@@ -157,7 +157,7 @@ function PFX(pixel_sse_pp_32x32_neon)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_pp_32:
+.Loop_sse_pp_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
@@ -175,7 +175,7 @@ function PFX(pixel_sse_pp_32x32_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_32
+    cbnz            w12, .Loop_sse_pp_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -184,7 +184,7 @@ function PFX(pixel_sse_pp_32x64_neon)
     mov             w12, #16
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_pp_32x64:
+.Loop_sse_pp_32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
@@ -202,7 +202,7 @@ function PFX(pixel_sse_pp_32x64_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_32x64
+    cbnz            w12, .Loop_sse_pp_32x64
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -212,7 +212,7 @@ function PFX(pixel_sse_pp_64x64_neon)
     movi            v0.16b, #0
     movi            v1.16b, #0
 
-.loop_sse_pp_64:
+.Loop_sse_pp_64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x0], x1
@@ -244,7 +244,7 @@ function PFX(pixel_sse_pp_64x64_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_64
+    cbnz            w12, .Loop_sse_pp_64
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -301,7 +301,7 @@ function PFX(pixel_sse_ss_16x16_neon)
     mov             w12, #4
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_ss_16:
+.Loop_sse_ss_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b, v17.16b}, [x0], x1
@@ -313,7 +313,7 @@ function PFX(pixel_sse_ss_16x16_neon)
     smlal           v0.4s, v3.4h, v3.4h
     smlal2          v1.4s, v3.8h, v3.8h
 .endr
-    cbnz            w12, .loop_sse_ss_16
+    cbnz            w12, .Loop_sse_ss_16
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -325,7 +325,7 @@ function PFX(pixel_sse_ss_32x32_neon)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_ss_32:
+.Loop_sse_ss_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x0], x1
@@ -343,7 +343,7 @@ function PFX(pixel_sse_ss_32x32_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_ss_32
+    cbnz            w12, .Loop_sse_ss_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -357,7 +357,7 @@ function PFX(pixel_sse_ss_64x64_neon)
     mov             w12, #32
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_ss_64:
+.Loop_sse_ss_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v16.16b-v19.16b}, [x0], #64
@@ -389,7 +389,7 @@ function PFX(pixel_sse_ss_64x64_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_ss_64
+    cbnz            w12, .Loop_sse_ss_64
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -433,7 +433,7 @@ function PFX(pixel_ssd_s_16x16_neon)
     mov             w12, #4
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_16:
+.Loop_ssd_s_16:
     sub             w12, w12, #1
 .rept 2
     ld1             {v4.16b,v5.16b}, [x0], x1
@@ -447,7 +447,7 @@ function PFX(pixel_ssd_s_16x16_neon)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_16
+    cbnz            w12, .Loop_ssd_s_16
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -457,7 +457,7 @@ function PFX(pixel_ssd_s_32x32_neon)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_32:
+.Loop_ssd_s_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v4.16b-v7.16b}, [x0], x1
@@ -470,7 +470,7 @@ function PFX(pixel_ssd_s_32x32_neon)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_32
+    cbnz            w12, .Loop_ssd_s_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
diff --git a/source/common/arm/blockcopy8.S b/source/common/arm/blockcopy8.S
index 1c868f464..8170160aa 100644
--- a/source/common/arm/blockcopy8.S
+++ b/source/common/arm/blockcopy8.S
@@ -795,7 +795,7 @@ function x265_count_nonzero_32_neon
     vmov            q2, q12
     vmov            q3, q14
 
-.loop:    
+.Loop:
     vldm            r0!, {q8-q15}
     subs            r1, #1
 
@@ -817,7 +817,7 @@ function x265_count_nonzero_32_neon
     vadd.s8         q1, q10
     vadd.s8         q2, q12
     vadd.s8         q3, q14
-    bgt            .loop
+    bgt            .Loop
 
     // sum
     vadd.s8         q0, q1
diff --git a/source/common/arm/dct-a.S b/source/common/arm/dct-a.S
index 42b193bf8..5be8847e9 100644
--- a/source/common/arm/dct-a.S
+++ b/source/common/arm/dct-a.S
@@ -422,7 +422,7 @@ function x265_dct_16x16_neon
     mov lr, #4*16*2
 
     // DCT-1D
-.loop1:
+.Loop1:
     // Row[0-3]
     vld1.16 {q8-q9}, [r0, :64], r2      // q8  = [07 06 05 04 03 02 01 00], q9  = [0F 0E 0D 0C 0B 0A 09 08]
     vld1.16 {q10-q11}, [r0, :64], r2    // q10 = [17 16 15 14 13 12 11 10], q11 = [1F 1E 1D 1C 1B 1A 19 18]
@@ -628,7 +628,7 @@ function x265_dct_16x16_neon
     // loop into next process group
     sub r3, #3*4*16*2
     subs r12, #1
-    bgt .loop1
+    bgt .Loop1
 
 
     // DCT-2D
@@ -637,7 +637,7 @@ function x265_dct_16x16_neon
     mov r3, #16*2*2
     mov r12, #16/4                      // Process 4 rows every loop
 
-.loop2:
+.Loop2:
     vldm r2, {q8-q15}
 
     // d16 = [30 20 10 00]
@@ -887,7 +887,7 @@ function x265_dct_16x16_neon
 
     sub r1, #(17*16-4)*2
     subs r12, #1
-    bgt .loop2
+    bgt .Loop2
 
     add sp, #16*16*2
     vpop {q4-q7}
diff --git a/source/common/arm/ipfilter8.S b/source/common/arm/ipfilter8.S
index 8b7f5b3ca..b1ec6cc8b 100644
--- a/source/common/arm/ipfilter8.S
+++ b/source/common/arm/ipfilter8.S
@@ -372,7 +372,7 @@ function x265_filterPixelToShort_32x16_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #8
-.loop_filterP2S_32x16:
+.Loop_filterP2S_32x16:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -391,7 +391,7 @@ function x265_filterPixelToShort_32x16_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x16
+    bgt         .Loop_filterP2S_32x16
     bx          lr
 endfunc
 
@@ -402,7 +402,7 @@ function x265_filterPixelToShort_32x24_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #12
-.loop_filterP2S_32x24:
+.Loop_filterP2S_32x24:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -421,7 +421,7 @@ function x265_filterPixelToShort_32x24_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x24
+    bgt         .Loop_filterP2S_32x24
     bx          lr
 endfunc
 
@@ -432,7 +432,7 @@ function x265_filterPixelToShort_32x32_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #16
-.loop_filterP2S_32x32:
+.Loop_filterP2S_32x32:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -451,7 +451,7 @@ function x265_filterPixelToShort_32x32_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x32
+    bgt         .Loop_filterP2S_32x32
     bx          lr
 endfunc
 
@@ -462,7 +462,7 @@ function x265_filterPixelToShort_32x64_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #32
-.loop_filterP2S_32x64:
+.Loop_filterP2S_32x64:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -481,7 +481,7 @@ function x265_filterPixelToShort_32x64_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x64
+    bgt         .Loop_filterP2S_32x64
     bx          lr
 endfunc
 
@@ -493,7 +493,7 @@ function x265_filterPixelToShort_64x16_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #8
-.loop_filterP2S_64x16:
+.Loop_filterP2S_64x16:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -528,7 +528,7 @@ function x265_filterPixelToShort_64x16_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x16
+    bgt         .Loop_filterP2S_64x16
     bx          lr
 endfunc
 
@@ -540,7 +540,7 @@ function x265_filterPixelToShort_64x32_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #16
-.loop_filterP2S_64x32:
+.Loop_filterP2S_64x32:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -575,7 +575,7 @@ function x265_filterPixelToShort_64x32_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x32
+    bgt         .Loop_filterP2S_64x32
     bx          lr
 endfunc
 
@@ -587,7 +587,7 @@ function x265_filterPixelToShort_64x48_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #24
-.loop_filterP2S_64x48:
+.Loop_filterP2S_64x48:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -622,7 +622,7 @@ function x265_filterPixelToShort_64x48_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x48
+    bgt         .Loop_filterP2S_64x48
     bx          lr
 endfunc
 
@@ -634,7 +634,7 @@ function x265_filterPixelToShort_64x64_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #32
-.loop_filterP2S_64x64:
+.Loop_filterP2S_64x64:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -669,7 +669,7 @@ function x265_filterPixelToShort_64x64_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x64
+    bgt         .Loop_filterP2S_64x64
     bx          lr
 endfunc
 
@@ -681,7 +681,7 @@ function x265_filterPixelToShort_48x64_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #32
-.loop_filterP2S_48x64:
+.Loop_filterP2S_48x64:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -709,7 +709,7 @@ function x265_filterPixelToShort_48x64_neon
     vmla.s16    q3, q9, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_48x64
+    bgt         .Loop_filterP2S_48x64
     bx          lr
 endfunc
 
@@ -756,7 +756,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     vmovl.u8    q2, d4
     vmovl.u8    q3, d6
 
-.loop_4x\h:
+.Loop_4x\h:
     // TODO: read extra 1 row for speed optimize, may made crash on OS X platform!
     vld1.u32    {d16[0]}, [r0], r1
     vld1.u32    {d16[1]}, [r0], r1
@@ -795,7 +795,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     vst1.u32    {d18[1]}, [r2], r3
 
     subs        r12, #2
-    bne        .loop_4x4
+    bne        .Loop_4x4
 
     pop         {pc}
     .ltorg
@@ -945,13 +945,13 @@ LUMA_VPP_4xN 16
 
 .macro FILTER_VPP a b filterv
 
-.loop_\filterv\()_\a\()x\b:
+.Loop_\filterv\()_\a\()x\b:
 
     mov             r7, r2
     mov             r6, r0
     eor             r8, r8
 
-.loop_w8_\filterv\()_\a\()x\b:
+.Loop_w8_\filterv\()_\a\()x\b:
 
     add             r6, r0, r8
 
@@ -988,12 +988,12 @@ LUMA_VPP_4xN 16
 
     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_w8_\filterv\()_\a\()x\b
+    blt             .Loop_w8_\filterv\()_\a\()x\b
 
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_\filterv\()_\a\()x\b 
+    bne             .Loop_\filterv\()_\a\()x\b
 
 .endm 
 
@@ -1063,7 +1063,7 @@ function x265_interp_8tap_vert_pp_12x16_neon
     sub             r0, r4
 
     mov             r4, #16
-.loop_vpp_12x16:
+.Loop_vpp_12x16:
 
     mov             r6, r0
     mov             r7, r2
@@ -1173,7 +1173,7 @@ function x265_interp_8tap_vert_pp_12x16_neon
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vpp_12x16
+    bne             .Loop_vpp_12x16
 
     pop             {r4, r5, r6, r7}
     bx              lr
@@ -1194,7 +1194,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon
     add             r12, #2048
     vdup.32         q8, r12
     mov             r4, #\h
-.loop_vsp_4x\h:
+.Loop_vsp_4x\h:
     movrel          r12, g_lumaFilter
     add             r12, r5
     mov             r6, r0
@@ -1266,7 +1266,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon
 
     add             r0, r1
     subs            r4, #1
-    bne             .loop_vsp_4x\h
+    bne             .Loop_vsp_4x\h
     pop             {r4, r5, r6}
     bx              lr
     .ltorg
@@ -1369,13 +1369,13 @@ LUMA_VSP_4xN 16
 .macro FILTER_VSP a b filterv
 
     vpush           { q4 - q7}
-.loop_\filterv\()_\a\()x\b:
+.Loop_\filterv\()_\a\()x\b:
 
     mov             r7, r2
     mov             r6, r0
     eor             r8, r8
 
-.loop_w8_\filterv\()_\a\()x\b:
+.Loop_w8_\filterv\()_\a\()x\b:
 
     add             r6, r0, r8
 
@@ -1417,12 +1417,12 @@ LUMA_VSP_4xN 16
     mov             r12, #\a
     lsl             r12, #1
     cmp             r8, r12
-    blt             .loop_w8_\filterv\()_\a\()x\b
+    blt             .Loop_w8_\filterv\()_\a\()x\b
 
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_\filterv\()_\a\()x\b
+    bne             .Loop_\filterv\()_\a\()x\b
 
     vpop            { q4 - q7}
 
@@ -1498,7 +1498,7 @@ function x265_interp_8tap_vert_sp_12x16_neon
 
     mov             r4, #16
     vpush           { q4 - q7}
-.loop1_12x16:
+.Loop1_12x16:
 
     mov             r6, r0
     mov             r7, r2
@@ -1612,7 +1612,7 @@ function x265_interp_8tap_vert_sp_12x16_neon
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop1_12x16
+    bne             .Loop1_12x16
     vpop            { q4 - q7}
     pop             {r4, r5, r6, r7}
     bx              lr
@@ -1632,7 +1632,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon
     vdup.32         q8, r4
     mov             r4, #\h
 
-.loop_vps_4x\h:
+.Loop_vps_4x\h:
     movrel          r12, g_lumaFilter
     add             r12, r5
     mov             r6, r0
@@ -1702,7 +1702,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon
 
     add             r0, r1
     subs            r4, #1
-    bne             .loop_vps_4x\h
+    bne             .Loop_vps_4x\h
 
     pop             {r4, r5, r6}
     bx              lr
@@ -1717,13 +1717,13 @@ LUMA_VPS_4xN 16
 
 .macro FILTER_VPS a b filterv
 
-.loop_ps_\filterv\()_\a\()x\b:
+.Loop_ps_\filterv\()_\a\()x\b:
 
     mov             r7, r2
     mov             r6, r0
     eor             r8, r8
 
-.loop_ps_w8_\filterv\()_\a\()x\b:
+.Loop_ps_w8_\filterv\()_\a\()x\b:
 
     add             r6, r0, r8
 
@@ -1759,12 +1759,12 @@ LUMA_VPS_4xN 16
 
     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_ps_w8_\filterv\()_\a\()x\b
+    blt             .Loop_ps_w8_\filterv\()_\a\()x\b
 
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_ps_\filterv\()_\a\()x\b 
+    bne             .Loop_ps_\filterv\()_\a\()x\b
 
 .endm 
 
@@ -1836,7 +1836,7 @@ function x265_interp_8tap_vert_ps_12x16_neon
     sub             r0, r4
 
     mov             r4, #16
-.loop_vps_12x16:
+.Loop_vps_12x16:
 
     mov             r6, r0
     mov             r7, r2
@@ -1942,7 +1942,7 @@ function x265_interp_8tap_vert_ps_12x16_neon
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vps_12x16
+    bne             .Loop_vps_12x16
 
     pop             {r4, r5, r6, r7}
     bx              lr
@@ -2081,13 +2081,13 @@ endfunc
 
     vpush           {q4-q7}
 
-.loop_\filterv\()_\a\()x\b:
+.Loop_\filterv\()_\a\()x\b:
 
     mov             r7, r2
     mov             r6, r0
     eor             r8, r8
 
-.loop_w8_\filterv\()_\a\()x\b:
+.Loop_w8_\filterv\()_\a\()x\b:
 
     add             r6, r0, r8
 
@@ -2121,12 +2121,12 @@ endfunc
 
     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_w8_\filterv\()_\a\()x\b
+    blt             .Loop_w8_\filterv\()_\a\()x\b
 
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_\filterv\()_\a\()x\b 
+    bne             .Loop_\filterv\()_\a\()x\b
     vpop            {q4-q7}
 .endm 
 
@@ -2217,13 +2217,13 @@ CHROMA_VPP 48 64
 
     vpush           {q4-q7}
 
-.loop_vps_\filterv\()_\a\()x\b:
+.Loop_vps_\filterv\()_\a\()x\b:
 
     mov             r7, r2
     mov             r6, r0
     eor             r8, r8
 
-.loop_vps_w8_\filterv\()_\a\()x\b:
+.Loop_vps_w8_\filterv\()_\a\()x\b:
 
     add             r6, r0, r8
 
@@ -2256,12 +2256,12 @@ CHROMA_VPP 48 64
 
     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_vps_w8_\filterv\()_\a\()x\b
+    blt             .Loop_vps_w8_\filterv\()_\a\()x\b
 
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vps_\filterv\()_\a\()x\b 
+    bne             .Loop_vps_\filterv\()_\a\()x\b
     vpop            {q4-q7}
 .endm 
 
@@ -2353,13 +2353,13 @@ CHROMA_VPS 48 64
 
     vpush           {q4-q7}
 
-.loop_vsp_\filterv\()_\a\()x\b:
+.Loop_vsp_\filterv\()_\a\()x\b:
 
     mov             r7, r2
     mov             r6, r0
     eor             r8, r8
 
-.loop_vsp_w8_\filterv\()_\a\()x\b:
+.Loop_vsp_w8_\filterv\()_\a\()x\b:
 
     add             r6, r0, r8
 
@@ -2392,12 +2392,12 @@ CHROMA_VPS 48 64
     mov             r12, #\a
     lsl             r12, #1
     cmp             r8, r12
-    blt             .loop_vsp_w8_\filterv\()_\a\()x\b
+    blt             .Loop_vsp_w8_\filterv\()_\a\()x\b
 
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vsp_\filterv\()_\a\()x\b 
+    bne             .Loop_vsp_\filterv\()_\a\()x\b
     vpop            {q4-q7}
 .endm 
 
diff --git a/source/common/arm/mc-a.S b/source/common/arm/mc-a.S
index b10e9e816..839d192cd 100644
--- a/source/common/arm/mc-a.S
+++ b/source/common/arm/mc-a.S
@@ -554,7 +554,7 @@ function x265_cpy2Dto1D_shr_16x16_neon
     vsri.s16        q1, #1
     vneg.s16        q0, q0
     mov             r3, #4
-.loop_cpy2Dto1D_shr_16:
+.Loop_cpy2Dto1D_shr_16:
     subs            r3, #1
 .rept 4
     vld1.s16        {q2-q3}, [r1], r2
@@ -564,7 +564,7 @@ function x265_cpy2Dto1D_shr_16x16_neon
     vshl.s16        q3, q0
     vst1.16         {q2-q3}, [r0]!
 .endr
-    bgt             .loop_cpy2Dto1D_shr_16
+    bgt             .Loop_cpy2Dto1D_shr_16
     bx              lr
 endfunc
 
@@ -577,7 +577,7 @@ function x265_cpy2Dto1D_shr_32x32_neon
     vsri.s16        q1, #1
     vneg.s16        q0, q0
     mov             r3, 16
-.loop_cpy2Dto1D_shr_32:
+.Loop_cpy2Dto1D_shr_32:
     subs            r3, #1
 .rept 2
     vld1.s16        {q2-q3}, [r1]!
@@ -593,7 +593,7 @@ function x265_cpy2Dto1D_shr_32x32_neon
     vst1.16         {q2-q3}, [r0]!
     vst1.16         {q8-q9}, [r0]!
 .endr
-    bgt             .loop_cpy2Dto1D_shr_32
+    bgt             .Loop_cpy2Dto1D_shr_32
     bx              lr
 endfunc
 
diff --git a/source/common/arm/pixel-util.S b/source/common/arm/pixel-util.S
index c26b17acc..67719c8e5 100644
--- a/source/common/arm/pixel-util.S
+++ b/source/common/arm/pixel-util.S
@@ -848,36 +848,36 @@ function x265_pixel_planecopy_cp_neon
     vdup.8          q2, r12
     sub             r5, #1
 
-.loop_h:
+.Loop_h:
     mov             r6, r0
     mov             r12, r2
     eor             r7, r7
-.loop_w:
+.Loop_w:
     vld1.u8         {q0}, [r6]!
     vshl.u8         q0, q0, q2
     vst1.u8         {q0}, [r12]!
 
     add             r7, #16
     cmp             r7, r4
-    blt             .loop_w
+    blt             .Loop_w
 
     add             r0, r1
     add             r2, r3
 
     subs             r5, #1
-    bgt             .loop_h
+    bgt             .Loop_h
 
 // handle last row
     mov             r5, r4
     lsr             r5, #3
 
-.loopW8:
+.LoopW8:
     vld1.u8         d0, [r0]!
     vshl.u8         d0, d0, d4
     vst1.u8         d0, [r2]!
     subs            r4, r4, #8
     subs            r5, #1
-    bgt             .loopW8
+    bgt             .LoopW8
 
     mov             r5,#8
     sub             r5, r4
@@ -1970,7 +1970,7 @@ function x265_quant_neon
     eor             r5, r5
     veor.s32        q12, q12
 
-.loop_quant:
+.Loop_quant:
 
     vld1.s16        d16, [r0]!
     vmovl.s16       q9, d16                // q9= coef[blockpos]
@@ -1999,7 +1999,7 @@ function x265_quant_neon
     vst1.s16        d16, [r3]!
 
     subs            r4, #1
-    bne             .loop_quant
+    bne             .Loop_quant
 
     vadd.u32        d8, d9
     vpadd.u32       d8, d8
@@ -2023,7 +2023,7 @@ function x265_nquant_neon
     eor             r4, r4
     veor.s32        q12, q12
 
-.loop_nquant:
+.Loop_nquant:
 
     vld1.s16        d16, [r0]!
     vmovl.s16       q9, d16                // q9= coef[blockpos]
@@ -2049,7 +2049,7 @@ function x265_nquant_neon
     vst1.s16        d17, [r2]!
 
     subs            r3, #1
-    bne             .loop_nquant
+    bne             .Loop_nquant
 
     vadd.u32        d8, d9
     vpadd.u32       d8, d8
@@ -2148,7 +2148,7 @@ function x265_pixel_sa8d_32x64_neon
     mov             r10, #4
     eor             r9, r9
 
-.loop_32:
+.Loop_32:
 
     sa8d_16x16 r4
 
@@ -2166,7 +2166,7 @@ function x265_pixel_sa8d_32x64_neon
     sub             r2,  r2,  #24
 
     subs            r10, #1
-    bgt            .loop_32
+    bgt            .Loop_32
 
     mov             r0, r9
     vpop            {d8-d11}
@@ -2183,7 +2183,7 @@ function x265_pixel_sa8d_64x64_neon
     mov             r10, #4
     eor             r9, r9
 
-.loop_1:
+.Loop_1:
 
     sa8d_16x16 r4
 
@@ -2217,7 +2217,7 @@ function x265_pixel_sa8d_64x64_neon
     sub             r2,  r2,  #56
 
     subs            r10, #1
-    bgt            .loop_1
+    bgt            .Loop_1
 
     mov             r0, r9
     vpop            {d8-d11}
diff --git a/source/common/arm/sad-a.S b/source/common/arm/sad-a.S
index 6faf35957..b5cbded89 100644
--- a/source/common/arm/sad-a.S
+++ b/source/common/arm/sad-a.S
@@ -103,7 +103,7 @@ function x265_pixel_sad_16x\h\()_neon
     vabal.u8        q9, d5, d7
     mov             r12, #(\h-2)/2
 
-.loop_16x\h:
+.Loop_16x\h:
 
     subs            r12, #1
     vld1.8          {q0}, [r0], r1
@@ -115,7 +115,7 @@ function x265_pixel_sad_16x\h\()_neon
     vabal.u8        q9, d1, d3
     vabal.u8        q8, d4, d6
     vabal.u8        q9, d5, d7
-    bne             .loop_16x\h
+    bne             .Loop_16x\h
 
     vadd.u16        q8, q8, q9
 .if \h == 64
@@ -147,7 +147,7 @@ function x265_pixel_sad_32x\h\()_neon
     veor.u8         q11, q11
     mov             r12, #\h/8
 
-.loop_32x\h:
+.Loop_32x\h:
 
     subs            r12, #1
 .rept 4
@@ -166,7 +166,7 @@ function x265_pixel_sad_32x\h\()_neon
     vabal.u8        q10, d26, d30
     vabal.u8        q11, d27, d31
 .endr
-    bne             .loop_32x\h
+    bne             .Loop_32x\h
 
     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
@@ -213,7 +213,7 @@ function x265_pixel_sad_64x\h\()_neon
     sub             r3, r12
     mov             r12, #\h/8
 
-.loop_64x\h:
+.Loop_64x\h:
 
     subs            r12, #1
 .rept 4
@@ -246,7 +246,7 @@ function x265_pixel_sad_64x\h\()_neon
     vabal.u8        q10, d26, d30
     vabal.u8        q11, d27, d31
 .endr
-    bne             .loop_64x\h
+    bne             .Loop_64x\h
 
     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
@@ -283,7 +283,7 @@ function x265_pixel_sad_24x32_neon
     sub             r3, #16
     mov             r12, #8
 
-.loop_24x32:
+.Loop_24x32:
 
     subs            r12, #1
 .rept 4
@@ -296,7 +296,7 @@ function x265_pixel_sad_24x32_neon
     vld1.8          {d1}, [r2], r3
     vabal.u8        q10, d0, d1
 .endr
-    bne             .loop_24x32
+    bne             .Loop_24x32
 
     vadd.u16        q8, q8, q9
     vadd.u16        d16, d16, d17
@@ -322,7 +322,7 @@ function x265_pixel_sad_48x64_neon
     sub             r3, #32
     mov             r12, #16
 
-.loop_48x64:
+.Loop_48x64:
 
     subs            r12, #1
 .rept 4
@@ -337,7 +337,7 @@ function x265_pixel_sad_48x64_neon
     vabal.u8        q14, d4, d20
     vabal.u8        q15, d5, d21
 .endr
-    bne             .loop_48x64
+    bne             .Loop_48x64
 
     vadd.u16        q3, q3, q11
     vadd.u16        d6, d6, d7
@@ -635,12 +635,12 @@ function x265_sad_x\x\()_16x\h\()_neon
     veor.u8         q15, q15
 .endif
 
-.loop_sad_x\x\()_16x\h:
+.Loop_sad_x\x\()_16x\h:
 .rept 8
     SAD_X_16 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_16x\h
+    bne             .Loop_sad_x\x\()_16x\h
 
     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
@@ -929,12 +929,12 @@ function x265_sad_x\x\()_64x\h\()_neon
     veor.u8         q14, q14
     veor.u8         q15, q15
 .endif
-.loop_sad_x\x\()_64x\h:
+.Loop_sad_x\x\()_64x\h:
 .rept 8
     SAD_X_64 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_64x\h
+    bne             .Loop_sad_x\x\()_64x\h
 
 .if \h <= 16
     vadd.u16        q8, q8, q9
@@ -1071,12 +1071,12 @@ function x265_sad_x\x\()_48x64_neon
     veor.u8         q15, q15
 .endif
 
-.loop_sad_x\x\()_48x64:
+.Loop_sad_x\x\()_48x64:
 .rept 8
     SAD_X_48 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_48x64
+    bne             .Loop_sad_x\x\()_48x64
 
     vpaddl.u16      q8, q8
     vpaddl.u16      q9, q9
@@ -1179,12 +1179,12 @@ function x265_sad_x\x\()_24x32_neon
     veor.u8         q15, q15
 .endif
 
-.loop_sad_x\x\()_24x32:
+.Loop_sad_x\x\()_24x32:
 .rept 8
     SAD_X_24 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_24x32
+    bne             .Loop_sad_x\x\()_24x32
 
     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
diff --git a/source/common/arm/ssd-a.S b/source/common/arm/ssd-a.S
index bb91a0bcb..c00ab0023 100644
--- a/source/common/arm/ssd-a.S
+++ b/source/common/arm/ssd-a.S
@@ -121,7 +121,7 @@ function x265_pixel_sse_pp_32x32_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_sse_pp_32:
+.Loop_sse_pp_32:
     subs        r12, #1
 .rept 4
     vld1.64     {q8-q9}, [r0], r1
@@ -139,7 +139,7 @@ function x265_pixel_sse_pp_32x32_neon
     vmlal.s16   q0, d26, d26
     vmlal.s16   q1, d27, d27
 .endr
-    bne         .loop_sse_pp_32
+    bne         .Loop_sse_pp_32
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -154,7 +154,7 @@ function x265_pixel_sse_pp_64x64_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_sse_pp_64:
+.Loop_sse_pp_64:
     subs        r12, #1
 .rept 4
     vld1.64     {q8-q9}, [r0]!
@@ -187,7 +187,7 @@ function x265_pixel_sse_pp_64x64_neon
     vmlal.s16   q0, d26, d26
     vmlal.s16   q1, d27, d27
 .endr
-    bne         .loop_sse_pp_64
+    bne         .Loop_sse_pp_64
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -257,7 +257,7 @@ function x265_pixel_sse_ss_16x16_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_sse_ss_16:
+.Loop_sse_ss_16:
     subs        r12, #1
 .rept 4
     vld1.s16    {q8-q9}, [r0], r1
@@ -269,7 +269,7 @@ function x265_pixel_sse_ss_16x16_neon
     vmlal.s16   q0, d18, d18
     vmlal.s16   q1, d19, d19
 .endr
-    bne         .loop_sse_ss_16
+    bne         .Loop_sse_ss_16
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -286,7 +286,7 @@ function x265_pixel_sse_ss_32x32_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_sse_ss_32:
+.Loop_sse_ss_32:
     subs        r12, #1
 .rept 4
     vld1.s16    {q8-q9}, [r0]!
@@ -307,7 +307,7 @@ function x265_pixel_sse_ss_32x32_neon
     vmlal.s16   q0, d18, d18
     vmlal.s16   q1, d19, d19
 .endr
-    bne         .loop_sse_ss_32
+    bne         .Loop_sse_ss_32
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -324,7 +324,7 @@ function x265_pixel_sse_ss_64x64_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_sse_ss_64:
+.Loop_sse_ss_64:
     subs        r12, #1
 .rept 2
     vld1.s16    {q8-q9}, [r0]!
@@ -363,7 +363,7 @@ function x265_pixel_sse_ss_64x64_neon
     vmlal.s16   q0, d18, d18
     vmlal.s16   q1, d19, d19
 .endr
-    bne         .loop_sse_ss_64
+    bne         .Loop_sse_ss_64
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -417,7 +417,7 @@ function x265_pixel_ssd_s_16x16_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_ssd_s_16:
+.Loop_ssd_s_16:
     subs        r12, #1
 .rept 2
     vld1.s16    {q8-q9}, [r0], r1
@@ -431,7 +431,7 @@ function x265_pixel_ssd_s_16x16_neon
     vmlal.s16   q0, d22, d22
     vmlal.s16   q1, d23, d23
 .endr
-    bne         .loop_ssd_s_16
+    bne         .Loop_ssd_s_16
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -446,7 +446,7 @@ function x265_pixel_ssd_s_32x32_neon
     veor.u8     q0, q0
     veor.u8     q1, q1
 
-.loop_ssd_s_32:
+.Loop_ssd_s_32:
     subs        r12, #1
 .rept 4
     vld1.s16    {q8-q9}, [r0]!
@@ -460,7 +460,7 @@ function x265_pixel_ssd_s_32x32_neon
     vmlal.s16   q0, d22, d22
     vmlal.s16   q1, d23, d23
 .endr
-    bne         .loop_ssd_s_32
+    bne         .Loop_ssd_s_32
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
-- 
2.42.1


>From d57af999032672d60b406abff4a559dda455aa4b Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 9 Oct 2023 15:57:33 +0100
Subject: [PATCH 03/12] AArch64: Refactor cross-compilation toolchains

The existing cross-compilation toolchain for aarch64-linux uses
environment variables to optionally specify a different cross-compiler;
these are then used to set CMake variables.

Refactor the toolchains to use CMake cache variables, so that the
cross-compilers can be set directly when running CMake to configure the
project.

Remove the hard-coded compiler version from the aarch64-darwin toolchain
and allow the default to be overridden, for example in order to use a
specific gcc version. Note that without a version suffix, gcc/g++ invoke
clang/clang++ by default on Darwin systems.

Update README to reflect this change, as well as to provide some
additional details regarding the use of toolchain files.
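
As an illustrative example (the gcc-12 compiler names are hypothetical
and depend on the cross-toolchain installed on the build host),
configuring with the Linux toolchain file while overriding the default
cross-compilers might look like:

    cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-x265>/build/aarch64-linux/crosscompile.cmake \
          -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc-12 \
          -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++-12 \
          <path-to-x265>/source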
---
 build/README.txt                        | 17 +++++++++--------
 build/aarch64-darwin/crosscompile.cmake | 10 +++++++---
 build/aarch64-linux/crosscompile.cmake  | 10 +++-------
 3 files changed, 19 insertions(+), 18 deletions(-)

diff --git a/build/README.txt b/build/README.txt
index 6346eb041..1528e9837 100644
--- a/build/README.txt
+++ b/build/README.txt
@@ -94,16 +94,17 @@ found, the version will be "unknown".
 
 = Build Instructions for cross-compilation for Arm AArch64 Targets=
 
-When the target platform is based on Arm AArch64 architecture, the x265 can be
-built in x86 platforms. However, the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER
-enviroment variables should be set to point to the cross compilers of the
-appropriate gcc. For example:
+Cross compilation of x265 for AArch64 targets is possible on x86 platforms by
+passing a toolchain file when running CMake to configure the project:
 
-1. export CMAKE_C_COMPILER=aarch64-unknown-linux-gnu-gcc
-2. export CMAKE_CXX_COMPILER=aarch64-unknown-linux-gnu-g++
+* cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-toolchain-file>
 
-The default ones are aarch64-linux-gnu-gcc and aarch64-linux-gnu-g++.
-Then, the normal building process can be followed.
+Toolchain files for AArch64 cross-compilation exist in the /build directory.
+These specify a default cross-compiler to use; however this can be overridden
+by setting the CMAKE_C_COMPILER and CMAKE_CXX_COMPILER CMake variables when
+running CMake to configure the project. For example:
+
+* cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++
 
 Moreover, if the target platform supports SVE or SVE2 instruction set, the
 CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
diff --git a/build/aarch64-darwin/crosscompile.cmake b/build/aarch64-darwin/crosscompile.cmake
index d933daaa2..289c32bca 100644
--- a/build/aarch64-darwin/crosscompile.cmake
+++ b/build/aarch64-darwin/crosscompile.cmake
@@ -7,9 +7,13 @@ set(CROSS_COMPILE_ARM64 1)
 set(CMAKE_SYSTEM_NAME Darwin)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
-# specify the cross compiler
-set(CMAKE_C_COMPILER gcc-12)
-set(CMAKE_CXX_COMPILER g++-12)
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
+if(NOT DEFINED CMAKE_C_COMPILER)
+    set(CMAKE_C_COMPILER gcc)
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+    set(CMAKE_CXX_COMPILER g++)
+endif()
 
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
diff --git a/build/aarch64-linux/crosscompile.cmake b/build/aarch64-linux/crosscompile.cmake
index 8cfe3243d..932b472c4 100644
--- a/build/aarch64-linux/crosscompile.cmake
+++ b/build/aarch64-linux/crosscompile.cmake
@@ -7,15 +7,11 @@ set(CROSS_COMPILE_ARM64 1)
 set(CMAKE_SYSTEM_NAME Linux)
 set(CMAKE_SYSTEM_PROCESSOR aarch64)
 
-# specify the cross compiler
-if(DEFINED ENV{CMAKE_C_COMPILER})
-    set(CMAKE_C_COMPILER $ENV{CMAKE_C_COMPILER})
-else()
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
+if(NOT DEFINED CMAKE_C_COMPILER)
     set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)
 endif()
-if(DEFINED ENV{CMAKE_CXX_COMPILER})
-    set(CMAKE_CXX_COMPILER $ENV{CMAKE_CXX_COMPILER})
-else()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
     set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)
 endif()
 
-- 
2.42.1


>From 087758dd632a8043253762fa32c26889cd20985a Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 9 Oct 2023 15:40:03 +0100
Subject: [PATCH 04/12] AArch64: Refactor cross-compilation options

The existing cross-compilation toolchains for AArch64 use environment
variables to specify optional features, such as SVE/SVE2; these are then
used to set CMake variables.

Refactor this to use CMake options (boolean cache variables), so that
these features can be set directly when running CMake to configure the
project. As the options are defined in the top-level CMakeLists file,
the duplication between the cross-compilation toolchain files is
reduced.
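
As a combined sketch (the toolchain-file placeholder follows the
build/README.txt convention), enabling the SVE2 primitives when
cross-compiling might look like:

    cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-toolchain-file> \
          -DCROSS_COMPILE_SVE2=ON <path-to-x265>/source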
---
 build/README.txt                        | 12 ++++++------
 build/aarch64-darwin/crosscompile.cmake |  7 -------
 build/aarch64-linux/crosscompile.cmake  |  7 -------
 source/CMakeLists.txt                   |  4 ++++
 4 files changed, 10 insertions(+), 20 deletions(-)

diff --git a/build/README.txt b/build/README.txt
index 1528e9837..af4abd21c 100644
--- a/build/README.txt
+++ b/build/README.txt
@@ -106,11 +106,11 @@ running CMake to configure the project. For example:
 
 * cmake -DCMAKE_C_COMPILER=aarch64-linux-gnu-gcc -DCMAKE_CXX_COMPILER=aarch64-linux-gnu-g++
 
-Moreover, if the target platform supports SVE or SVE2 instruction set, the
-CROSS_COMPILE_SVE or CROSS_COMPILE_SVE2 environment variables should be set
-to true, respectively. For example:
+Moreover, if the target platform supports SVE or SVE2, CROSS_COMPILE_SVE or
+CROSS_COMPILE_SVE2 CMake options should be set to ON, respectively.
+For example, when running CMake to configure the project:
 
-1. export CROSS_COMPILE_SVE2=true
-2. export CROSS_COMPILE_SVE=true
+1. cmake -DCROSS_COMPILE_SVE=ON  <other configuration options...>
+2. cmake -DCROSS_COMPILE_SVE2=ON <other configuration options...>
 
-Then, the normal building process can be followed.
+Then, the normal build process can be followed.
diff --git a/build/aarch64-darwin/crosscompile.cmake b/build/aarch64-darwin/crosscompile.cmake
index 289c32bca..6037d5ed6 100644
--- a/build/aarch64-darwin/crosscompile.cmake
+++ b/build/aarch64-darwin/crosscompile.cmake
@@ -18,10 +18,3 @@ endif()
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /opt/homebrew/bin/)
 
-# specify whether SVE/SVE2 is supported by the target platform
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
-    set(CROSS_COMPILE_SVE2 1)
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
-    set(CROSS_COMPILE_SVE 1)
-endif()
-
diff --git a/build/aarch64-linux/crosscompile.cmake b/build/aarch64-linux/crosscompile.cmake
index 932b472c4..caf26af77 100644
--- a/build/aarch64-linux/crosscompile.cmake
+++ b/build/aarch64-linux/crosscompile.cmake
@@ -18,10 +18,3 @@ endif()
 # specify the target environment
 SET(CMAKE_FIND_ROOT_PATH  /usr/aarch64-linux-gnu)
 
-# specify whether SVE/SVE2 is supported by the target platform
-if(DEFINED ENV{CROSS_COMPILE_SVE2})
-    set(CROSS_COMPILE_SVE2 1)
-elseif(DEFINED ENV{CROSS_COMPILE_SVE})
-    set(CROSS_COMPILE_SVE 1)
-endif()
-
diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index ab5ddfeb7..413311de7 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -88,6 +88,10 @@ elseif(ARM64MATCH GREATER "-1")
     message(STATUS "Detected ARM64 target processor")
     set(ARM64 1)
     add_definitions(-DX265_ARCH_ARM64=1 -DHAVE_NEON)
+
+    # Options for cross compiling AArch64 optional extensions
+    option(CROSS_COMPILE_SVE "Cross Compile for SVE Target" OFF)
+    option(CROSS_COMPILE_SVE2 "Cross Compile for SVE2 Target" OFF)
 else()
     message(STATUS "CMAKE_SYSTEM_PROCESSOR value `${CMAKE_SYSTEM_PROCESSOR}` is unknown")
     message(STATUS "Please add this value near ${CMAKE_CURRENT_LIST_FILE}:${CMAKE_CURRENT_LIST_LINE}")
-- 
2.42.1


>From 4c26020289976968e9e4fe51565e1e696f9b96b7 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 2 Oct 2023 20:53:56 +0100
Subject: [PATCH 05/12] AArch64: Add Clang cross toolchain for AArch64

Add a clang-specific cross-compilation toolchain for AArch64.

A separate toolchain is needed because we cannot simply set
CMAKE_C_COMPILER to clang: cross-compiling with Clang requires
specifying the target triple using the --target flag.
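
As a usage sketch, pointing CMake at the new toolchain file is all that
is required:

    cmake -DCMAKE_TOOLCHAIN_FILE=<path-to-x265>/build/aarch64-linux-clang/crosscompile.cmake \
          <path-to-x265>/source

The toolchain sets CMAKE_C_COMPILER_TARGET/CMAKE_CXX_COMPILER_TARGET so
that CMake passes --target=aarch64-linux-gnu to clang/clang++, and it
appends the same triple to ASM_FLAGS for the assembly files.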
---
 build/aarch64-linux-clang/crosscompile.cmake | 26 ++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 build/aarch64-linux-clang/crosscompile.cmake

diff --git a/build/aarch64-linux-clang/crosscompile.cmake b/build/aarch64-linux-clang/crosscompile.cmake
new file mode 100644
index 000000000..c0fb8a3cd
--- /dev/null
+++ b/build/aarch64-linux-clang/crosscompile.cmake
@@ -0,0 +1,26 @@
+# CMake toolchain file for cross compiling x265 for AArch64, using Clang.
+
+set(CROSS_COMPILE_ARM64 1)
+set(CMAKE_SYSTEM_NAME Linux)
+set(CMAKE_SYSTEM_PROCESSOR aarch64)
+
+set(TARGET_TRIPLE aarch64-linux-gnu)
+
+# specify the cross compiler (giving precedence to user-supplied CC/CXX)
+if(NOT DEFINED CMAKE_C_COMPILER)
+    set(CMAKE_C_COMPILER clang)
+endif()
+if(NOT DEFINED CMAKE_CXX_COMPILER)
+    set(CMAKE_CXX_COMPILER clang++)
+endif()
+
+# specify compiler target
+set(CMAKE_C_COMPILER_TARGET ${TARGET_TRIPLE})
+set(CMAKE_CXX_COMPILER_TARGET ${TARGET_TRIPLE})
+
+# specify assembler target
+list(APPEND ASM_FLAGS "--target=${TARGET_TRIPLE}")
+
+# specify the target environment
+SET(CMAKE_FIND_ROOT_PATH /usr/aarch64-linux-gnu)
+
-- 
2.42.1


>From 986b5d2169353b9b7711f03c0a3b5e1099e1ef28 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 4 Oct 2023 18:16:42 +0100
Subject: [PATCH 06/12] AArch64: Don't pass -mstackrealign option

With the compiler option `-mstackrealign`, Clang emits function prologue
and epilogue instructions to create and then restore a frame record,
even when this is not otherwise required.

Omit this option when building for AArch64: AAPCS64 already requires the
stack pointer to be kept 16-byte aligned, so there is nothing to
realign.
---
 source/CMakeLists.txt | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/source/CMakeLists.txt b/source/CMakeLists.txt
index 413311de7..c74db2179 100755
--- a/source/CMakeLists.txt
+++ b/source/CMakeLists.txt
@@ -338,9 +338,11 @@ if(GCC)
     if (CC_HAS_FAST_MATH)
         add_definitions(-ffast-math)
     endif()
-    check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN) 
-    if (CC_HAS_STACK_REALIGN)
-        add_definitions(-mstackrealign)
+    if (NOT (ARM64 OR CROSS_COMPILE_ARM64))
+        check_cxx_compiler_flag(-mstackrealign CC_HAS_STACK_REALIGN)
+        if (CC_HAS_STACK_REALIGN)
+            add_definitions(-mstackrealign)
+        endif()
     endif()
     # Disable exceptions. Reduce executable size, increase compability.
     check_cxx_compiler_flag(-fno-exceptions CC_HAS_FNO_EXCEPTIONS_FLAG)
-- 
2.42.1


>From aefc4bf882c0420241f99317a49e43836ede13d1 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Mon, 19 Feb 2024 15:08:54 +0000
Subject: [PATCH 07/12] AArch64: Fix sad_x4_neon for X265_DEPTH > 10

Fix sad_x4_neon overflow for block widths greater than 32 when
HIGH_BIT_DEPTH=1 and X265_DEPTH > 10.
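
For context (a back-of-the-envelope bound rather than a trace of the
exact loop): with X265_DEPTH = 12 each absolute difference can be as
large as 4095, so a 16-bit lane overflows once more than 16 of them are
accumulated (17 * 4095 = 69615 > 65535). The previous reduction combined
the four per-candidate partial sums with 16-bit pairwise adds
(vpaddq_s16) before widening, which can wrap for the wider blocks; the
fix widens each pairwise sum to 32 bits with vpaddlq_u16 before adding
into the 32-bit result accumulator.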
---
 source/common/aarch64/pixel-prim.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/source/common/aarch64/pixel-prim.cpp b/source/common/aarch64/pixel-prim.cpp
index 7164cd99f..f073251d3 100644
--- a/source/common/aarch64/pixel-prim.cpp
+++ b/source/common/aarch64/pixel-prim.cpp
@@ -1069,10 +1069,9 @@ void sad_x4_neon(const pixel *pix1, const pixel *pix2, const pixel *pix3, const
         {
             /* This is equivalent to adding across each of the sum vectors and then adding
              * to result. */
-            uint16x8_t a = vpaddq_s16(vsum16_0, vsum16_1);
-            uint16x8_t b = vpaddq_s16(vsum16_2, vsum16_3);
-            uint16x8_t c = vpaddq_s16(a, b);
-            result = vpadalq_s16(result, c);
+            uint32x4_t sum01 = vpaddlq_u16(vpaddq_u16(vsum16_0, vsum16_1));
+            uint32x4_t sum23 = vpaddlq_u16(vpaddq_u16(vsum16_2, vsum16_3));
+            result = vaddq_u32(result, vpaddq_u32(sum01, sum23));
         }
 
 #else
-- 
2.42.1


>From 24caff8b3ed8e84b3ca7cd64573c7a1540bf2883 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 21 Feb 2024 11:49:05 +0000
Subject: [PATCH 08/12] Fix typo in intrinsics setup function declaration

---
 source/common/primitives.cpp         | 2 +-
 source/common/primitives.h           | 2 +-
 source/common/vec/vec-primitives.cpp | 2 +-
 source/test/testbench.cpp            | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp
index 10e418884..0a7278f2e 100644
--- a/source/common/primitives.cpp
+++ b/source/common/primitives.cpp
@@ -259,7 +259,7 @@ void x265_setup_primitives(x265_param *param)
 
 #if ENABLE_ASSEMBLY
 #if X265_ARCH_X86
-        setupInstrinsicPrimitives(primitives, param->cpuid);
+        setupIntrinsicPrimitives(primitives, param->cpuid);
 #endif
         setupAssemblyPrimitives(primitives, param->cpuid);
 #endif
diff --git a/source/common/primitives.h b/source/common/primitives.h
index df1cae4b7..cc6b72e1a 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -470,7 +470,7 @@ inline int partitionFromLog2Size(int log2Size)
 }
 
 void setupCPrimitives(EncoderPrimitives &p);
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
 #if X265_ARCH_ARM64
diff --git a/source/common/vec/vec-primitives.cpp b/source/common/vec/vec-primitives.cpp
index 855ea277f..d37f7ac3d 100644
--- a/source/common/vec/vec-primitives.cpp
+++ b/source/common/vec/vec-primitives.cpp
@@ -59,7 +59,7 @@ void setupIntrinsicDCT_ssse3(EncoderPrimitives&);
 void setupIntrinsicDCT_sse41(EncoderPrimitives&);
 
 /* Use primitives for the best available vector architecture */
-void setupInstrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
 {
 #ifdef HAVE_SSE3
     if (cpuMask & X265_CPU_SSE3)
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index ddcfd6139..20d29182d 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -193,7 +193,7 @@ int main(int argc, char *argv[])
 #if X265_ARCH_X86
         EncoderPrimitives vecprim;
         memset(&vecprim, 0, sizeof(vecprim));
-        setupInstrinsicPrimitives(vecprim, test_arch[i].flag);
+        setupIntrinsicPrimitives(vecprim, test_arch[i].flag);
         setupAliasPrimitives(vecprim);
         for (size_t h = 0; h < sizeof(harness) / sizeof(TestHarness*); h++)
         {
@@ -232,7 +232,7 @@ int main(int argc, char *argv[])
     EncoderPrimitives optprim;
     memset(&optprim, 0, sizeof(optprim));
 #if X265_ARCH_X86
-    setupInstrinsicPrimitives(optprim, cpuid);
+    setupIntrinsicPrimitives(optprim, cpuid);
 #endif
 
     setupAssemblyPrimitives(optprim, cpuid);
-- 
2.42.1


>From e60326c2aa3d5ed72147503ec43abc359900d02c Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 21 Feb 2024 12:01:27 +0000
Subject: [PATCH 09/12] AArch64: Remove unused function declaration

primitives.h contains a declaration for an AArch64-specific setup
function that was added as a temporary workaround in bc5837389 and
subsequently removed in 32b08c771. This patch cleans up the now-unused
declaration that was left behind.
---
 source/common/primitives.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/source/common/primitives.h b/source/common/primitives.h
index cc6b72e1a..fd343f5fe 100644
--- a/source/common/primitives.h
+++ b/source/common/primitives.h
@@ -473,9 +473,6 @@ void setupCPrimitives(EncoderPrimitives &p);
 void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask);
 void setupAliasPrimitives(EncoderPrimitives &p);
-#if X265_ARCH_ARM64
-void setupAliasCPrimitives(EncoderPrimitives &cp, EncoderPrimitives &asmp, int cpuMask);
-#endif
 #if HAVE_ALTIVEC
 void setupPixelPrimitives_altivec(EncoderPrimitives &p);
 void setupDCTPrimitives_altivec(EncoderPrimitives &p);
-- 
2.42.1


>From df6c54f1103999ea9010957e7682dba7907c6504 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 21 Feb 2024 14:25:46 +0000
Subject: [PATCH 10/12] AArch64: Separate setup of optimized primitives

Currently the intrinsics-based optimized primitives for AArch64 are set
up at the start of setupAssemblyPrimitives, rather than in the separate,
intrinsics-specific function. As this same combined setup function is
used when setting up the tests, only a specific subset of the intrinsics
primitives is tested in testbench.cpp.

This patch moves the setup of the AArch64 intrinsics implementations to
setupIntrinsicPrimitives and enables separate testing of the intrinsics
primitives. The change does not alter the function table produced by
x265_setup_primitives(), as the intrinsics primitives are still set up
prior to the ASM primitives.
---
 source/common/aarch64/asm-primitives.cpp | 35 +++++++++---------------
 source/common/primitives.cpp             |  2 +-
 source/test/testbench.cpp                |  4 +--
 3 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index 8c46bfd21..f20d1e57d 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -708,12 +708,6 @@ void interp_8tap_hv_pp_cpu(const pixel *src, intptr_t srcStride, pixel *dst, int
 
 void setupNeonPrimitives(EncoderPrimitives &p)
 {
-    setupPixelPrimitives_neon(p);
-    setupFilterPrimitives_neon(p);
-    setupDCTPrimitives_neon(p);
-    setupLoopFilterPrimitives_neon(p);
-    setupIntraPrimitives_neon(p);
-
     ALL_CHROMA_420_PU(p2s[NONALIGNED], filterPixelToShort, neon);
     ALL_CHROMA_422_PU(p2s[ALIGNED], filterPixelToShort, neon);
     ALL_CHROMA_444_PU(p2s[ALIGNED], filterPixelToShort, neon);
@@ -1083,14 +1077,6 @@ void setupNeonPrimitives(EncoderPrimitives &p)
 #if defined(HAVE_SVE2) || defined(HAVE_SVE)
 void setupSvePrimitives(EncoderPrimitives &p)
 {
-    // When these primitives will use SVE/SVE2 instructions set,
-    // change the following definitions to point to the SVE/SVE2 implementation
-    setupPixelPrimitives_neon(p);
-    setupFilterPrimitives_neon(p);
-    setupDCTPrimitives_neon(p);
-    setupLoopFilterPrimitives_neon(p);
-    setupIntraPrimitives_neon(p);
-
     CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2s[NONALIGNED]);
     CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
     CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
@@ -1499,14 +1485,6 @@ void setupSvePrimitives(EncoderPrimitives &p)
 #if defined(HAVE_SVE2)
 void setupSve2Primitives(EncoderPrimitives &p)
 {
-    // When these primitives will use SVE/SVE2 instructions set,
-    // change the following definitions to point to the SVE/SVE2 implementation
-    setupPixelPrimitives_neon(p);
-    setupFilterPrimitives_neon(p);
-    setupDCTPrimitives_neon(p);
-    setupLoopFilterPrimitives_neon(p);
-    setupIntraPrimitives_neon(p);
-
     CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2s[NONALIGNED]);
     CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
     CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
@@ -1961,4 +1939,17 @@ void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
 #endif
 
 }
+
+void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
+{
+    if (cpuMask & X265_CPU_NEON)
+    {
+        setupPixelPrimitives_neon(p);
+        setupFilterPrimitives_neon(p);
+        setupDCTPrimitives_neon(p);
+        setupLoopFilterPrimitives_neon(p);
+        setupIntraPrimitives_neon(p);
+    }
+}
+
 } // namespace X265_NS
diff --git a/source/common/primitives.cpp b/source/common/primitives.cpp
index 0a7278f2e..56d4319bf 100644
--- a/source/common/primitives.cpp
+++ b/source/common/primitives.cpp
@@ -258,7 +258,7 @@ void x265_setup_primitives(x265_param *param)
             primitives.cu[i].intra_pred_allangs = NULL;
 
 #if ENABLE_ASSEMBLY
-#if X265_ARCH_X86
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
         setupIntrinsicPrimitives(primitives, param->cpuid);
 #endif
         setupAssemblyPrimitives(primitives, param->cpuid);
diff --git a/source/test/testbench.cpp b/source/test/testbench.cpp
index 20d29182d..45da893a7 100644
--- a/source/test/testbench.cpp
+++ b/source/test/testbench.cpp
@@ -190,7 +190,7 @@ int main(int argc, char *argv[])
         else
             continue;
 
-#if X265_ARCH_X86
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
         EncoderPrimitives vecprim;
         memset(&vecprim, 0, sizeof(vecprim));
         setupIntrinsicPrimitives(vecprim, test_arch[i].flag);
@@ -231,7 +231,7 @@ int main(int argc, char *argv[])
 
     EncoderPrimitives optprim;
     memset(&optprim, 0, sizeof(optprim));
-#if X265_ARCH_X86
+#if defined(X265_ARCH_X86) || defined(X265_ARCH_ARM64)
     setupIntrinsicPrimitives(optprim, cpuid);
 #endif
 
-- 
2.42.1


>From 589559a79c7317a72bb9f055b3bee63b55cf8cbb Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 10 Apr 2024 21:04:05 +0100
Subject: [PATCH 11/12] AArch64: Remove explicit use of Neon functions from C

Remove references to Neon primitives in source/common/pixel.cpp. The
correct AArch64 optimised primitives are enabled in architecture
specific files, so these do not belong here.

This also serves to enable compilation when X265_NS is changed by
removing hard-coded function prefixes.
---
 source/common/pixel.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/source/common/pixel.cpp b/source/common/pixel.cpp
index 3cd074cfa..06e5732bd 100644
--- a/source/common/pixel.cpp
+++ b/source/common/pixel.cpp
@@ -266,10 +266,6 @@ int satd4(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
-    pixelcmp_t satd_4x4 = x265_pixel_satd_4x4_neon;
-#endif
-
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 4)
             satd += satd_4x4(pix1 + row * stride_pix1 + col, stride_pix1,
@@ -284,10 +280,6 @@ int satd8(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t s
 {
     int satd = 0;
 
-#if ENABLE_ASSEMBLY && X265_ARCH_ARM64 && !HIGH_BIT_DEPTH
-    pixelcmp_t satd_8x4 = x265_pixel_satd_8x4_neon;
-#endif
-
     for (int row = 0; row < h; row += 4)
         for (int col = 0; col < w; col += 8)
             satd += satd_8x4(pix1 + row * stride_pix1 + col, stride_pix1,
-- 
2.42.1


>From 095744b62f56c14c47197545223cdf021816d2d6 Mon Sep 17 00:00:00 2001
From: Hari Limaye <hari.limaye at arm.com>
Date: Wed, 10 Apr 2024 21:12:58 +0100
Subject: [PATCH 12/12] AArch64: Use PFX macro for all assembly functions

Use the PFX macro to wrap all AArch64 function declarations and
definitions, instead of hardcoding the prefix as `x265_`, to fix
compilation when X265_NS is changed.
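
For reference, PFX(name) expands to the X265_NS prefix joined to `name`
(so x265_pixel_satd_4x4_neon with the default namespace); with this
change the declarations in fun-decls.h and the `function` directives in
the assembly files pick up a custom X265_NS automatically.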
---
 source/common/aarch64/fun-decls.h     | 140 +++++++++++++-------------
 source/common/aarch64/ipfilter-sve2.S |  16 +--
 source/common/aarch64/ipfilter.S      |  30 +++---
 3 files changed, 93 insertions(+), 93 deletions(-)

diff --git a/source/common/aarch64/fun-decls.h b/source/common/aarch64/fun-decls.h
index 1a1f3b489..ec17deda2 100644
--- a/source/common/aarch64/fun-decls.h
+++ b/source/common/aarch64/fun-decls.h
@@ -155,69 +155,69 @@ DECLS(sve);
 DECLS(sve2);
 
 
-void x265_pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift);
+void PFX(pixel_planecopy_cp_neon(const uint8_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int width, int height, int shift));
 
-uint64_t x265_pixel_var_8x8_neon(const pixel* pix, intptr_t stride);
-uint64_t x265_pixel_var_16x16_neon(const pixel* pix, intptr_t stride);
-uint64_t x265_pixel_var_32x32_neon(const pixel* pix, intptr_t stride);
-uint64_t x265_pixel_var_64x64_neon(const pixel* pix, intptr_t stride);
+uint64_t PFX(pixel_var_8x8_neon(const pixel* pix, intptr_t stride));
+uint64_t PFX(pixel_var_16x16_neon(const pixel* pix, intptr_t stride));
+uint64_t PFX(pixel_var_32x32_neon(const pixel* pix, intptr_t stride));
+uint64_t PFX(pixel_var_64x64_neon(const pixel* pix, intptr_t stride));
 
-void x265_getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void PFX(getResidual4_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
+void PFX(getResidual8_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
+void PFX(getResidual16_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
+void PFX(getResidual32_neon(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
 
-void x265_scale1D_128to64_neon(pixel *dst, const pixel *src);
-void x265_scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride);
+void PFX(scale1D_128to64_neon(pixel *dst, const pixel *src));
+void PFX(scale2D_64to32_neon(pixel* dst, const pixel* src, intptr_t stride));
 
-int x265_pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int PFX(pixel_satd_4x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_4x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_4x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_4x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_12x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_12x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x4_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x12_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_16x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_24x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_24x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x8_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x24_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_48x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_64x16_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_64x32_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_64x48_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_64x64_neon(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
 
-int x265_pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
-int x265_pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
-int x265_pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
-int x265_pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
-int x265_pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
-int x265_pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
-int x265_pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2);
+int PFX(pixel_sa8d_8x8_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
+int PFX(pixel_sa8d_8x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
+int PFX(pixel_sa8d_16x16_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
+int PFX(pixel_sa8d_16x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
+int PFX(pixel_sa8d_32x32_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
+int PFX(pixel_sa8d_32x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
+int PFX(pixel_sa8d_64x64_neon(const pixel* pix1, intptr_t i_pix1, const pixel* pix2, intptr_t i_pix2));
 
 uint32_t PFX(quant_neon)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
 uint32_t PFX(nquant_neon)(const int16_t* coef, const int32_t* quantCoeff, int16_t* qCoef, int qBits, int add, int numCoeff);
 
-void x265_dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
-void x265_dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+void PFX(dequant_scaling_neon(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift));
+void PFX(dequant_normal_neon(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift));
 
-void x265_ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
+void PFX(ssim_4x4x2_core_neon(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]));
 
 int PFX(psyCost_4x4_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 int PFX(psyCost_8x8_neon)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
@@ -226,30 +226,30 @@ void PFX(weight_sp_neon)(const int16_t* src, pixel* dst, intptr_t srcStride, int
 int PFX(scanPosLast_neon)(const uint16_t *scan, const coeff_t *coeff, uint16_t *coeffSign, uint16_t *coeffFlag, uint8_t *coeffNum, int numSig, const uint16_t* scanCG4x4, const int trSize);
 uint32_t PFX(costCoeffNxN_neon)(const uint16_t *scan, const coeff_t *coeff, intptr_t trSize, uint16_t *absCoeff, const uint8_t *tabSigCtx, uint32_t scanFlagMask, uint8_t *baseCtx, int offset, int scanPosSigOff, int subPosBase);
 
-uint64_t x265_pixel_var_8x8_sve2(const pixel* pix, intptr_t stride);
-uint64_t x265_pixel_var_16x16_sve2(const pixel* pix, intptr_t stride);
-uint64_t x265_pixel_var_32x32_sve2(const pixel* pix, intptr_t stride);
-uint64_t x265_pixel_var_64x64_sve2(const pixel* pix, intptr_t stride);
+uint64_t PFX(pixel_var_8x8_sve2(const pixel* pix, intptr_t stride));
+uint64_t PFX(pixel_var_16x16_sve2(const pixel* pix, intptr_t stride));
+uint64_t PFX(pixel_var_32x32_sve2(const pixel* pix, intptr_t stride));
+uint64_t PFX(pixel_var_64x64_sve2(const pixel* pix, intptr_t stride));
 
-void x265_getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
-void x265_getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride);
+void PFX(getResidual16_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
+void PFX(getResidual32_sve2(const pixel* fenc, const pixel* pred, int16_t* residual, intptr_t stride));
 
-void x265_scale1D_128to64_sve2(pixel *dst, const pixel *src);
-void x265_scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride);
+void PFX(scale1D_128to64_sve2(pixel *dst, const pixel *src));
+void PFX(scale2D_64to32_sve2(pixel* dst, const pixel* src, intptr_t stride));
 
-int x265_pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_8x12_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x16_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_32x32_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
-int x265_pixel_satd_64x48_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2);
+int PFX(pixel_satd_4x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x4_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_8x12_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x16_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_32x32_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
+int PFX(pixel_satd_64x48_sve(const pixel* pix1, intptr_t stride_pix1, const pixel* pix2, intptr_t stride_pix2));
 
 uint32_t PFX(quant_sve)(const int16_t* coef, const int32_t* quantCoeff, int32_t* deltaU, int16_t* qCoef, int qBits, int add, int numCoeff);
 
-void x265_dequant_scaling_sve2(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
-void x265_dequant_normal_sve2(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
+void PFX(dequant_scaling_sve2)(const int16_t* quantCoef, const int32_t* deQuantCoef, int16_t* coef, int num, int per, int shift);
+void PFX(dequant_normal_sve2)(const int16_t* quantCoef, int16_t* coef, int num, int scale, int shift);
 
-void x265_ssim_4x4x2_core_sve2(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
+void PFX(ssim_4x4x2_core_sve2)(const pixel* pix1, intptr_t stride1, const pixel* pix2, intptr_t stride2, int sums[2][4]);
 
 int PFX(psyCost_8x8_sve2)(const pixel* source, intptr_t sstride, const pixel* recon, intptr_t rstride);
 void PFX(weight_sp_sve2)(const int16_t* src, pixel* dst, intptr_t srcStride, intptr_t dstStride, int width, int height, int w0, int round, int shift, int offset);
diff --git a/source/common/aarch64/ipfilter-sve2.S b/source/common/aarch64/ipfilter-sve2.S
index 525ed1172..ab0ad2fae 100644
--- a/source/common/aarch64/ipfilter-sve2.S
+++ b/source/common/aarch64/ipfilter-sve2.S
@@ -456,7 +456,7 @@
 
 // void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPP_SVE2 w, h
-function x265_interp_8tap_vert_pp_\w\()x\h\()_sve2
+function PFX(interp_8tap_vert_pp_\w\()x\h\()_sve2)
     cmp             x4, #0
     b.eq            0f
     cmp             x4, #1
@@ -501,7 +501,7 @@ LUMA_VPP_SVE2 64, 48
 
 // void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPS_4xN_SVE2 h
-function x265_interp_8tap_vert_ps_4x\h\()_sve2
+function PFX(interp_8tap_vert_ps_4x\h\()_sve2)
     lsl             x3, x3, #1
     lsl             x5, x4, #6
     lsl             x4, x1, #2
@@ -568,7 +568,7 @@ LUMA_VPS_4xN_SVE2 16
 
 // void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VSP_4xN_SVE2 h
-function x265_interp_8tap_vert_sp_4x\h\()_sve2
+function PFX(interp_8tap_vert_sp_4x\h\()_sve2)
     lsl             x5, x4, #6
     lsl             x1, x1, #1
     lsl             x4, x1, #2
@@ -736,7 +736,7 @@ LUMA_VSP_4xN_SVE2 16
 
 // void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPS_SVE2 w, h
-function x265_interp_8tap_vert_ps_\w\()x\h\()_sve2
+function PFX(interp_8tap_vert_ps_\w\()x\h\()_sve2)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -830,7 +830,7 @@ LUMA_VPS_SVE2 64, 48
 
 // void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VSS_SVE2 w, h
-function x265_interp_8tap_vert_ss_\w\()x\h\()_sve2
+function PFX(interp_8tap_vert_ss_\w\()x\h\()_sve2)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -924,7 +924,7 @@ LUMA_VSS_SVE2 48, 64
 
 // void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VPP_SVE2 w, h
-function x265_interp_4tap_vert_pp_\w\()x\h\()_sve2
+function PFX(interp_4tap_vert_pp_\w\()x\h\()_sve2)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -1047,7 +1047,7 @@ CHROMA_VPP_SVE2 48, 64
 
 // void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VPS_SVE2 w, h
-function x265_interp_4tap_vert_ps_\w\()x\h\()_sve2
+function PFX(interp_4tap_vert_ps_\w\()x\h\()_sve2)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -1210,7 +1210,7 @@ CHROMA_VPS_SVE2 48, 64
 
 // void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VSS_SVE2 w, h
-function x265_interp_4tap_vert_ss_\w\()x\h\()_sve2
+function PFX(interp_4tap_vert_ss_\w\()x\h\()_sve2)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
diff --git a/source/common/aarch64/ipfilter.S b/source/common/aarch64/ipfilter.S
index 228ffae29..0d1a374eb 100644
--- a/source/common/aarch64/ipfilter.S
+++ b/source/common/aarch64/ipfilter.S
@@ -51,7 +51,7 @@
 // ***** luma_vpp *****
 // void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPP_4xN h
-function x265_interp_8tap_vert_pp_4x\h\()_neon
+function PFX(interp_8tap_vert_pp_4x\h\()_neon)
     movrel          x10, g_luma_s16
     sub             x0, x0, x1
     sub             x0, x0, x1, lsl #1         // src -= 3 * srcStride
@@ -135,7 +135,7 @@ LUMA_VPP_4xN 16
 
 // void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPP w, h
-function x265_interp_8tap_vert_pp_\w\()x\h\()_neon
+function PFX(interp_8tap_vert_pp_\w\()x\h\()_neon)
     cmp             x4, #0
     b.eq            0f
     cmp             x4, #1
@@ -181,7 +181,7 @@ LUMA_VPP 64, 48
 // ***** luma_vps *****
 // void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPS_4xN h
-function x265_interp_8tap_vert_ps_4x\h\()_neon
+function PFX(interp_8tap_vert_ps_4x\h\()_neon)
     lsl             x3, x3, #1
     lsl             x5, x4, #6
     lsl             x4, x1, #2
@@ -263,7 +263,7 @@ LUMA_VPS_4xN 16
 
 // void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VPS w, h
-function x265_interp_8tap_vert_ps_\w\()x\h\()_neon
+function PFX(interp_8tap_vert_ps_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -309,7 +309,7 @@ LUMA_VPS 64, 48
 // ***** luma_vsp *****
 // void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VSP_4xN h
-function x265_interp_8tap_vert_sp_4x\h\()_neon
+function PFX(interp_8tap_vert_sp_4x\h\()_neon)
     lsl             x5, x4, #6
     lsl             x1, x1, #1
     lsl             x4, x1, #2
@@ -379,7 +379,7 @@ LUMA_VSP_4xN 16
 
 // void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VSP w, h
-function x265_interp_8tap_vert_sp_\w\()x\h\()_neon
+function PFX(interp_8tap_vert_sp_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -425,7 +425,7 @@ LUMA_VSP 48, 64
 // ***** luma_vss *****
 // void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_VSS w, h
-function x265_interp_8tap_vert_ss_\w\()x\h\()_neon
+function PFX(interp_8tap_vert_ss_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -474,7 +474,7 @@ LUMA_VSS 48, 64
 // ***** luma_hpp *****
 // void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro LUMA_HPP w, h
-function x265_interp_horiz_pp_\w\()x\h\()_neon
+function PFX(interp_horiz_pp_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -523,7 +523,7 @@ LUMA_HPP 64, 64
 // ***** luma_hps *****
 // void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 .macro LUMA_HPS w, h
-function x265_interp_horiz_ps_\w\()x\h\()_neon
+function PFX(interp_horiz_ps_\w\()x\h\()_neon)
     mov             w10, #\h
     cmp             w5, #0
     b.eq            6f
@@ -580,7 +580,7 @@ LUMA_HPS 64, 64
 // ***** chroma_vpp *****
 // void interp_vert_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VPP w, h
-function x265_interp_4tap_vert_pp_\w\()x\h\()_neon
+function PFX(interp_4tap_vert_pp_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -660,7 +660,7 @@ CHROMA_VPP 48, 64
 // ***** chroma_vps *****
 // void interp_vert_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VPS w, h
-function x265_interp_4tap_vert_ps_\w\()x\h\()_neon
+function PFX(interp_4tap_vert_ps_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -740,7 +740,7 @@ CHROMA_VPS 48, 64
 // ***** chroma_vsp *****
 // void interp_vert_sp_c(const int16_t* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VSP w, h
-function x265_interp_4tap_vert_sp_\w\()x\h\()_neon
+function PFX(interp_4tap_vert_sp_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -814,7 +814,7 @@ CHROMA_VSP 48, 64
 // ***** chroma_vss *****
 // void interp_vert_ss_c(const int16_t* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_VSS w, h
-function x265_interp_4tap_vert_ss_\w\()x\h\()_neon
+function PFX(interp_4tap_vert_ss_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -888,7 +888,7 @@ CHROMA_VSS 48, 64
 // ***** chroma_hpp *****
 // void interp_horiz_pp_c(const pixel* src, intptr_t srcStride, pixel* dst, intptr_t dstStride, int coeffIdx)
 .macro CHROMA_HPP w, h
-function x265_interp_4tap_horiz_pp_\w\()x\h\()_neon
+function PFX(interp_4tap_horiz_pp_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
@@ -968,7 +968,7 @@ CHROMA_HPP 64, 64
 // ***** chroma_hps *****
 // void interp_horiz_ps_c(const pixel* src, intptr_t srcStride, int16_t* dst, intptr_t dstStride, int coeffIdx, int isRowExt)
 .macro CHROMA_HPS w, h
-function x265_interp_4tap_horiz_ps_\w\()x\h\()_neon
+function PFX(interp_4tap_horiz_ps_\w\()x\h\()_neon)
     cmp             x4, #0
     beq             0f
     cmp             x4, #1
-- 
2.42.1
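
A note on the naming convention these hunks rely on: PFX() pastes a build-dependent namespace prefix onto the bare kernel name, so the assembly routines carry the same multilib-safe symbols as the C primitives. The sketch below is a minimal stand-in written as an assumption for illustration, not the actual x265 definition (the real X265_NS value and helper macro names may differ); it shows how the expansion works and why the declarations keep the argument list outside the PFX() call.

/* Minimal sketch of a PFX-style token-pasting macro (an assumption for
 * illustration, not the actual x265 definition). */
#include <stdint.h>

typedef uint8_t pixel;               /* 8-bit build; uint16_t for high bit depth */

#define X265_NS          x265        /* assumed namespace; e.g. x265_10bit in a 10-bit build */
#define PFX_PASTE(a, b)  a ## _ ## b
#define PFX_EXPAND(a, b) PFX_PASTE(a, b)
#define PFX(name)        PFX_EXPAND(X265_NS, name)

/* Declares x265_pixel_var_8x8_sve2(); the pasted token is just the bare
 * kernel name, and the argument list stays outside the macro call. */
uint64_t PFX(pixel_var_8x8_sve2)(const pixel* pix, intptr_t stride);

With definitions along these lines, wrapping the whole signature inside PFX() would preprocess to the same declaration, since ## only pastes the prefix onto the first token of the argument; the PFX(name)(args) form used throughout the header declarations keeps the pasted token to the bare name, so the new declarations above follow it.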


