[x265] [PATCH 02/12] Arm: Use local labels for assembly routine loops

Hari Limaye hari.limaye at arm.com
Thu May 2 21:19:37 UTC 2024


Amend loop labels in Arm & AArch64 assembly files to start with `.Loop`
instead of `.loop`, as the GNU assembler prefix for local labels is
`.L`. This improves the output of tools like perf and gdb, as code under
loop labels is now correctly attributed to its containing routine.
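For illustration (a hypothetical fragment, not taken from these files):
with the `.L` prefix the assembler treats the label as local and emits no
symbol for it, so profiler samples inside the loop land on the enclosing
function rather than on a separate `.loop_*` symbol:

    function PFX(example_neon)       // hypothetical routine, illustration only
        mov             w12, #4
    .Loop_example:                   // ".L" prefix: assembler-local, no symbol emitted
        sub             w12, w12, #1
        cbnz            w12, .Loop_example
        ret
    endfunc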
---
 source/common/aarch64/blockcopy8-sve.S  |  60 +++++------
 source/common/aarch64/blockcopy8.S      | 116 ++++++++++-----------
 source/common/aarch64/ipfilter-common.S | 104 +++++++++----------
 source/common/aarch64/ipfilter-sve2.S   |  56 +++++-----
 source/common/aarch64/ipfilter.S        |  12 +--
 source/common/aarch64/mc-a-sve2.S       |  88 ++++++++--------
 source/common/aarch64/mc-a.S            |  32 +++---
 source/common/aarch64/p2s-sve.S         |  12 +--
 source/common/aarch64/p2s.S             |  12 +--
 source/common/aarch64/pixel-util-sve.S  |   4 +-
 source/common/aarch64/pixel-util-sve2.S |  56 +++++-----
 source/common/aarch64/pixel-util.S      | 132 ++++++++++++------------
 source/common/aarch64/sad-a-sve2.S      |  16 +--
 source/common/aarch64/sad-a.S           |   8 +-
 source/common/aarch64/ssd-a-sve2.S      |  16 +--
 source/common/aarch64/ssd-a.S           |  32 +++---
 source/common/arm/blockcopy8.S          |   4 +-
 source/common/arm/dct-a.S               |   8 +-
 source/common/arm/ipfilter8.S           | 108 +++++++++----------
 source/common/arm/mc-a.S                |   8 +-
 source/common/arm/pixel-util.S          |  28 ++---
 source/common/arm/sad-a.S               |  36 +++----
 source/common/arm/ssd-a.S               |  28 ++---
 23 files changed, 488 insertions(+), 488 deletions(-)

diff --git a/source/common/aarch64/blockcopy8-sve.S b/source/common/aarch64/blockcopy8-sve.S
index 846927909..d5664af58 100644
--- a/source/common/aarch64/blockcopy8-sve.S
+++ b/source/common/aarch64/blockcopy8-sve.S
@@ -112,7 +112,7 @@ function PFX(blockcopy_sp_32x32_sve)
     lsl             x3, x3, #1
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp32_sve:
+.Loop_csp32_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], x3
@@ -124,7 +124,7 @@ function PFX(blockcopy_sp_32x32_sve)
     st1             {v0.16b-v1.16b}, [x0], x1
     st1             {v2.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp32_sve
+    cbnz            w12, .Loop_csp32_sve
     ret
 .vl_gt_16_blockcopy_sp_32_32:
     cmp             x9, #48
@@ -199,7 +199,7 @@ function PFX(blockcopy_ps_32x32_sve)
     bgt             .vl_gt_16_blockcopy_ps_32_32
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_cps32_sve:
+.Loop_cps32_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -215,7 +215,7 @@ function PFX(blockcopy_ps_32x32_sve)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32_sve
+    cbnz            w12, .Loop_cps32_sve
     ret
 .vl_gt_16_blockcopy_ps_32_32:
     cmp             x9, #48
@@ -248,7 +248,7 @@ function PFX(blockcopy_ps_64x64_sve)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_cps64_sve:
+.Loop_cps64_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x2], x3
@@ -263,7 +263,7 @@ function PFX(blockcopy_ps_64x64_sve)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps64_sve
+    cbnz            w12, .Loop_cps64_sve
     ret
 .vl_gt_16_blockcopy_ps_64_64:
     cmp             x9, #48
@@ -338,13 +338,13 @@ function PFX(blockcopy_ss_32x32_sve)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #4
-.loop_css32_sve:
+.Loop_css32_sve:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32_sve
+    cbnz            w12, .Loop_css32_sve
     ret
 .vl_gt_16_blockcopy_ss_32_32:
     cmp             x9, #48
@@ -379,7 +379,7 @@ function PFX(blockcopy_ss_64x64_sve)
     lsl             x3, x3, #1
     sub             x3, x3, #64
     mov             w12, #8
-.loop_css64_sve:
+.Loop_css64_sve:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], #64
@@ -387,7 +387,7 @@ function PFX(blockcopy_ss_64x64_sve)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css64_sve
+    cbnz            w12, .Loop_css64_sve
     ret
 .vl_gt_16_blockcopy_ss_64_64:
     cmp             x9, #48
@@ -474,13 +474,13 @@ function PFX(blockcopy_ss_32x64_sve)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #8
-.loop_css32x64_sve:
+.Loop_css32x64_sve:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32x64_sve
+    cbnz            w12, .Loop_css32x64_sve
     ret
 .vl_gt_16_blockcopy_ss_32_64:
     cmp             x9, #48
@@ -570,7 +570,7 @@ function PFX(blockcopy_ps_32x64_sve)
     bgt             .vl_gt_16_blockcopy_ps_32_64
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_cps32x64_sve:
+.Loop_cps32x64_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -586,7 +586,7 @@ function PFX(blockcopy_ps_32x64_sve)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32x64_sve
+    cbnz            w12, .Loop_cps32x64_sve
     ret
 .vl_gt_16_blockcopy_ps_32_64:
     cmp             x9, #48
@@ -730,13 +730,13 @@ function PFX(blockcopy_pp_32x\h\()_sve)
     rdvl            x9, #1
     cmp             x9, #16
     bgt             .vl_gt_16_blockcopy_pp_32xN_\h
-.loop_sve_32x\h\():
+.Loop_sve_32x\h\():
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b-v1.16b}, [x2], x3
     st1             {v0.16b-v1.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_sve_32x\h
+    cbnz            w12, .Loop_sve_32x\h
     ret
 .vl_gt_16_blockcopy_pp_32xN_\h:
     ptrue           p0.b, vl32
@@ -765,13 +765,13 @@ function PFX(blockcopy_pp_64x\h\()_sve)
     rdvl            x9, #1
     cmp             x9, #16
     bgt             .vl_gt_16_blockcopy_pp_64xN_\h
-.loop_sve_64x\h\():
+.Loop_sve_64x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x3
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_sve_64x\h
+    cbnz            w12, .Loop_sve_64x\h
     ret
 .vl_gt_16_blockcopy_pp_64xN_\h:
     cmp             x9, #48
@@ -856,7 +856,7 @@ function PFX(cpy2Dto1D_shl_16x16_sve)
     bgt             .vl_gt_16_cpy2Dto1D_shl_16x16
     cpy2Dto1D_shl_start_sve
     mov             w12, #4
-.loop_cpy2Dto1D_shl_16_sve:
+.Loop_cpy2Dto1D_shl_16_sve:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.16b-v3.16b}, [x1], x2
@@ -864,7 +864,7 @@ function PFX(cpy2Dto1D_shl_16x16_sve)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.16b-v3.16b}, [x0], #32
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_16_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shl_16_sve
     ret
 .vl_gt_16_cpy2Dto1D_shl_16x16:
     ptrue           p0.h, vl16
@@ -885,7 +885,7 @@ function PFX(cpy2Dto1D_shl_32x32_sve)
     bgt             .vl_gt_16_cpy2Dto1D_shl_32x32
     cpy2Dto1D_shl_start_sve
     mov             w12, #16
-.loop_cpy2Dto1D_shl_32_sve:
+.Loop_cpy2Dto1D_shl_32_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], x2
@@ -895,7 +895,7 @@ function PFX(cpy2Dto1D_shl_32x32_sve)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_32_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shl_32_sve
     ret
 .vl_gt_16_cpy2Dto1D_shl_32x32:
     cmp             x9, #48
@@ -931,7 +931,7 @@ function PFX(cpy2Dto1D_shl_64x64_sve)
     cpy2Dto1D_shl_start_sve
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy2Dto1D_shl_64_sve:
+.Loop_cpy2Dto1D_shl_64_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -947,7 +947,7 @@ function PFX(cpy2Dto1D_shl_64x64_sve)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_64_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shl_64_sve
     ret
 .vl_gt_16_cpy2Dto1D_shl_64x64:
     dup             z0.h, w3
@@ -1055,7 +1055,7 @@ function PFX(cpy2Dto1D_shr_32x32_sve)
     bgt             .vl_gt_16_cpy2Dto1D_shr_32x32
     cpy2Dto1D_shr_start
     mov             w12, #16
-.loop_cpy2Dto1D_shr_32_sve:
+.Loop_cpy2Dto1D_shr_32_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.8h-v5.8h}, [x1], x2
@@ -1069,7 +1069,7 @@ function PFX(cpy2Dto1D_shr_32x32_sve)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.8h-v5.8h}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shr_32_sve
+    cbnz            w12, .Loop_cpy2Dto1D_shr_32_sve
     ret
 .vl_gt_16_cpy2Dto1D_shr_32x32:
     dup             z0.h, w3
@@ -1218,7 +1218,7 @@ function PFX(cpy1Dto2D_shr_16x16_sve)
     bgt             .vl_gt_16_cpy1Dto2D_shr_16x16
     cpy1Dto2D_shr_start
     mov             w12, #4
-.loop_cpy1Dto2D_shr_16:
+.Loop_cpy1Dto2D_shr_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.8h-v3.8h}, [x1], #32
@@ -1228,7 +1228,7 @@ function PFX(cpy1Dto2D_shr_16x16_sve)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.8h-v3.8h}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_16
+    cbnz            w12, .Loop_cpy1Dto2D_shr_16
     ret
 .vl_gt_16_cpy1Dto2D_shr_16x16:
     dup             z0.h, w3
@@ -1254,7 +1254,7 @@ function PFX(cpy1Dto2D_shr_32x32_sve)
     bgt             .vl_gt_16_cpy1Dto2D_shr_32x32
     cpy1Dto2D_shr_start
     mov             w12, #16
-.loop_cpy1Dto2D_shr_32_sve:
+.Loop_cpy1Dto2D_shr_32_sve:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1268,7 +1268,7 @@ function PFX(cpy1Dto2D_shr_32x32_sve)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_32_sve
+    cbnz            w12, .Loop_cpy1Dto2D_shr_32_sve
     ret
 .vl_gt_16_cpy1Dto2D_shr_32x32:
     dup             z0.h, w3
diff --git a/source/common/aarch64/blockcopy8.S b/source/common/aarch64/blockcopy8.S
index 495ee7ea2..1ad371c57 100644
--- a/source/common/aarch64/blockcopy8.S
+++ b/source/common/aarch64/blockcopy8.S
@@ -86,7 +86,7 @@ function PFX(blockcopy_sp_32x32_neon)
     lsl             x3, x3, #1
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp32:
+.Loop_csp32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], x3
@@ -98,7 +98,7 @@ function PFX(blockcopy_sp_32x32_neon)
     st1             {v0.16b-v1.16b}, [x0], x1
     st1             {v2.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp32
+    cbnz            w12, .Loop_csp32
     ret
 endfunc

@@ -108,7 +108,7 @@ function PFX(blockcopy_sp_64x64_neon)
     sub             x3, x3, #64
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp64:
+.Loop_csp64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], #64
@@ -119,7 +119,7 @@ function PFX(blockcopy_sp_64x64_neon)
     tbl             v3.16b, {v6.16b,v7.16b}, v31.16b
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp64
+    cbnz            w12, .Loop_csp64
     ret
 endfunc

@@ -168,7 +168,7 @@ endfunc
 function PFX(blockcopy_ps_32x32_neon)
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_cps32:
+.Loop_cps32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -184,7 +184,7 @@ function PFX(blockcopy_ps_32x32_neon)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32
+    cbnz            w12, .Loop_cps32
     ret
 endfunc

@@ -192,7 +192,7 @@ function PFX(blockcopy_ps_64x64_neon)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_cps64:
+.Loop_cps64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x2], x3
@@ -207,7 +207,7 @@ function PFX(blockcopy_ps_64x64_neon)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps64
+    cbnz            w12, .Loop_cps64
     ret
 endfunc

@@ -252,13 +252,13 @@ function PFX(blockcopy_ss_32x32_neon)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #4
-.loop_css32:
+.Loop_css32:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32
+    cbnz            w12, .Loop_css32
     ret
 endfunc

@@ -268,7 +268,7 @@ function PFX(blockcopy_ss_64x64_neon)
     lsl             x3, x3, #1
     sub             x3, x3, #64
     mov             w12, #8
-.loop_css64:
+.Loop_css64:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], #64
@@ -276,7 +276,7 @@ function PFX(blockcopy_ss_64x64_neon)
     st1             {v0.8h-v3.8h}, [x0], #64
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css64
+    cbnz            w12, .Loop_css64
     ret
 endfunc

@@ -321,13 +321,13 @@ function PFX(blockcopy_ss_32x64_neon)
     lsl             x1, x1, #1
     lsl             x3, x3, #1
     mov             w12, #8
-.loop_css32x64:
+.Loop_css32x64:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8h-v3.8h}, [x2], x3
     st1             {v0.8h-v3.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_css32x64
+    cbnz            w12, .Loop_css32x64
     ret
 endfunc

@@ -376,7 +376,7 @@ endfunc
 function PFX(blockcopy_ps_32x64_neon)
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_cps32x64:
+.Loop_cps32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v17.16b}, [x2], x3
@@ -392,7 +392,7 @@ function PFX(blockcopy_ps_32x64_neon)
     st1             {v0.8h-v3.8h}, [x0], x1
     st1             {v4.8h-v7.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_cps32x64
+    cbnz            w12, .Loop_cps32x64
     ret
 endfunc

@@ -443,7 +443,7 @@ function PFX(blockcopy_sp_32x64_neon)
     lsl             x3, x3, #1
     movrel          x11, xtn_xtn2_table
     ld1             {v31.16b}, [x11]
-.loop_csp32x64:
+.Loop_csp32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.8h-v3.8h}, [x2], x3
@@ -455,7 +455,7 @@ function PFX(blockcopy_sp_32x64_neon)
     st1             {v0.16b-v1.16b}, [x0], x1
     st1             {v2.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_csp32x64
+    cbnz            w12, .Loop_csp32x64
     ret
 endfunc

@@ -595,13 +595,13 @@ blockcopy_pp_8xN_neon 32

 function PFX(blockcopy_pp_8x64_neon)
     mov             w12, #4
-.loop_pp_8x64:
+.Loop_pp_8x64:
     sub             w12, w12, #1
 .rept 16
     ld1             {v0.4h}, [x2], x3
     st1             {v0.4h}, [x0], x1
 .endr
-    cbnz            w12, .loop_pp_8x64
+    cbnz            w12, .Loop_pp_8x64
     ret
 endfunc

@@ -623,13 +623,13 @@ blockcopy_pp_16xN_neon 16
 .macro blockcopy_pp_16xN1_neon h
 function PFX(blockcopy_pp_16x\h\()_neon)
     mov             w12, #\h / 8
-.loop_16x\h\():
+.Loop_16x\h\():
 .rept 8
     ld1             {v0.8h}, [x2], x3
     st1             {v0.8h}, [x0], x1
 .endr
     sub             w12, w12, #1
-    cbnz            w12, .loop_16x\h
+    cbnz            w12, .Loop_16x\h
     ret
 endfunc
 .endm
@@ -651,38 +651,38 @@ endfunc
 function PFX(blockcopy_pp_12x32_neon)
     sub             x1, x1, #8
     mov             w12, #4
-.loop_pp_12x32:
+.Loop_pp_12x32:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b}, [x2], x3
     str             d0, [x0], #8
     st1             {v0.s}[2], [x0], x1
 .endr
-    cbnz            w12, .loop_pp_12x32
+    cbnz            w12, .Loop_pp_12x32
     ret
 endfunc

 function PFX(blockcopy_pp_24x32_neon)
     mov             w12, #4
-.loop_24x32:
+.Loop_24x32:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.8b-v2.8b}, [x2], x3
     st1             {v0.8b-v2.8b}, [x0], x1
 .endr
-    cbnz            w12, .loop_24x32
+    cbnz            w12, .Loop_24x32
     ret
 endfunc

 function PFX(blockcopy_pp_24x64_neon)
     mov             w12, #4
-.loop_24x64:
+.Loop_24x64:
     sub             w12, w12, #1
 .rept 16
     ld1             {v0.8b-v2.8b}, [x2], x3
     st1             {v0.8b-v2.8b}, [x0], x1
 .endr
-    cbnz            w12, .loop_24x64
+    cbnz            w12, .Loop_24x64
     ret
 endfunc

@@ -697,13 +697,13 @@ endfunc
 .macro blockcopy_pp_32xN_neon h
 function PFX(blockcopy_pp_32x\h\()_neon)
     mov             w12, #\h / 8
-.loop_32x\h\():
+.Loop_32x\h\():
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b-v1.16b}, [x2], x3
     st1             {v0.16b-v1.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_32x\h
+    cbnz            w12, .Loop_32x\h
     ret
 endfunc
 .endm
@@ -716,26 +716,26 @@ blockcopy_pp_32xN_neon 48

 function PFX(blockcopy_pp_48x64_neon)
     mov             w12, #8
-.loop_48x64:
+.Loop_48x64:
     sub             w12, w12, #1
 .rept 8
     ld1             {v0.16b-v2.16b}, [x2], x3
     st1             {v0.16b-v2.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_48x64
+    cbnz            w12, .Loop_48x64
     ret
 endfunc

 .macro blockcopy_pp_64xN_neon h
 function PFX(blockcopy_pp_64x\h\()_neon)
     mov             w12, #\h / 4
-.loop_64x\h\():
+.Loop_64x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x3
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_64x\h
+    cbnz            w12, .Loop_64x\h
     ret
 endfunc
 .endm
@@ -950,11 +950,11 @@ function PFX(count_nonzero_32_neon)
     trn1            v16.16b, v16.16b, v17.16b
     movi            v18.16b, #0
     mov             w12, #16
-.loop_count_nonzero_32:
+.Loop_count_nonzero_32:
     sub             w12, w12, #1
     COUNT_NONZERO_8
     add             v18.16b, v18.16b, v0.16b
-    cbnz            w12, .loop_count_nonzero_32
+    cbnz            w12, .Loop_count_nonzero_32

     uaddlv          s0, v18.8h
     fmov            w0, s0
@@ -994,7 +994,7 @@ endfunc
 function PFX(cpy2Dto1D_shl_16x16_neon)
     cpy2Dto1D_shl_start
     mov             w12, #4
-.loop_cpy2Dto1D_shl_16:
+.Loop_cpy2Dto1D_shl_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.16b-v3.16b}, [x1], x2
@@ -1002,14 +1002,14 @@ function PFX(cpy2Dto1D_shl_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.16b-v3.16b}, [x0], #32
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_16
+    cbnz            w12, .Loop_cpy2Dto1D_shl_16
     ret
 endfunc

 function PFX(cpy2Dto1D_shl_32x32_neon)
     cpy2Dto1D_shl_start
     mov             w12, #16
-.loop_cpy2Dto1D_shl_32:
+.Loop_cpy2Dto1D_shl_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], x2
@@ -1019,7 +1019,7 @@ function PFX(cpy2Dto1D_shl_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_32
+    cbnz            w12, .Loop_cpy2Dto1D_shl_32
     ret
 endfunc

@@ -1027,7 +1027,7 @@ function PFX(cpy2Dto1D_shl_64x64_neon)
     cpy2Dto1D_shl_start
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy2Dto1D_shl_64:
+.Loop_cpy2Dto1D_shl_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1043,7 +1043,7 @@ function PFX(cpy2Dto1D_shl_64x64_neon)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shl_64
+    cbnz            w12, .Loop_cpy2Dto1D_shl_64
     ret
 endfunc

@@ -1079,7 +1079,7 @@ endfunc
 function PFX(cpy2Dto1D_shr_16x16_neon)
     cpy2Dto1D_shr_start
     mov             w12, #4
-.loop_cpy2Dto1D_shr_16:
+.Loop_cpy2Dto1D_shr_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.8h-v3.8h}, [x1], x2
@@ -1089,14 +1089,14 @@ function PFX(cpy2Dto1D_shr_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.8h-v3.8h}, [x0], #32
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shr_16
+    cbnz            w12, .Loop_cpy2Dto1D_shr_16
     ret
 endfunc

 function PFX(cpy2Dto1D_shr_32x32_neon)
     cpy2Dto1D_shr_start
     mov             w12, #16
-.loop_cpy2Dto1D_shr_32:
+.Loop_cpy2Dto1D_shr_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.8h-v5.8h}, [x1], x2
@@ -1110,7 +1110,7 @@ function PFX(cpy2Dto1D_shr_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.8h-v5.8h}, [x0], #64
 .endr
-    cbnz            w12, .loop_cpy2Dto1D_shr_32
+    cbnz            w12, .Loop_cpy2Dto1D_shr_32
     ret
 endfunc

@@ -1147,7 +1147,7 @@ endfunc
 function PFX(cpy1Dto2D_shl_16x16_neon)
     cpy1Dto2D_shl_start
     mov             w12, #4
-.loop_cpy1Dto2D_shl_16:
+.Loop_cpy1Dto2D_shl_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.16b-v3.16b}, [x1], #32
@@ -1155,14 +1155,14 @@ function PFX(cpy1Dto2D_shl_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.16b-v3.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shl_16
+    cbnz            w12, .Loop_cpy1Dto2D_shl_16
     ret
 endfunc

 function PFX(cpy1Dto2D_shl_32x32_neon)
     cpy1Dto2D_shl_start
     mov             w12, #16
-.loop_cpy1Dto2D_shl_32:
+.Loop_cpy1Dto2D_shl_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1172,7 +1172,7 @@ function PFX(cpy1Dto2D_shl_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shl_32
+    cbnz            w12, .Loop_cpy1Dto2D_shl_32
     ret
 endfunc

@@ -1180,7 +1180,7 @@ function PFX(cpy1Dto2D_shl_64x64_neon)
     cpy1Dto2D_shl_start
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy1Dto2D_shl_64:
+.Loop_cpy1Dto2D_shl_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1196,7 +1196,7 @@ function PFX(cpy1Dto2D_shl_64x64_neon)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shl_64
+    cbnz            w12, .Loop_cpy1Dto2D_shl_64
     ret
 endfunc

@@ -1231,7 +1231,7 @@ endfunc
 function PFX(cpy1Dto2D_shr_16x16_neon)
     cpy1Dto2D_shr_start
     mov             w12, #4
-.loop_cpy1Dto2D_shr_16:
+.Loop_cpy1Dto2D_shr_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v2.8h-v3.8h}, [x1], #32
@@ -1241,14 +1241,14 @@ function PFX(cpy1Dto2D_shr_16x16_neon)
     sshl            v3.8h, v3.8h, v0.8h
     st1             {v2.8h-v3.8h}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_16
+    cbnz            w12, .Loop_cpy1Dto2D_shr_16
     ret
 endfunc

 function PFX(cpy1Dto2D_shr_32x32_neon)
     cpy1Dto2D_shr_start
     mov             w12, #16
-.loop_cpy1Dto2D_shr_32:
+.Loop_cpy1Dto2D_shr_32:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1262,7 +1262,7 @@ function PFX(cpy1Dto2D_shr_32x32_neon)
     sshl            v5.8h, v5.8h, v0.8h
     st1             {v2.16b-v5.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_32
+    cbnz            w12, .Loop_cpy1Dto2D_shr_32
     ret
 endfunc

@@ -1270,7 +1270,7 @@ function PFX(cpy1Dto2D_shr_64x64_neon)
     cpy1Dto2D_shr_start
     mov             w12, #32
     sub             x2, x2, #64
-.loop_cpy1Dto2D_shr_64:
+.Loop_cpy1Dto2D_shr_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v2.16b-v5.16b}, [x1], #64
@@ -1294,6 +1294,6 @@ function PFX(cpy1Dto2D_shr_64x64_neon)
     st1             {v2.16b-v5.16b}, [x0], #64
     st1             {v16.16b-v19.16b}, [x0], x2
 .endr
-    cbnz            w12, .loop_cpy1Dto2D_shr_64
+    cbnz            w12, .Loop_cpy1Dto2D_shr_64
     ret
 endfunc
diff --git a/source/common/aarch64/ipfilter-common.S b/source/common/aarch64/ipfilter-common.S
index b7c61ee64..a08c3c165 100644
--- a/source/common/aarch64/ipfilter-common.S
+++ b/source/common/aarch64/ipfilter-common.S
@@ -800,10 +800,10 @@
     mov             w12, #32
     dup             v31.8h, w12
     qpel_start_\v
-.loop_luma_vpp_\v\()_\w\()x\h:
+.Loop_luma_vpp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vpp_w8_\v\()_\w\()x\h:
+.Loop_luma_vpp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -833,11 +833,11 @@
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_luma_vpp_w8_\v\()_\w\()x\h
+    blt             .Loop_luma_vpp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vpp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vpp_\v\()_\w\()x\h
     ret
 .endm

@@ -854,10 +854,10 @@
     mov             w12, #8192
     dup             v31.8h, w12
     qpel_start_\v
-.loop_ps_\v\()_\w\()x\h:
+.Loop_ps_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_ps_w8_\v\()_\w\()x\h:
+.Loop_ps_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -885,11 +885,11 @@
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_ps_w8_\v\()_\w\()x\h
+    blt             .Loop_ps_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_ps_\v\()_\w\()x\h
+    cbnz            x5, .Loop_ps_\v\()_\w\()x\h
     ret
 .endm

@@ -914,10 +914,10 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_\v\()_1
-.loop_luma_vsp_\v\()_\w\()x\h:
+.Loop_luma_vsp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vsp_w8_\v\()_\w\()x\h:
+.Loop_luma_vsp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_load_64b \v
     qpel_filter_\v\()_32b_1
@@ -933,11 +933,11 @@
     add             x9, x9, #8
 .endif
     cmp             x9, x12
-    blt             .loop_luma_vsp_w8_\v\()_\w\()x\h
+    blt             .Loop_luma_vsp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vsp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vsp_\v\()_\w\()x\h
     ret
 .endm

@@ -957,10 +957,10 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_\v\()_1
-.loop_luma_vss_\v\()_\w\()x\h:
+.Loop_luma_vss_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vss_w8_\v\()_\w\()x\h:
+.Loop_luma_vss_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_load_64b \v
     qpel_filter_\v\()_32b_1
@@ -981,11 +981,11 @@
 .endif
 .endif
     cmp             x9, x12
-    blt             .loop_luma_vss_w8_\v\()_\w\()x\h
+    blt             .Loop_luma_vss_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vss_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vss_\v\()_\w\()x\h
     ret
 .endm

@@ -1013,11 +1013,11 @@
 .endr
     ret
 .else
-.loop1_hpp_\v\()_\w\()x\h:
+.Loop1_hpp_\v\()_\w\()x\h:
     mov             x7, #\w
     mov             x11, x0
     sub             x11, x11, #4
-.loop2_hpp_\v\()_\w\()x\h:
+.Loop2_hpp_\v\()_\w\()x\h:
     vextin8 \v
     qpel_filter_\v\()_32b
     hpp_end
@@ -1031,11 +1031,11 @@
     str             s17, [x2], #4
     sub             x7, x7, #4
 .endif
-    cbnz            x7, .loop2_hpp_\v\()_\w\()x\h
+    cbnz            x7, .Loop2_hpp_\v\()_\w\()x\h
     sub             x6, x6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            x6, .loop1_hpp_\v\()_\w\()x\h
+    cbnz            x6, .Loop1_hpp_\v\()_\w\()x\h
     ret
 .endif
 .endm
@@ -1051,7 +1051,7 @@
     dup             v31.8h, w12
     qpel_start_\v
 .if \w == 4
-.loop_hps_\v\()_\w\()x\h\():
+.Loop_hps_\v\()_\w\()x\h\():
     mov             x11, x0
     sub             x11, x11, #4
     vextin8 \v
@@ -1061,14 +1061,14 @@
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop_hps_\v\()_\w\()x\h
+    cbnz            w6, .Loop_hps_\v\()_\w\()x\h
     ret
 .else
-.loop1_hps_\v\()_\w\()x\h\():
+.Loop1_hps_\v\()_\w\()x\h\():
     mov             w7, #\w
     mov             x11, x0
     sub             x11, x11, #4
-.loop2_hps_\v\()_\w\()x\h\():
+.Loop2_hps_\v\()_\w\()x\h\():
 .if \w == 8 || \w == 12 || \w == 24
     vextin8 \v
     qpel_filter_\v\()_32b
@@ -1092,11 +1092,11 @@
     sub             w7, w7, #16
     sub             x11, x11, #16
 .endif
-    cbnz            w7, .loop2_hps_\v\()_\w\()x\h
+    cbnz            w7, .Loop2_hps_\v\()_\w\()x\h
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop1_hps_\v\()_\w\()x\h
+    cbnz            w6, .Loop1_hps_\v\()_\w\()x\h
     ret
 .endif
 .endm
@@ -1107,10 +1107,10 @@
     dup             v31.8h, w12
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_chroma_vpp_\v\()_\w\()x\h:
+.Loop_chroma_vpp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_chroma_vpp_w8_\v\()_\w\()x\h:
+.Loop_chroma_vpp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b \v
     qpel_filter_chroma_\v\()_32b
@@ -1137,11 +1137,11 @@
     str             d17, [x7], #8
 .endif
     cmp             x9, #\w
-    blt             .loop_chroma_vpp_w8_\v\()_\w\()x\h
+    blt             .Loop_chroma_vpp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_chroma_vpp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_chroma_vpp_\v\()_\w\()x\h
     ret
 .endm

@@ -1152,10 +1152,10 @@
     lsl             x3, x3, #1
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_vps_\v\()_\w\()x\h:
+.Loop_vps_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_vps_w8_\v\()_\w\()x\h:
+.Loop_vps_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b \v
     qpel_filter_chroma_\v\()_32b
@@ -1180,12 +1180,12 @@
     str             q17, [x7], #16
 .endif
     cmp             x9, #\w
-    blt             .loop_vps_w8_\v\()_\w\()x\h
+    blt             .Loop_vps_w8_\v\()_\w\()x\h

     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vps_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vps_\v\()_\w\()x\h
     ret
 .endm

@@ -1200,10 +1200,10 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_chroma_\v\()_1
-.loop_vsp_\v\()_\w\()x\h:
+.Loop_vsp_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_vsp_w8_\v\()_\w\()x\h:
+.Loop_vsp_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_64b \v
     qpel_filter_chroma_\v\()_32b_1
@@ -1223,11 +1223,11 @@
     str             d17, [x7], #8
 .endif
     cmp             x9, x12
-    blt             .loop_vsp_w8_\v\()_\w\()x\h
+    blt             .Loop_vsp_w8_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vsp_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vsp_\v\()_\w\()x\h
     ret
 .endm

@@ -1239,7 +1239,7 @@
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_chroma_\v\()_1
-.loop_vss_\v\()_\w\()x\h:
+.Loop_vss_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
 .if \w == 4
@@ -1252,7 +1252,7 @@
     add             x9, x9, #4
 .endr
 .else
-.loop_vss_w8_\v\()_\w\()x\h:
+.Loop_vss_w8_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_64b \v
     qpel_filter_chroma_\v\()_32b_1
@@ -1268,12 +1268,12 @@
     add             x9, x9, #8
 .endif
     cmp             x9, x12
-    blt             .loop_vss_w8_\v\()_\w\()x\h
+    blt             .Loop_vss_w8_\v\()_\w\()x\h
 .endif
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vss_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vss_\v\()_\w\()x\h
     ret
 .endm

@@ -1284,7 +1284,7 @@
     mov             w6, #\h
     sub             x3, x3, #\w
 .if \w == 2 || \w == 4 || \w == 6 || \w == 12
-.loop4_chroma_hpp_\v\()_\w\()x\h:
+.Loop4_chroma_hpp_\v\()_\w\()x\h:
     mov             x11, x0
     sub             x11, x11, #2
     vextin8_chroma \v
@@ -1310,15 +1310,15 @@
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop4_chroma_hpp_\v\()_\w\()x\h
+    cbnz            w6, .Loop4_chroma_hpp_\v\()_\w\()x\h
     ret
 .else
-.loop2_chroma_hpp_\v\()_\w\()x\h:
+.Loop2_chroma_hpp_\v\()_\w\()x\h:
     mov             x7, #\w
     lsr             x7, x7, #3
     mov             x11, x0
     sub             x11, x11, #2
-.loop3_chroma_hpp_\v\()_\w\()x\h:
+.Loop3_chroma_hpp_\v\()_\w\()x\h:
 .if \w == 8 || \w == 24
     vextin8_chroma \v
     qpel_filter_chroma_\v\()_32b
@@ -1336,11 +1336,11 @@
     sub             x7, x7, #2
     sub             x11, x11, #16
 .endif
-    cbnz            x7, .loop3_chroma_hpp_\v\()_\w\()x\h
+    cbnz            x7, .Loop3_chroma_hpp_\v\()_\w\()x\h
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop2_chroma_hpp_\v\()_\w\()x\h
+    cbnz            w6, .Loop2_chroma_hpp_\v\()_\w\()x\h
     ret
 .endif
 .endm
@@ -1397,12 +1397,12 @@
     add             w10, w10, #3
 9:
     mov             w6, w10
-.loop1_chroma_hps_\v\()_\w\()x\h\():
+.Loop1_chroma_hps_\v\()_\w\()x\h\():
     mov             x7, #\w
     lsr             x7, x7, #3
     mov             x11, x0
     sub             x11, x11, #2
-.loop2_chroma_hps_\v\()_\w\()x\h\():
+.Loop2_chroma_hps_\v\()_\w\()x\h\():
 .if \w == 8 || \w == 24
     vextin8_chroma \v
     qpel_filter_chroma_\v\()_32b
@@ -1419,11 +1419,11 @@
     sub             x7, x7, #2
     sub             x11, x11, #16
 .endif
-    cbnz            x7, .loop2_chroma_hps_\v\()_\w\()x\h\()
+    cbnz            x7, .Loop2_chroma_hps_\v\()_\w\()x\h\()
     sub             w6, w6, #1
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w6, .loop1_chroma_hps_\v\()_\w\()x\h\()
+    cbnz            w6, .Loop1_chroma_hps_\v\()_\w\()x\h\()
     ret
 .endif
 .endm
diff --git a/source/common/aarch64/ipfilter-sve2.S b/source/common/aarch64/ipfilter-sve2.S
index 95657db55..525ed1172 100644
--- a/source/common/aarch64/ipfilter-sve2.S
+++ b/source/common/aarch64/ipfilter-sve2.S
@@ -370,10 +370,10 @@
     cmp             x9, #16
     bgt             .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h
     qpel_start_\v
-.loop_luma_vpp_sve2_\v\()_\w\()x\h:
+.Loop_luma_vpp_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
+.Loop_luma_vpp_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -403,11 +403,11 @@
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_luma_vpp_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vpp_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vpp_sve2_\v\()_\w\()x\h
     ret
 .vl_gt_16_FILTER_LUMA_VPP_\v\()_\w\()x\h:
     ptrue           p0.h, vl8
@@ -522,7 +522,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_sve2
     ld1rd           {z22.d}, p0/z, [x12, #48]
     ld1rd           {z23.d}, p0/z, [x12, #56]

-.loop_vps_sve2_4x\h:
+.Loop_vps_sve2_4x\h:
     mov             x6, x0

     ld1b            {z0.s}, p0/z, [x6]
@@ -557,7 +557,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_sve2

     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vps_sve2_4x\h
+    cbnz            x4, .Loop_vps_sve2_4x\h
     ret
 endfunc
 .endm
@@ -593,7 +593,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_sve2
     ld1rd           {z22.d}, p0/z, [x12, #48]
     ld1rd           {z23.d}, p0/z, [x12, #56]

-.loop_vsp_sve2_4x\h:
+.Loop_vsp_sve2_4x\h:
     mov             x6, x0

     ld1             {v0.8b}, [x6], x1
@@ -630,7 +630,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_sve2

     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vsp_sve2_4x\h
+    cbnz            x4, .Loop_vsp_sve2_4x\h
     ret
 endfunc
 .endm
@@ -654,10 +654,10 @@ LUMA_VSP_4xN_SVE2 16
     cmp             x14, #16
     bgt             .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h
     qpel_start_\v
-.loop_ps_sve2_\v\()_\w\()x\h:
+.Loop_ps_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_ps_w8_sve2_\v\()_\w\()x\h:
+.Loop_ps_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
 .if \w == 8 || \w == 24
     qpel_load_32b \v
@@ -685,11 +685,11 @@ LUMA_VSP_4xN_SVE2 16
     add             x9, x9, #16
 .endif
     cmp             x9, #\w
-    blt             .loop_ps_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_ps_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_ps_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_ps_sve2_\v\()_\w\()x\h
     ret
 .vl_gt_16_FILTER_VPS_\v\()_\w\()x\h:
     ptrue           p0.h, vl8
@@ -796,10 +796,10 @@ LUMA_VPS_SVE2 64, 48
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_\v\()_1
-.loop_luma_vss_sve2_\v\()_\w\()x\h:
+.Loop_luma_vss_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
+.Loop_luma_vss_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_load_64b \v
     qpel_filter_\v\()_32b_1
@@ -820,11 +820,11 @@ LUMA_VPS_SVE2 64, 48
 .endif
 .endif
     cmp             x9, x12
-    blt             .loop_luma_vss_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_luma_vss_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_luma_vss_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_luma_vss_sve2_\v\()_\w\()x\h
     ret
 .endm

@@ -884,10 +884,10 @@ LUMA_VSS_SVE2 48, 64
     mov             z31.h, #32
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_chroma_vpp_sve2_\v\()_\w\()x\h:
+.Loop_chroma_vpp_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
+.Loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b_sve2 \v
     qpel_filter_chroma_sve2_\v\()_32b
@@ -914,11 +914,11 @@ LUMA_VSS_SVE2 48, 64
     str             d17, [x7], #8
 .endif
     cmp             x9, #\w
-    blt             .loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_chroma_vpp_w8_sve2_\v\()_\w\()x\h
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_chroma_vpp_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_chroma_vpp_sve2_\v\()_\w\()x\h
     ret
 .endm

@@ -1008,10 +1008,10 @@ CHROMA_VPP_SVE2 48, 64
     lsl             x3, x3, #1
     sub             x0, x0, x1
     mov             x5, #\h
-.loop_vps_sve2_\v\()_\w\()x\h:
+.Loop_vps_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
-.loop_vps_w8_sve2_\v\()_\w\()x\h:
+.Loop_vps_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_32b_sve2 \v
     qpel_filter_chroma_sve2_\v\()_32b
@@ -1036,12 +1036,12 @@ CHROMA_VPP_SVE2 48, 64
     str             q17, [x7], #16
 .endif
     cmp             x9, #\w
-    blt             .loop_vps_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_vps_w8_sve2_\v\()_\w\()x\h

     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vps_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vps_sve2_\v\()_\w\()x\h
     ret
 .endm

@@ -1170,7 +1170,7 @@ CHROMA_VPS_SVE2 48, 64
     mov             x12, #\w
     lsl             x12, x12, #1
     qpel_start_chroma_sve2_\v\()_1
-.loop_vss_sve2_\v\()_\w\()x\h:
+.Loop_vss_sve2_\v\()_\w\()x\h:
     mov             x7, x2
     mov             x9, #0
 .if \w == 4
@@ -1183,7 +1183,7 @@ CHROMA_VPS_SVE2 48, 64
     add             x9, x9, #4
 .endr
 .else
-.loop_vss_w8_sve2_\v\()_\w\()x\h:
+.Loop_vss_w8_sve2_\v\()_\w\()x\h:
     add             x6, x0, x9
     qpel_chroma_load_64b \v
     qpel_filter_chroma_\v\()_32b_1
@@ -1199,12 +1199,12 @@ CHROMA_VPS_SVE2 48, 64
     add             x9, x9, #8
 .endif
     cmp             x9, x12
-    blt             .loop_vss_w8_sve2_\v\()_\w\()x\h
+    blt             .Loop_vss_w8_sve2_\v\()_\w\()x\h
 .endif
     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_vss_sve2_\v\()_\w\()x\h
+    cbnz            x5, .Loop_vss_sve2_\v\()_\w\()x\h
     ret
 .endm

diff --git a/source/common/aarch64/ipfilter.S b/source/common/aarch64/ipfilter.S
index 80624862d..228ffae29 100644
--- a/source/common/aarch64/ipfilter.S
+++ b/source/common/aarch64/ipfilter.S
@@ -85,7 +85,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     ushll           v3.8h, v3.8b, #0

     mov             x9, #\h
-.loop_4x\h:
+.Loop_4x\h:
     ld1             {v4.s}[0], [x0], x1
     ld1             {v4.s}[1], [x0], x1
     ushll           v4.8h, v4.8b, #0
@@ -124,7 +124,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     st1             {v16.s}[1], [x2], x3

     sub             x9, x9, #2
-    cbnz            x9, .loop_4x\h
+    cbnz            x9, .Loop_4x\h
     ret
 endfunc
 .endm
@@ -202,7 +202,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon
     ld1r            {v22.2d}, [x12], #8
     ld1r            {v23.2d}, [x12], #8

-.loop_vps_4x\h:
+.Loop_vps_4x\h:
     mov             x6, x0

     ld1             {v0.s}[0], [x6], x1
@@ -252,7 +252,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon

     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vps_4x\h
+    cbnz            x4, .Loop_vps_4x\h
     ret
 endfunc
 .endm
@@ -331,7 +331,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon
     ld1r            {v21.2d}, [x12], #8
     ld1r            {v22.2d}, [x12], #8
     ld1r            {v23.2d}, [x12], #8
-.loop_vsp_4x\h:
+.Loop_vsp_4x\h:
     mov             x6, x0

     ld1             {v0.8b}, [x6], x1
@@ -368,7 +368,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon

     add             x0, x0, x1
     sub             x4, x4, #1
-    cbnz            x4, .loop_vsp_4x\h
+    cbnz            x4, .Loop_vsp_4x\h
     ret
 endfunc
 .endm
diff --git a/source/common/aarch64/mc-a-sve2.S b/source/common/aarch64/mc-a-sve2.S
index 704bdaed0..e4540ce9b 100644
--- a/source/common/aarch64/mc-a-sve2.S
+++ b/source/common/aarch64/mc-a-sve2.S
@@ -219,7 +219,7 @@ function PFX(pixel_avg_pp_48x64_sve2)
     mov             x11, #0
     whilelt         p0.b, x11, x10
     mov             w12, #8
-.loop_gt_32_pixel_avg_pp_48x64:
+.Loop_gt_32_pixel_avg_pp_48x64:
     sub             w12, w12, #1
 .rept 8
     ld1b            {z0.b}, p0/z, [x2]
@@ -230,7 +230,7 @@ function PFX(pixel_avg_pp_48x64_sve2)
     st1b            {z0.b}, p0, [x0]
     add             x0, x0, x1
 .endr
-    cbnz            w12, .loop_gt_32_pixel_avg_pp_48x64
+    cbnz            w12, .Loop_gt_32_pixel_avg_pp_48x64
     ret
 endfunc

@@ -339,7 +339,7 @@ function PFX(addAvg_6x\h\()_sve2)
     mov             w12, #\h / 2
     ptrue           p0.b, vl16
     ptrue           p2.h, vl6
-.loop_sve2_addavg_6x\h\():
+.Loop_sve2_addavg_6x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -359,7 +359,7 @@ function PFX(addAvg_6x\h\()_sve2)
     add             x2, x2, x5
     st1b            {z2.h}, p2, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_addavg_6x\h
+    cbnz            w12, .Loop_sve2_addavg_6x\h
     ret
 endfunc
 .endm
@@ -398,7 +398,7 @@ endfunc
 function PFX(addAvg_8x\h\()_sve2)
     mov             w12, #\h / 2
     ptrue           p0.b, vl16
-.loop_sve2_addavg_8x\h\():
+.Loop_sve2_addavg_8x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -418,7 +418,7 @@ function PFX(addAvg_8x\h\()_sve2)
     add             x2, x2, x5
     st1b            {z2.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_addavg_8x\h
+    cbnz            w12, .Loop_sve2_addavg_8x\h
     ret
 endfunc
 .endm
@@ -440,7 +440,7 @@ function PFX(addAvg_12x\h\()_sve2)
     bgt             .vl_gt_16_addAvg_12x\h
     ptrue           p0.b, vl16
     ptrue           p1.b, vl8
-.loop_sve2_addavg_12x\h\():
+.Loop_sve2_addavg_12x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -457,13 +457,13 @@ function PFX(addAvg_12x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z2.h}, p1, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_addavg_12x\h
+    cbnz            w12, .Loop_sve2_addavg_12x\h
     ret
 .vl_gt_16_addAvg_12x\h\():
     mov             x10, #24
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_sve2_gt_16_addavg_12x\h\():
+.Loop_sve2_gt_16_addavg_12x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -476,7 +476,7 @@ function PFX(addAvg_12x\h\()_sve2)
     add             z2.b, z2.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_sve2_gt_16_addavg_12x\h
+    cbnz            w12, .Loop_sve2_gt_16_addavg_12x\h
     ret
 endfunc
 .endm
@@ -491,7 +491,7 @@ function PFX(addAvg_16x\h\()_sve2)
     cmp             x9, #16
     bgt             .vl_gt_16_addAvg_16x\h
     ptrue           p0.b, vl16
-.loop_eq_16_sve2_addavg_16x\h\():
+.Loop_eq_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -508,13 +508,13 @@ function PFX(addAvg_16x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z2.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_16x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_16x\h
     ret
 .vl_gt_16_addAvg_16x\h\():
     cmp             x9, #32
     bgt             .vl_gt_32_addAvg_16x\h
     ptrue           p0.b, vl32
-.loop_gt_16_sve2_addavg_16x\h\():
+.Loop_gt_16_sve2_addavg_16x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -525,13 +525,13 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p1, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_16_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_16x\h
     ret
 .vl_gt_32_addAvg_16x\h\():
     mov             x10, #48
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_gt_32_sve2_addavg_16x\h\():
+.Loop_gt_32_sve2_addavg_16x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     add             x0, x0, x3, lsl #1
@@ -541,7 +541,7 @@ function PFX(addAvg_16x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_32_sve2_addavg_16x\h
+    cbnz            w12, .Loop_gt_32_sve2_addavg_16x\h
     ret
 endfunc
 .endm
@@ -561,7 +561,7 @@ function PFX(addAvg_24x\h\()_sve2)
     cmp             x9, #16
     bgt             .vl_gt_16_addAvg_24x\h
     addAvg_start
-.loop_eq_16_sve2_addavg_24x\h\():
+.Loop_eq_16_sve2_addavg_24x\h\():
     sub             w12, w12, #1
     ld1             {v0.16b-v2.16b}, [x0], x3
     ld1             {v3.16b-v5.16b}, [x1], x4
@@ -572,14 +572,14 @@ function PFX(addAvg_24x\h\()_sve2)
     sqxtun          v1.8b, v1.8h
     sqxtun          v2.8b, v2.8h
     st1             {v0.8b-v2.8b}, [x2], x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_24x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_24x\h
     ret
 .vl_gt_16_addAvg_24x\h\():
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_24x\h
     ptrue           p0.b, vl32
     ptrue           p1.b, vl16
-.loop_gt_16_sve2_addavg_24x\h\():
+.Loop_gt_16_sve2_addavg_24x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
@@ -596,13 +596,13 @@ function PFX(addAvg_24x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p1, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_16_sve2_addavg_24x\h
+    cbnz            w12, .Loop_gt_16_sve2_addavg_24x\h
     ret
 .vl_gt_48_addAvg_24x\h\():
     mov             x10, #48
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_gt_48_sve2_addavg_24x\h\():
+.Loop_gt_48_sve2_addavg_24x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z2.b}, p0/z, [x1]
@@ -613,7 +613,7 @@ function PFX(addAvg_24x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_48_sve2_addavg_24x\h
+    cbnz            w12, .Loop_gt_48_sve2_addavg_24x\h
     ret
 endfunc
 .endm
@@ -628,7 +628,7 @@ function PFX(addAvg_32x\h\()_sve2)
     cmp             x9, #16
     bgt             .vl_gt_16_addAvg_32x\h
     ptrue           p0.b, vl16
-.loop_eq_16_sve2_addavg_32x\h\():
+.Loop_eq_16_sve2_addavg_32x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -657,13 +657,13 @@ function PFX(addAvg_32x\h\()_sve2)
     st1b            {z2.h}, p0, [x2, #2, mul vl]
     st1b            {z3.h}, p0, [x2, #3, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_32x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_32x\h
     ret
 .vl_gt_16_addAvg_32x\h\():
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_32x\h
     ptrue           p0.b, vl32
-.loop_gt_eq_32_sve2_addavg_32x\h\():
+.Loop_gt_eq_32_sve2_addavg_32x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -680,11 +680,11 @@ function PFX(addAvg_32x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_32_sve2_addavg_32x\h
+    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_32x\h
     ret
 .vl_gt_48_addAvg_32x\h\():
     ptrue           p0.b, vl64
-.loop_eq_64_sve2_addavg_32x\h\():
+.Loop_eq_64_sve2_addavg_32x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x1]
@@ -695,7 +695,7 @@ function PFX(addAvg_32x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_eq_64_sve2_addavg_32x\h
+    cbnz            w12, .Loop_eq_64_sve2_addavg_32x\h
     ret
 endfunc
 .endm
@@ -715,7 +715,7 @@ function PFX(addAvg_48x64_sve2)
     addAvg_start
     sub             x3, x3, #64
     sub             x4, x4, #64
-.loop_eq_16_sve2_addavg_48x64:
+.Loop_eq_16_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -734,13 +734,13 @@ function PFX(addAvg_48x64_sve2)
     sqxtun          v2.8b, v20.8h
     sqxtun2         v2.16b, v21.8h
     st1             {v0.16b-v2.16b}, [x2], x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_48x64
+    cbnz            w12, .Loop_eq_16_sve2_addavg_48x64
     ret
 .vl_gt_16_addAvg_48x64:
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_48x64
     ptrue           p0.b, vl32
-.loop_gt_eq_32_sve2_addavg_48x64:
+.Loop_gt_eq_32_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -763,14 +763,14 @@ function PFX(addAvg_48x64_sve2)
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     st1b            {z2.h}, p0, [x2, #2, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_32_sve2_addavg_48x64
+    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_48x64
     ret
 .vl_gt_48_addAvg_48x64:
     cmp             x9, #112
     bgt             .vl_gt_112_addAvg_48x64
     ptrue           p0.b, vl64
     ptrue           p1.b, vl32
-.loop_gt_48_sve2_addavg_48x64:
+.Loop_gt_48_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p1/z, [x0, #1, mul vl]
@@ -787,13 +787,13 @@ function PFX(addAvg_48x64_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p1, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_48_sve2_addavg_48x64
+    cbnz            w12, .Loop_gt_48_sve2_addavg_48x64
     ret
 .vl_gt_112_addAvg_48x64:
     mov             x10, #96
     mov             x11, #0
     whilelt         p0.b, x11, x10
-.loop_gt_112_sve2_addavg_48x64:
+.Loop_gt_112_sve2_addavg_48x64:
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z4.b}, p0/z, [x1]
@@ -804,7 +804,7 @@ function PFX(addAvg_48x64_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_112_sve2_addavg_48x64
+    cbnz            w12, .Loop_gt_112_sve2_addavg_48x64
     ret
 endfunc

@@ -817,7 +817,7 @@ function PFX(addAvg_64x\h\()_sve2)
     addAvg_start
     sub             x3, x3, #64
     sub             x4, x4, #64
-.loop_eq_16_sve2_addavg_64x\h\():
+.Loop_eq_16_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -840,13 +840,13 @@ function PFX(addAvg_64x\h\()_sve2)
     sqxtun          v3.8b, v22.8h
     sqxtun2         v3.16b, v23.8h
     st1             {v0.16b-v3.16b}, [x2], x5
-    cbnz            w12, .loop_eq_16_sve2_addavg_64x\h
+    cbnz            w12, .Loop_eq_16_sve2_addavg_64x\h
     ret
 .vl_gt_16_addAvg_64x\h\():
     cmp             x9, #48
     bgt             .vl_gt_48_addAvg_64x\h
     ptrue           p0.b, vl32
-.loop_gt_eq_32_sve2_addavg_64x\h\():
+.Loop_gt_eq_32_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -875,13 +875,13 @@ function PFX(addAvg_64x\h\()_sve2)
     st1b            {z2.h}, p0, [x2, #2, mul vl]
     st1b            {z3.h}, p0, [x2, #3, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_32_sve2_addavg_64x\h
+    cbnz            w12, .Loop_gt_eq_32_sve2_addavg_64x\h
     ret
 .vl_gt_48_addAvg_64x\h\():
     cmp             x9, #112
     bgt             .vl_gt_112_addAvg_64x\h
     ptrue           p0.b, vl64
-.loop_gt_eq_48_sve2_addavg_64x\h\():
+.Loop_gt_eq_48_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z1.b}, p0/z, [x0, #1, mul vl]
@@ -898,11 +898,11 @@ function PFX(addAvg_64x\h\()_sve2)
     st1b            {z0.h}, p0, [x2]
     st1b            {z1.h}, p0, [x2, #1, mul vl]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_48_sve2_addavg_64x\h
+    cbnz            w12, .Loop_gt_eq_48_sve2_addavg_64x\h
     ret
 .vl_gt_112_addAvg_64x\h\():
     ptrue           p0.b, vl128
-.loop_gt_eq_128_sve2_addavg_64x\h\():
+.Loop_gt_eq_128_sve2_addavg_64x\h\():
     sub             w12, w12, #1
     ld1b            {z0.b}, p0/z, [x0]
     ld1b            {z4.b}, p0/z, [x1]
@@ -913,7 +913,7 @@ function PFX(addAvg_64x\h\()_sve2)
     add             z0.b, z0.b, #0x80
     st1b            {z0.h}, p0, [x2]
     add             x2, x2, x5
-    cbnz            w12, .loop_gt_eq_128_sve2_addavg_64x\h
+    cbnz            w12, .Loop_gt_eq_128_sve2_addavg_64x\h
     ret
 endfunc
 .endm
diff --git a/source/common/aarch64/mc-a.S b/source/common/aarch64/mc-a.S
index d122b8bb3..8c2878b3e 100644
--- a/source/common/aarch64/mc-a.S
+++ b/source/common/aarch64/mc-a.S
@@ -283,7 +283,7 @@ function PFX(addAvg_6x\h\()_neon)
     addAvg_start
     mov             w12, #\h / 2
     sub             x5, x5, #4
-.loop_addavg_6x\h:
+.Loop_addavg_6x\h:
     sub             w12, w12, #1
     ld1             {v0.16b}, [x0], x3
     ld1             {v1.16b}, [x1], x4
@@ -305,7 +305,7 @@ function PFX(addAvg_6x\h\()_neon)
     st1             {v0.h}[2], [x2], x5
     str             s1, [x2], #4
     st1             {v1.h}[2], [x2], x5
-    cbnz            w12, .loop_addavg_6x\h
+    cbnz            w12, .Loop_addavg_6x\h
     ret
 endfunc
 .endm
@@ -344,7 +344,7 @@ endfunc
 function PFX(addAvg_8x\h\()_neon)
     addAvg_start
     mov             w12, #\h / 2
-.loop_addavg_8x\h:
+.Loop_addavg_8x\h:
     sub             w12, w12, #1
     ld1             {v0.16b}, [x0], x3
     ld1             {v1.16b}, [x1], x4
@@ -364,7 +364,7 @@ function PFX(addAvg_8x\h\()_neon)
     sqxtun          v1.8b, v1.8h
     st1             {v0.8b}, [x2], x5
     st1             {v1.8b}, [x2], x5
-    cbnz            w12, .loop_addavg_8x\h
+    cbnz            w12, .Loop_addavg_8x\h
     ret
 endfunc
 .endm
@@ -385,7 +385,7 @@ function PFX(addAvg_12x\h\()_neon)
     sub             x4, x4, #16
     sub             x5, x5, #8
     mov             w12, #\h
-.loop_addAvg_12X\h\():
+.Loop_addAvg_12X\h\():
     sub             w12, w12, #1
     ld1             {v0.16b}, [x0], #16
     ld1             {v1.16b}, [x1], #16
@@ -403,7 +403,7 @@ function PFX(addAvg_12x\h\()_neon)
     sqxtun          v1.8b, v1.8h
     st1             {v0.8b}, [x2], #8
     st1             {v1.s}[0], [x2], x5
-    cbnz            w12, .loop_addAvg_12X\h
+    cbnz            w12, .Loop_addAvg_12X\h
     ret
 endfunc
 .endm
@@ -415,7 +415,7 @@ addAvg_12xN 32
 function PFX(addAvg_16x\h\()_neon)
     addAvg_start
     mov             w12, #\h
-.loop_addavg_16x\h:
+.Loop_addavg_16x\h:
     sub             w12, w12, #1
     ld1             {v0.8h-v1.8h}, [x0], x3
     ld1             {v2.8h-v3.8h}, [x1], x4
@@ -424,7 +424,7 @@ function PFX(addAvg_16x\h\()_neon)
     sqxtun          v0.8b, v0.8h
     sqxtun2         v0.16b, v1.8h
     st1             {v0.16b}, [x2], x5
-    cbnz            w12, .loop_addavg_16x\h
+    cbnz            w12, .Loop_addavg_16x\h
     ret
 endfunc
 .endm
@@ -441,7 +441,7 @@ addAvg_16xN 64
 function PFX(addAvg_24x\h\()_neon)
     addAvg_start
     mov             w12, #\h
-.loop_addavg_24x\h\():
+.Loop_addavg_24x\h\():
     sub             w12, w12, #1
     ld1             {v0.16b-v2.16b}, [x0], x3
     ld1             {v3.16b-v5.16b}, [x1], x4
@@ -452,7 +452,7 @@ function PFX(addAvg_24x\h\()_neon)
     sqxtun          v1.8b, v1.8h
     sqxtun          v2.8b, v2.8h
     st1             {v0.8b-v2.8b}, [x2], x5
-    cbnz            w12, .loop_addavg_24x\h
+    cbnz            w12, .Loop_addavg_24x\h
     ret
 endfunc
 .endm
@@ -464,7 +464,7 @@ addAvg_24xN 64
 function PFX(addAvg_32x\h\()_neon)
     addAvg_start
     mov             w12, #\h
-.loop_addavg_32x\h\():
+.Loop_addavg_32x\h\():
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], x3
     ld1             {v4.8h-v7.8h}, [x1], x4
@@ -477,7 +477,7 @@ function PFX(addAvg_32x\h\()_neon)
     sqxtun          v2.8b, v2.8h
     sqxtun          v3.8b, v3.8h
     st1             {v0.8b-v3.8b}, [x2], x5
-    cbnz            w12, .loop_addavg_32x\h
+    cbnz            w12, .Loop_addavg_32x\h
     ret
 endfunc
 .endm
@@ -494,7 +494,7 @@ function PFX(addAvg_48x64_neon)
     sub             x3, x3, #64
     sub             x4, x4, #64
     mov             w12, #64
-.loop_addavg_48x64:
+.Loop_addavg_48x64:
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -513,7 +513,7 @@ function PFX(addAvg_48x64_neon)
     sqxtun          v2.8b, v20.8h
     sqxtun2         v2.16b, v21.8h
     st1             {v0.16b-v2.16b}, [x2], x5
-    cbnz            w12, .loop_addavg_48x64
+    cbnz            w12, .Loop_addavg_48x64
     ret
 endfunc

@@ -523,7 +523,7 @@ function PFX(addAvg_64x\h\()_neon)
     mov             w12, #\h
     sub             x3, x3, #64
     sub             x4, x4, #64
-.loop_addavg_64x\h\():
+.Loop_addavg_64x\h\():
     sub             w12, w12, #1
     ld1             {v0.8h-v3.8h}, [x0], #64
     ld1             {v4.8h-v7.8h}, [x1], #64
@@ -546,7 +546,7 @@ function PFX(addAvg_64x\h\()_neon)
     sqxtun          v3.8b, v22.8h
     sqxtun2         v3.16b, v23.8h
     st1             {v0.16b-v3.16b}, [x2], x5
-    cbnz            w12, .loop_addavg_64x\h
+    cbnz            w12, .Loop_addavg_64x\h
     ret
 endfunc
 .endm
diff --git a/source/common/aarch64/p2s-sve.S b/source/common/aarch64/p2s-sve.S
index dc32df2e6..85bb14b3d 100644
--- a/source/common/aarch64/p2s-sve.S
+++ b/source/common/aarch64/p2s-sve.S
@@ -204,7 +204,7 @@ function PFX(filterPixelToShort_32x\h\()_sve)
 #else
     p2s_start
     mov             x9, #\h
-.loop_filter_sve_P2S_32x\h:
+.Loop_filter_sve_P2S_32x\h:
     sub             x9, x9, #1
     ld1             {v0.16b-v1.16b}, [x0], x1
     ushll           v22.8h, v0.8b,  #P2S_SHIFT
@@ -216,7 +216,7 @@ function PFX(filterPixelToShort_32x\h\()_sve)
     add             v24.8h, v24.8h, v31.8h
     add             v25.8h, v25.8h, v31.8h
     st1             {v22.16b-v25.16b}, [x2], x3
-    cbnz            x9, .loop_filter_sve_P2S_32x\h
+    cbnz            x9, .Loop_filter_sve_P2S_32x\h
     ret
 #endif
 endfunc
@@ -331,7 +331,7 @@ function PFX(filterPixelToShort_64x\h\()_sve)
     p2s_start
     sub             x3, x3, #64
     mov             x9, #\h
-.loop_filter_sve_P2S_64x\h:
+.Loop_filter_sve_P2S_64x\h:
     sub             x9, x9, #1
     ld1             {v0.16b-v3.16b}, [x0], x1
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
@@ -352,7 +352,7 @@ function PFX(filterPixelToShort_64x\h\()_sve)
     add             v23.8h, v23.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v23.16b}, [x2], x3
-    cbnz            x9, .loop_filter_sve_P2S_64x\h
+    cbnz            x9, .Loop_filter_sve_P2S_64x\h
     ret
 #endif
 endfunc
@@ -422,7 +422,7 @@ function PFX(filterPixelToShort_48x64_sve)
     p2s_start
     sub             x3, x3, #64
     mov             x9, #64
-.loop_filterP2S_sve_48x64:
+.Loop_filterP2S_sve_48x64:
     sub            x9, x9, #1
     ld1             {v0.16b-v2.16b}, [x0], x1
     ushll           v16.8h, v0.8b,  #P2S_SHIFT
@@ -439,7 +439,7 @@ function PFX(filterPixelToShort_48x64_sve)
     add             v21.8h, v21.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v21.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_sve_48x64
+    cbnz            x9, .Loop_filterP2S_sve_48x64
     ret
 #endif
 endfunc
diff --git a/source/common/aarch64/p2s.S b/source/common/aarch64/p2s.S
index 58301c9bf..b15835b34 100644
--- a/source/common/aarch64/p2s.S
+++ b/source/common/aarch64/p2s.S
@@ -262,7 +262,7 @@ p2s_24xN 64
 function PFX(filterPixelToShort_32x\h\()_neon)
     p2s_start
     mov             x9, #\h
-.loop_filterP2S_32x\h:
+.Loop_filterP2S_32x\h:
     sub             x9, x9, #1
 #if HIGH_BIT_DEPTH
     ld1             {v0.16b-v3.16b}, [x0], x1
@@ -282,7 +282,7 @@ function PFX(filterPixelToShort_32x\h\()_neon)
     add             v24.8h, v24.8h, v31.8h
     add             v25.8h, v25.8h, v31.8h
     st1             {v22.16b-v25.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_32x\h
+    cbnz            x9, .Loop_filterP2S_32x\h
     ret
 endfunc
 .endm
@@ -302,7 +302,7 @@ function PFX(filterPixelToShort_64x\h\()_neon)
 #endif
     sub             x3, x3, #64
     mov             x9, #\h
-.loop_filterP2S_64x\h:
+.Loop_filterP2S_64x\h:
     sub             x9, x9, #1
 #if HIGH_BIT_DEPTH
     ld1             {v0.16b-v3.16b}, [x0], #64
@@ -336,7 +336,7 @@ function PFX(filterPixelToShort_64x\h\()_neon)
     add             v23.8h, v23.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v23.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_64x\h
+    cbnz            x9, .Loop_filterP2S_64x\h
     ret
 endfunc
 .endm
@@ -353,7 +353,7 @@ function PFX(filterPixelToShort_48x64_neon)
 #endif
     sub             x3, x3, #64
     mov             x9, #64
-.loop_filterP2S_48x64:
+.Loop_filterP2S_48x64:
     sub            x9, x9, #1
 #if HIGH_BIT_DEPTH
     ld1             {v0.16b-v3.16b}, [x0], #64
@@ -381,6 +381,6 @@ function PFX(filterPixelToShort_48x64_neon)
     add             v21.8h, v21.8h, v31.8h
     st1             {v16.16b-v19.16b}, [x2], #64
     st1             {v20.16b-v21.16b}, [x2], x3
-    cbnz            x9, .loop_filterP2S_48x64
+    cbnz            x9, .Loop_filterP2S_48x64
     ret
 endfunc
diff --git a/source/common/aarch64/pixel-util-sve.S b/source/common/aarch64/pixel-util-sve.S
index 715fcc1cb..c1d6b4129 100644
--- a/source/common/aarch64/pixel-util-sve.S
+++ b/source/common/aarch64/pixel-util-sve.S
@@ -333,7 +333,7 @@ function PFX(quant_sve)
     eor             w10, w10, w10
     eor             z17.d, z17.d, z17.d

-.loop_quant_sve:
+.Loop_quant_sve:
     ld1             {v18.4h}, [x0], #8
     ld1             {v7.4s}, [x1], #16
     sxtl            v6.4s, v18.4h
@@ -364,7 +364,7 @@ function PFX(quant_sve)
     st1             {v5.4h}, [x3], #8

     subs            w6, w6, #1
-    b.ne             .loop_quant_sve
+    b.ne             .Loop_quant_sve

     addv            s4, v4.4s
     mov             w9, v4.s[0]
diff --git a/source/common/aarch64/pixel-util-sve2.S b/source/common/aarch64/pixel-util-sve2.S
index dbd138f62..2af5d63c1 100644
--- a/source/common/aarch64/pixel-util-sve2.S
+++ b/source/common/aarch64/pixel-util-sve2.S
@@ -64,11 +64,11 @@ function PFX(pixel_var_16x16_sve2)
     bgt             .vl_gt_16_pixel_var_16x16
     pixel_var_start
     mov             w12, #16
-.loop_var_16_sve2:
+.Loop_var_16_sve2:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     pixel_var_1 v4
-    cbnz            w12, .loop_var_16_sve2
+    cbnz            w12, .Loop_var_16_sve2
     pixel_var_end
     ret
 .vl_gt_16_pixel_var_16x16:
@@ -95,12 +95,12 @@ function PFX(pixel_var_32x32_sve2)
     bgt             .vl_gt_16_pixel_var_32x32
     pixel_var_start
     mov             w12, #32
-.loop_var_32_sve2:
+.Loop_var_32_sve2:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
-    cbnz            w12, .loop_var_32_sve2
+    cbnz            w12, .Loop_var_32_sve2
     pixel_var_end
     ret
 .vl_gt_16_pixel_var_32x32:
@@ -150,14 +150,14 @@ function PFX(pixel_var_64x64_sve2)
     bgt             .vl_gt_16_pixel_var_64x64
     pixel_var_start
     mov             w12, #64
-.loop_var_64_sve2:
+.Loop_var_64_sve2:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
     pixel_var_1 v6
     pixel_var_1 v7
-    cbnz            w12, .loop_var_64_sve2
+    cbnz            w12, .Loop_var_64_sve2
     pixel_var_end
     ret
 .vl_gt_16_pixel_var_64x64:
@@ -268,7 +268,7 @@ function PFX(getResidual32_sve2)
     bgt             .vl_gt_16_getResidual32
     lsl             x4, x3, #1
     mov             w12, #4
-.loop_residual_32:
+.Loop_residual_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x0], x3
@@ -286,7 +286,7 @@ function PFX(getResidual32_sve2)
     st1             {v16.8h-v19.8h}, [x2], x4
     st1             {v20.8h-v23.8h}, [x2], x4
 .endr
-    cbnz            w12, .loop_residual_32
+    cbnz            w12, .Loop_residual_32
     ret
 .vl_gt_16_getResidual32:
     cmp             x9, #48
@@ -323,7 +323,7 @@ function PFX(pixel_sub_ps_32x32_sve2)
     bgt             .vl_gt_16_pixel_sub_ps_32x32
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_sub_ps_32_sve2:
+.Loop_sub_ps_32_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -341,7 +341,7 @@ function PFX(pixel_sub_ps_32x32_sve2)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32_sve2
+    cbnz            w12, .Loop_sub_ps_32_sve2
     ret
 .vl_gt_16_pixel_sub_ps_32x32:
     cmp             x9, #48
@@ -387,7 +387,7 @@ function PFX(pixel_sub_ps_64x64_sve2)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_sub_ps_64_sve2:
+.Loop_sub_ps_64_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x4
@@ -403,7 +403,7 @@ function PFX(pixel_sub_ps_64x64_sve2)
     st1             {v16.8h-v19.8h}, [x0], #64
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_64_sve2
+    cbnz            w12, .Loop_sub_ps_64_sve2
     ret
 .vl_gt_16_pixel_sub_ps_64x64:
     rdvl            x9, #1
@@ -473,7 +473,7 @@ function PFX(pixel_sub_ps_32x64_sve2)
     bgt             .vl_gt_16_pixel_sub_ps_32x64
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_sub_ps_32x64_sve2:
+.Loop_sub_ps_32x64_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -491,7 +491,7 @@ function PFX(pixel_sub_ps_32x64_sve2)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32x64_sve2
+    cbnz            w12, .Loop_sub_ps_32x64_sve2
     ret
 .vl_gt_16_pixel_sub_ps_32x64:
     cmp             x9, #48
@@ -609,7 +609,7 @@ pixel_add_ps_16xN_sve2 32
     bgt             .vl_gt_16_pixel_add_ps_32x\h
     lsl             x5, x5, #1
     mov             w12, #\h / 4
-.loop_add_ps__sve2_32x\h\():
+.Loop_add_ps__sve2_32x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -628,7 +628,7 @@ pixel_add_ps_16xN_sve2 32
     sqxtun2         v5.16b, v27.8h
     st1             {v4.16b-v5.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps__sve2_32x\h
+    cbnz            w12, .Loop_add_ps__sve2_32x\h
     ret
 .vl_gt_16_pixel_add_ps_32x\h\():
     cmp             x9, #48
@@ -1157,7 +1157,7 @@ function PFX(ssimDist16_sve2)
     bgt             .vl_gt_16_ssimDist16
     ssimDist_start
     ptrue           p0.s, vl4
-.loop_ssimDist16_sve2:
+.Loop_ssimDist16_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1171,7 +1171,7 @@ function PFX(ssimDist16_sve2)
     add             x2, x2, x3
     ssimDist_1_sve2 z4, z5, z8, z9
     ssimDist_1_sve2 z6, z7, z10, z11
-    cbnz            w12, .loop_ssimDist16_sve2
+    cbnz            w12, .Loop_ssimDist16_sve2
     ssimDist_end
     ret
 .vl_gt_16_ssimDist16:
@@ -1217,7 +1217,7 @@ function PFX(ssimDist32_sve2)
     bgt             .vl_gt_16_ssimDist32
     ssimDist_start
     ptrue           p0.s, vl4
-.loop_ssimDist32_sve2:
+.Loop_ssimDist32_sve2:
     sub             w12, w12, #1
     ld1b            {z2.s}, p0/z, [x0]
     ld1b            {z3.s}, p0/z, [x0, #1, mul vl]
@@ -1241,7 +1241,7 @@ function PFX(ssimDist32_sve2)
     ssimDist_1_sve2 z4, z5, z12, z13
     ssimDist_1_sve2 z6, z7, z14, z15
     ssimDist_1_sve2 z8, z9, z30, z31
-    cbnz            w12, .loop_ssimDist32_sve2
+    cbnz            w12, .Loop_ssimDist32_sve2
     ssimDist_end
     ret
 .vl_gt_16_ssimDist32:
@@ -1309,7 +1309,7 @@ function PFX(ssimDist64_sve2)
     bgt             .vl_gt_16_ssimDist64
     ssimDist_start
     ptrue           p0.s, vl4
-.loop_ssimDist64_sve2:
+.Loop_ssimDist64_sve2:
     sub             w12, w12, #1
     ld1b            {z2.s}, p0/z, [x0]
     ld1b            {z3.s}, p0/z, [x0, #1, mul vl]
@@ -1357,7 +1357,7 @@ function PFX(ssimDist64_sve2)
     ssimDist_1_sve2 z8, z9, z29, z30
     add             x0, x0, x1
     add             x2, x2, x3
-    cbnz            w12, .loop_ssimDist64_sve2
+    cbnz            w12, .Loop_ssimDist64_sve2
     ssimDist_end
     ret
 .vl_gt_16_ssimDist64:
@@ -1482,7 +1482,7 @@ function PFX(normFact16_sve2)
     bgt             .vl_gt_16_normFact16
     normFact_start
     ptrue           p0.s, vl4
-.loop_normFact16_sve2:
+.Loop_normFact16_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1491,7 +1491,7 @@ function PFX(normFact16_sve2)
     add             x0, x0, x1
     normFact_1_sve2 z4, z5
     normFact_1_sve2 z6, z7
-    cbnz            w12, .loop_normFact16_sve2
+    cbnz            w12, .Loop_normFact16_sve2
     normFact_end
     ret
 .vl_gt_16_normFact16:
@@ -1529,7 +1529,7 @@ function PFX(normFact32_sve2)
     bgt             .vl_gt_16_normFact32
     normFact_start
     ptrue           p0.s, vl4
-.loop_normFact32_sve2:
+.Loop_normFact32_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1544,7 +1544,7 @@ function PFX(normFact32_sve2)
     normFact_1_sve2 z6, z7
     normFact_1_sve2 z8, z9
     normFact_1_sve2 z10, z11
-    cbnz            w12, .loop_normFact32_sve2
+    cbnz            w12, .Loop_normFact32_sve2
     normFact_end
     ret
 .vl_gt_16_normFact32:
@@ -1599,7 +1599,7 @@ function PFX(normFact64_sve2)
     bgt             .vl_gt_16_normFact64
     normFact_start
     ptrue           p0.s, vl4
-.loop_normFact64_sve2:
+.Loop_normFact64_sve2:
     sub             w12, w12, #1
     ld1b            {z4.s}, p0/z, [x0]
     ld1b            {z5.s}, p0/z, [x0, #1, mul vl]
@@ -1628,7 +1628,7 @@ function PFX(normFact64_sve2)
     normFact_1_sve2 z8, z9
     normFact_1_sve2 z10, z11
     add             x0, x0, x1
-    cbnz            w12, .loop_normFact64_sve2
+    cbnz            w12, .Loop_normFact64_sve2
     normFact_end
     ret
 .vl_gt_16_normFact64:
diff --git a/source/common/aarch64/pixel-util.S b/source/common/aarch64/pixel-util.S
index 378c6891c..1df49ba6e 100644
--- a/source/common/aarch64/pixel-util.S
+++ b/source/common/aarch64/pixel-util.S
@@ -60,11 +60,11 @@ endfunc
 function PFX(pixel_var_16x16_neon)
     pixel_var_start
     mov             w12, #16
-.loop_var_16:
+.Loop_var_16:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     pixel_var_1 v4
-    cbnz            w12, .loop_var_16
+    cbnz            w12, .Loop_var_16
     pixel_var_end
     ret
 endfunc
@@ -72,12 +72,12 @@ endfunc
 function PFX(pixel_var_32x32_neon)
     pixel_var_start
     mov             w12, #32
-.loop_var_32:
+.Loop_var_32:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
-    cbnz            w12, .loop_var_32
+    cbnz            w12, .Loop_var_32
     pixel_var_end
     ret
 endfunc
@@ -85,14 +85,14 @@ endfunc
 function PFX(pixel_var_64x64_neon)
     pixel_var_start
     mov             w12, #64
-.loop_var_64:
+.Loop_var_64:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     pixel_var_1 v4
     pixel_var_1 v5
     pixel_var_1 v6
     pixel_var_1 v7
-    cbnz            w12, .loop_var_64
+    cbnz            w12, .Loop_var_64
     pixel_var_end
     ret
 endfunc
@@ -148,7 +148,7 @@ endfunc
 function PFX(getResidual32_neon)
     lsl             x4, x3, #1
     mov             w12, #4
-.loop_residual_32:
+.Loop_residual_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x0], x3
@@ -166,7 +166,7 @@ function PFX(getResidual32_neon)
     st1             {v16.8h-v19.8h}, [x2], x4
     st1             {v20.8h-v23.8h}, [x2], x4
 .endr
-    cbnz            w12, .loop_residual_32
+    cbnz            w12, .Loop_residual_32
     ret
 endfunc

@@ -221,7 +221,7 @@ endfunc
 function PFX(pixel_sub_ps_32x32_neon)
     lsl             x1, x1, #1
     mov             w12, #4
-.loop_sub_ps_32:
+.Loop_sub_ps_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -239,7 +239,7 @@ function PFX(pixel_sub_ps_32x32_neon)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32
+    cbnz            w12, .Loop_sub_ps_32
     ret
 endfunc

@@ -247,7 +247,7 @@ function PFX(pixel_sub_ps_64x64_neon)
     lsl             x1, x1, #1
     sub             x1, x1, #64
     mov             w12, #16
-.loop_sub_ps_64:
+.Loop_sub_ps_64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v3.16b}, [x2], x4
@@ -263,7 +263,7 @@ function PFX(pixel_sub_ps_64x64_neon)
     st1             {v16.8h-v19.8h}, [x0], #64
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_64
+    cbnz            w12, .Loop_sub_ps_64
     ret
 endfunc

@@ -318,7 +318,7 @@ endfunc
 function PFX(pixel_sub_ps_32x64_neon)
     lsl             x1, x1, #1
     mov             w12, #8
-.loop_sub_ps_32x64:
+.Loop_sub_ps_32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -336,7 +336,7 @@ function PFX(pixel_sub_ps_32x64_neon)
     st1             {v16.8h-v19.8h}, [x0], x1
     st1             {v20.8h-v23.8h}, [x0], x1
 .endr
-    cbnz            w12, .loop_sub_ps_32x64
+    cbnz            w12, .Loop_sub_ps_32x64
     ret
 endfunc

@@ -383,7 +383,7 @@ endfunc
 function PFX(pixel_add_ps_16x\h\()_neon)
     lsl             x5, x5, #1
     mov             w12, #\h / 8
-.loop_add_ps_16x\h\():
+.Loop_add_ps_16x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b}, [x2], x4
@@ -405,7 +405,7 @@ function PFX(pixel_add_ps_16x\h\()_neon)
     st1             {v4.16b}, [x0], x1
     st1             {v5.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps_16x\h
+    cbnz            w12, .Loop_add_ps_16x\h
     ret
 endfunc
 .endm
@@ -417,7 +417,7 @@ pixel_add_ps_16xN_neon 32
  function PFX(pixel_add_ps_32x\h\()_neon)
     lsl             x5, x5, #1
     mov             w12, #\h / 4
-.loop_add_ps_32x\h\():
+.Loop_add_ps_32x\h\():
     sub             w12, w12, #1
 .rept 4
     ld1             {v0.16b-v1.16b}, [x2], x4
@@ -436,7 +436,7 @@ pixel_add_ps_16xN_neon 32
     sqxtun2         v5.16b, v27.8h
     st1             {v4.16b-v5.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps_32x\h
+    cbnz            w12, .Loop_add_ps_32x\h
     ret
 endfunc
 .endm
@@ -448,7 +448,7 @@ function PFX(pixel_add_ps_64x64_neon)
     lsl             x5, x5, #1
     sub             x5, x5, #64
     mov             w12, #32
-.loop_add_ps_64x64:
+.Loop_add_ps_64x64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v0.16b-v3.16b}, [x2], x4
@@ -480,7 +480,7 @@ function PFX(pixel_add_ps_64x64_neon)
     sqxtun2         v3.16b, v7.8h
     st1             {v0.16b-v3.16b}, [x0], x1
 .endr
-    cbnz            w12, .loop_add_ps_64x64
+    cbnz            w12, .Loop_add_ps_64x64
     ret
 endfunc

@@ -548,7 +548,7 @@ endfunc
 // void scale2D_64to32(pixel* dst, const pixel* src, intptr_t stride)
 function PFX(scale2D_64to32_neon)
     mov             w12, #32
-.loop_scale2D:
+.Loop_scale2D:
     ld1             {v0.16b-v3.16b}, [x1], x2
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x1], x2
@@ -561,7 +561,7 @@ function PFX(scale2D_64to32_neon)
     uqrshrn         v1.8b, v2.8h, #2
     uqrshrn2        v1.16b, v3.8h, #2
     st1             {v0.16b-v1.16b}, [x0], #32
-    cbnz            w12, .loop_scale2D
+    cbnz            w12, .Loop_scale2D
     ret
 endfunc

@@ -569,33 +569,33 @@ endfunc
 function PFX(pixel_planecopy_cp_neon)
     dup             v2.16b, w6
     sub             x5, x5, #1
-.loop_h:
+.Loop_h:
     mov             x6, x0
     mov             x12, x2
     mov             x7, #0
-.loop_w:
+.Loop_w:
     ldr             q0, [x6], #16
     ushl            v0.16b, v0.16b, v2.16b
     str             q0, [x12], #16
     add             x7, x7, #16
     cmp             x7, x4
-    blt             .loop_w
+    blt             .Loop_w

     add             x0, x0, x1
     add             x2, x2, x3
     sub             x5, x5, #1
-    cbnz            x5, .loop_h
+    cbnz            x5, .Loop_h

 // handle last row
     mov             x5, x4
     lsr             x5, x5, #3
-.loopW8:
+.LoopW8:
     ldr             d0, [x0], #8
     ushl            v0.8b, v0.8b, v2.8b
     str             d0, [x2], #8
     sub             x4, x4, #8
     sub             x5, x5, #1
-    cbnz            x5, .loopW8
+    cbnz            x5, .LoopW8

     mov             x5, #8
     sub             x5, x5, x4
@@ -1508,7 +1508,7 @@ function PFX(pixel_sa8d_32x64_neon)
     mov             x10, x30
     mov             w11, #4
     mov             w9, #0
-.loop_sa8d_32:
+.Loop_sa8d_32:
     sub             w11, w11, #1
     sa8d_16x16      w4
     sub             x0, x0, x1, lsl #4
@@ -1520,7 +1520,7 @@ function PFX(pixel_sa8d_32x64_neon)
     add             w9, w9, w4
     sub             x0, x0, #24
     sub             x2, x2, #24
-    cbnz            w11, .loop_sa8d_32
+    cbnz            w11, .Loop_sa8d_32
     mov             w0, w9
     ret             x10
 endfunc
@@ -1529,7 +1529,7 @@ function PFX(pixel_sa8d_64x64_neon)
     mov             x10, x30
     mov             w11, #4
     mov             w9, #0
-.loop_sa8d_64:
+.Loop_sa8d_64:
     sub             w11, w11, #1
     sa8d_16x16      w4
     sub             x0, x0, x1, lsl #4
@@ -1554,7 +1554,7 @@ function PFX(pixel_sa8d_64x64_neon)

     sub             x0, x0, #56
     sub             x2, x2, #56
-    cbnz            w11, .loop_sa8d_64
+    cbnz            w11, .Loop_sa8d_64
     mov             w0, w9
     ret             x10
 endfunc
@@ -1807,7 +1807,7 @@ function PFX(quant_neon)
     eor             w10, w10, w10
     eor             v17.16b, v17.16b, v17.16b

-.loop_quant:
+.Loop_quant:

     ld1             {v18.4h}, [x0], #8
     ld1             {v7.4s}, [x1], #16
@@ -1839,7 +1839,7 @@ function PFX(quant_neon)
     st1             {v5.4h}, [x3], #8

     subs            w6, w6, #1
-    b.ne             .loop_quant
+    b.ne             .Loop_quant

     addv            s4, v4.4s
     mov             w9, v4.s[0]
@@ -1858,7 +1858,7 @@ function PFX(nquant_neon)
     mov             x4, #0
     movi            v22.4s, #0

-.loop_nquant:
+.Loop_nquant:
     ld1             {v16.4h}, [x0], #8
     sub             w5, w5, #1
     sxtl            v19.4s, v16.4h         // v19 = coef[blockpos]
@@ -1883,7 +1883,7 @@ function PFX(nquant_neon)
     abs             v17.4h, v16.4h
     st1             {v17.4h}, [x2], #8

-    cbnz            w5, .loop_nquant
+    cbnz            w5, .Loop_nquant

     uaddlv          d4, v4.4s
     fmov            x12, d4
@@ -1937,7 +1937,7 @@ endfunc
 function PFX(ssimDist16_neon)
     mov w12, #16
     ssimDist_start
-.loop_ssimDist16:
+.Loop_ssimDist16:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     ld1             {v5.16b}, [x2], x3
@@ -1947,7 +1947,7 @@ function PFX(ssimDist16_neon)
     uxtl2           v5.8h, v5.16b
     ssimDist_1      v6, v7
     ssimDist_1      v4, v5
-    cbnz            w12, .loop_ssimDist16
+    cbnz            w12, .Loop_ssimDist16
     ssimDist_end
     ret
 endfunc
@@ -1955,7 +1955,7 @@ endfunc
 function PFX(ssimDist32_neon)
     mov w12, #32
     ssimDist_start
-.loop_ssimDist32:
+.Loop_ssimDist32:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     ld1             {v6.16b-v7.16b}, [x2], x3
@@ -1971,7 +1971,7 @@ function PFX(ssimDist32_neon)
     ssimDist_1      v23, v24
     ssimDist_1      v25, v26
     ssimDist_1      v27, v28
-    cbnz            w12, .loop_ssimDist32
+    cbnz            w12, .Loop_ssimDist32
     ssimDist_end
     ret
 endfunc
@@ -1979,7 +1979,7 @@ endfunc
 function PFX(ssimDist64_neon)
     mov w12, #64
     ssimDist_start
-.loop_ssimDist64:
+.Loop_ssimDist64:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     ld1             {v16.16b-v19.16b}, [x2], x3
@@ -2007,7 +2007,7 @@ function PFX(ssimDist64_neon)
     ssimDist_1      v23, v24
     ssimDist_1      v25, v26
     ssimDist_1      v27, v28
-    cbnz            w12, .loop_ssimDist64
+    cbnz            w12, .Loop_ssimDist64
     ssimDist_end
     ret
 endfunc
@@ -2035,14 +2035,14 @@ endfunc
 function PFX(normFact16_neon)
     mov w12, #16
     normFact_start
-.loop_normFact16:
+.Loop_normFact16:
     sub             w12, w12, #1
     ld1             {v4.16b}, [x0], x1
     uxtl            v5.8h, v4.8b
     uxtl2           v4.8h, v4.16b
     normFact_1      v5
     normFact_1      v4
-    cbnz            w12, .loop_normFact16
+    cbnz            w12, .Loop_normFact16
     normFact_end
     ret
 endfunc
@@ -2050,7 +2050,7 @@ endfunc
 function PFX(normFact32_neon)
     mov w12, #32
     normFact_start
-.loop_normFact32:
+.Loop_normFact32:
     sub             w12, w12, #1
     ld1             {v4.16b-v5.16b}, [x0], x1
     uxtl            v6.8h, v4.8b
@@ -2061,7 +2061,7 @@ function PFX(normFact32_neon)
     normFact_1      v5
     normFact_1      v6
     normFact_1      v7
-    cbnz            w12, .loop_normFact32
+    cbnz            w12, .Loop_normFact32
     normFact_end
     ret
 endfunc
@@ -2069,7 +2069,7 @@ endfunc
 function PFX(normFact64_neon)
     mov w12, #64
     normFact_start
-.loop_normFact64:
+.Loop_normFact64:
     sub             w12, w12, #1
     ld1             {v4.16b-v7.16b}, [x0], x1
     uxtl            v26.8h, v4.8b
@@ -2088,7 +2088,7 @@ function PFX(normFact64_neon)
     normFact_1      v25
     normFact_1      v26
     normFact_1      v27
-    cbnz            w12, .loop_normFact64
+    cbnz            w12, .Loop_normFact64
     normFact_end
     ret
 endfunc
@@ -2120,9 +2120,9 @@ function PFX(weight_pp_neon)
     cbnz            w11, .widenTo32Bit

     // 16-bit arithmetic is enough.
-.loopHpp:
+.LoopHpp:
     mov             x12, x3
-.loopWpp:
+.LoopWpp:
     ldr             q0, [x0], #16
     sub             x12, x12, #16
     umull           v1.8h, v0.8b, v25.8b  // val *= w0 << correction >> shift
@@ -2132,18 +2132,18 @@ function PFX(weight_pp_neon)
     sqxtun          v0.8b, v1.8h          // val = x265_clip(val)
     sqxtun2         v0.16b, v2.8h
     str             q0, [x1], #16
-    cbnz            x12, .loopWpp
+    cbnz            x12, .LoopWpp
     add             x1, x1, x2
     add             x0, x0, x2
     sub             x4, x4, #1
-    cbnz            x4, .loopHpp
+    cbnz            x4, .LoopHpp
     ret

     // 32-bit arithmetic is needed.
 .widenTo32Bit:
-.loopHpp32:
+.LoopHpp32:
     mov             x12, x3
-.loopWpp32:
+.LoopWpp32:
     ldr             d0, [x0], #8
     sub             x12, x12, #8
     uxtl            v0.8h, v0.8b
@@ -2155,11 +2155,11 @@ function PFX(weight_pp_neon)
     sqxtn2          v0.8h, v2.4s
     sqxtun          v0.8b, v0.8h
     str             d0, [x1], #8
-    cbnz            x12, .loopWpp32
+    cbnz            x12, .LoopWpp32
     add             x1, x1, x2
     add             x0, x0, x2
     sub             x4, x4, #1
-    cbnz            x4, .loopHpp32
+    cbnz            x4, .LoopHpp32
     ret

     // The shift right cannot be moved out of the loop.
@@ -2169,9 +2169,9 @@ function PFX(weight_pp_neon)
     neg             w7, w7                // -shift
     dup             v27.4s, w7
     dup             v29.4s, w9            // offset
-.loopHppUS:
+.LoopHppUS:
     mov             x12, x3
-.loopWppUS:
+.LoopWppUS:
     ldr             d0, [x0], #8
     sub             x12, x12, #8
     uxtl            v0.8h, v0.8b
@@ -2187,11 +2187,11 @@ function PFX(weight_pp_neon)
     sqxtn2          v0.8h, v2.4s
     sqxtun          v0.8b, v0.8h
     str             d0, [x1], #8
-    cbnz            x12, .loopWppUS
+    cbnz            x12, .LoopWppUS
     add             x1, x1, x2
     add             x0, x0, x2
     sub             x4, x4, #1
-    cbnz            x4, .loopHppUS
+    cbnz            x4, .LoopHppUS
     ret
 endfunc

@@ -2220,7 +2220,7 @@ function PFX(scanPosLast_neon)
     add             x11, x10, x7    // 3*x7
     add             x9, x4, #1      // CG count

-.loop_spl:
+.Loop_spl:
     // position of current CG
     ldrh            w6, [x0], #32
     add             x6, x1, x6, lsl #1
@@ -2267,14 +2267,14 @@ function PFX(scanPosLast_neon)
     // accelerate by preparing w13 = w13 & w15
     and             w13, w13, w15
     mov             x14, xzr
-.loop_spl_1:
+.Loop_spl_1:
     cbz             w15, .pext_end
     clz             w6, w15
     lsl             w13, w13, w6
     lsl             w15, w15, w6
     extr            w14, w14, w13, #31
     bfm             w15, wzr, #1, #0
-    b               .loop_spl_1
+    b               .Loop_spl_1
 .pext_end:
     strh            w14, [x2], #2

@@ -2285,7 +2285,7 @@ function PFX(scanPosLast_neon)
     sub             x5, x5, x6
     strb            w6, [x4], #1

-    cbnz            x5, .loop_spl
+    cbnz            x5, .Loop_spl

     // count trailing zeros
     rbit            w13, w12
@@ -2364,7 +2364,7 @@ function PFX(costCoeffNxN_neon)
     mov             x11, #0
     movi            v31.16b, #0
     cbz             x2, .idx_zero
-.loop_ccnn:
+.Loop_ccnn:
 //   {
 //        const uint32_t cnt = tabSigCtx[blkPos] + offset + posOffset;
 //        ctxSig = cnt & posZeroMask;
@@ -2403,7 +2403,7 @@ function PFX(costCoeffNxN_neon)
     cmp             w9, #1
     csel            w10, w11, w10, eq
     strb            w10, [x6, x14]
-    cbnz            x2, .loop_ccnn
+    cbnz            x2, .Loop_ccnn
 .idx_zero:

     add             x13, x3, x4, lsl #1
diff --git a/source/common/aarch64/sad-a-sve2.S b/source/common/aarch64/sad-a-sve2.S
index 9c86d84b6..599a3719a 100644
--- a/source/common/aarch64/sad-a-sve2.S
+++ b/source/common/aarch64/sad-a-sve2.S
@@ -217,12 +217,12 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
     SAD_START_\w

     mov             w9, #\h/8
-.loop_sve2_\w\()x\h:
+.Loop_sve2_\w\()x\h:
     sub             w9, w9, #1
 .rept 4
     SAD_\w
 .endr
-    cbnz            w9, .loop_sve2_\w\()x\h
+    cbnz            w9, .Loop_sve2_\w\()x\h

     SAD_END_\w

@@ -231,12 +231,12 @@ function PFX(pixel_sad_\w\()x\h\()_sve2)
     SAD_START_\w

     mov             w9, #\h/8
-.loop_sve2_loop_\w\()x\h:
+.Loop_sve2_loop_\w\()x\h:
     sub             w9, w9, #1
 .rept 4
     SAD_\w
 .endr
-    cbnz            w9, .loop_sve2_loop_\w\()x\h
+    cbnz            w9, .Loop_sve2_loop_\w\()x\h

     SAD_END_\w
 .else
@@ -402,7 +402,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
     bgt             .vl_gt_16_sad_x_loop_\x\()_\w\()x\h
     SAD_X_START_\w \x
     mov             w12, #\h/4
-.loop_sad_sve2_x\x\()_\w\()x\h:
+.Loop_sad_sve2_x\x\()_\w\()x\h:
     sub             w12, w12, #1
  .rept 4
   .if \w == 24
@@ -422,7 +422,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
     SAD_X_\w x4, v19, v23
   .endif
  .endr
-    cbnz            w12, .loop_sad_sve2_x\x\()_\w\()x\h
+    cbnz            w12, .Loop_sad_sve2_x\x\()_\w\()x\h
     SAD_X_END_\w \x
 .vl_gt_16_sad_x_loop_\x\()_\w\()x\h\():
 .if \w == 24 || \w == 32
@@ -431,7 +431,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
 .else
     SAD_X_START_\w \x
     mov             w12, #\h/4
-.loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
+.Loop_sad_sve2_gt_16_x\x\()_\w\()x\h:
     sub             w12, w12, #1
  .rept 4
   .if \w == 24
@@ -451,7 +451,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_sve2)
     SAD_X_\w x4, v19, v23
   .endif
  .endr
-    cbnz            w12, .loop_sad_sve2_gt_16_x\x\()_\w\()x\h
+    cbnz            w12, .Loop_sad_sve2_gt_16_x\x\()_\w\()x\h
     SAD_X_END_\w \x
 .endif
 endfunc
diff --git a/source/common/aarch64/sad-a.S b/source/common/aarch64/sad-a.S
index 20d7cac7c..7460825f1 100644
--- a/source/common/aarch64/sad-a.S
+++ b/source/common/aarch64/sad-a.S
@@ -55,12 +55,12 @@ function PFX(pixel_sad_\w\()x\h\()_neon)
     SAD_START_\w

     mov             w9, #\h/8
-.loop_\w\()x\h:
+.Loop_\w\()x\h:
     sub             w9, w9, #1
 .rept 4
     SAD_\w
 .endr
-    cbnz            w9, .loop_\w\()x\h
+    cbnz            w9, .Loop_\w\()x\h

     SAD_END_\w
 endfunc
@@ -129,7 +129,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_neon)
 .endif
     SAD_X_START_\w \x
     mov             w12, #\h/4
-.loop_sad_x\x\()_\w\()x\h:
+.Loop_sad_x\x\()_\w\()x\h:
     sub             w12, w12, #1
  .rept 4
   .if \w == 24
@@ -149,7 +149,7 @@ function PFX(sad_x\x\()_\w\()x\h\()_neon)
     SAD_X_\w x4, v19, v23
   .endif
  .endr
-    cbnz            w12, .loop_sad_x\x\()_\w\()x\h
+    cbnz            w12, .Loop_sad_x\x\()_\w\()x\h
     SAD_X_END_\w \x
 endfunc
 .endm
diff --git a/source/common/aarch64/ssd-a-sve2.S b/source/common/aarch64/ssd-a-sve2.S
index de2603850..8077bd93c 100644
--- a/source/common/aarch64/ssd-a-sve2.S
+++ b/source/common/aarch64/ssd-a-sve2.S
@@ -43,7 +43,7 @@ function PFX(pixel_sse_pp_32x32_sve2)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_pp_32_sve2:
+.Loop_sse_pp_32_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
@@ -61,7 +61,7 @@ function PFX(pixel_sse_pp_32x32_sve2)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_32_sve2
+    cbnz            w12, .Loop_sse_pp_32_sve2
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_sse_pp_32x32:
@@ -182,7 +182,7 @@ function PFX(pixel_sse_pp_64x64_sve2)
     movi            v0.16b, #0
     movi            v1.16b, #0

-.loop_sse_pp_64_sve2:
+.Loop_sse_pp_64_sve2:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x0], x1
@@ -214,7 +214,7 @@ function PFX(pixel_sse_pp_64x64_sve2)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_64_sve2
+    cbnz            w12, .Loop_sse_pp_64_sve2
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_sse_pp_64x64:
@@ -788,7 +788,7 @@ function PFX(pixel_ssd_s_16x16_sve2)
     mov             w12, #4
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_16_sve2:
+.Loop_ssd_s_16_sve2:
     sub             w12, w12, #1
 .rept 2
     ld1             {v4.16b,v5.16b}, [x0], x1
@@ -802,7 +802,7 @@ function PFX(pixel_ssd_s_16x16_sve2)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_16_sve2
+    cbnz            w12, .Loop_ssd_s_16_sve2
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_ssd_s_16x16:
@@ -830,7 +830,7 @@ function PFX(pixel_ssd_s_32x32_sve2)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_32:
+.Loop_ssd_s_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v4.16b-v7.16b}, [x0], x1
@@ -843,7 +843,7 @@ function PFX(pixel_ssd_s_32x32_sve2)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_32
+    cbnz            w12, .Loop_ssd_s_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 .vl_gt_16_pixel_ssd_s_32x32:
diff --git a/source/common/aarch64/ssd-a.S b/source/common/aarch64/ssd-a.S
index 7c778b4fe..f4b79304a 100644
--- a/source/common/aarch64/ssd-a.S
+++ b/source/common/aarch64/ssd-a.S
@@ -157,7 +157,7 @@ function PFX(pixel_sse_pp_32x32_neon)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_pp_32:
+.Loop_sse_pp_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
@@ -175,7 +175,7 @@ function PFX(pixel_sse_pp_32x32_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_32
+    cbnz            w12, .Loop_sse_pp_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -184,7 +184,7 @@ function PFX(pixel_sse_pp_32x64_neon)
     mov             w12, #16
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_pp_32x64:
+.Loop_sse_pp_32x64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b,v17.16b}, [x0], x1
@@ -202,7 +202,7 @@ function PFX(pixel_sse_pp_32x64_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_32x64
+    cbnz            w12, .Loop_sse_pp_32x64
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -212,7 +212,7 @@ function PFX(pixel_sse_pp_64x64_neon)
     movi            v0.16b, #0
     movi            v1.16b, #0

-.loop_sse_pp_64:
+.Loop_sse_pp_64:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x0], x1
@@ -244,7 +244,7 @@ function PFX(pixel_sse_pp_64x64_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_pp_64
+    cbnz            w12, .Loop_sse_pp_64
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -301,7 +301,7 @@ function PFX(pixel_sse_ss_16x16_neon)
     mov             w12, #4
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_ss_16:
+.Loop_sse_ss_16:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b, v17.16b}, [x0], x1
@@ -313,7 +313,7 @@ function PFX(pixel_sse_ss_16x16_neon)
     smlal           v0.4s, v3.4h, v3.4h
     smlal2          v1.4s, v3.8h, v3.8h
 .endr
-    cbnz            w12, .loop_sse_ss_16
+    cbnz            w12, .Loop_sse_ss_16
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -325,7 +325,7 @@ function PFX(pixel_sse_ss_32x32_neon)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_ss_32:
+.Loop_sse_ss_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v16.16b-v19.16b}, [x0], x1
@@ -343,7 +343,7 @@ function PFX(pixel_sse_ss_32x32_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_ss_32
+    cbnz            w12, .Loop_sse_ss_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -357,7 +357,7 @@ function PFX(pixel_sse_ss_64x64_neon)
     mov             w12, #32
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_sse_ss_64:
+.Loop_sse_ss_64:
     sub             w12, w12, #1
 .rept 2
     ld1             {v16.16b-v19.16b}, [x0], #64
@@ -389,7 +389,7 @@ function PFX(pixel_sse_ss_64x64_neon)
     smlal           v0.4s, v5.4h, v5.4h
     smlal2          v1.4s, v5.8h, v5.8h
 .endr
-    cbnz            w12, .loop_sse_ss_64
+    cbnz            w12, .Loop_sse_ss_64
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -433,7 +433,7 @@ function PFX(pixel_ssd_s_16x16_neon)
     mov             w12, #4
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_16:
+.Loop_ssd_s_16:
     sub             w12, w12, #1
 .rept 2
     ld1             {v4.16b,v5.16b}, [x0], x1
@@ -447,7 +447,7 @@ function PFX(pixel_ssd_s_16x16_neon)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_16
+    cbnz            w12, .Loop_ssd_s_16
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
@@ -457,7 +457,7 @@ function PFX(pixel_ssd_s_32x32_neon)
     mov             w12, #8
     movi            v0.16b, #0
     movi            v1.16b, #0
-.loop_ssd_s_32:
+.Loop_ssd_s_32:
     sub             w12, w12, #1
 .rept 4
     ld1             {v4.16b-v7.16b}, [x0], x1
@@ -470,7 +470,7 @@ function PFX(pixel_ssd_s_32x32_neon)
     smlal           v0.4s, v7.4h, v7.4h
     smlal2          v1.4s, v7.8h, v7.8h
 .endr
-    cbnz            w12, .loop_ssd_s_32
+    cbnz            w12, .Loop_ssd_s_32
     add             v0.4s, v0.4s, v1.4s
     ret_v0_w0
 endfunc
diff --git a/source/common/arm/blockcopy8.S b/source/common/arm/blockcopy8.S
index 1c868f464..8170160aa 100644
--- a/source/common/arm/blockcopy8.S
+++ b/source/common/arm/blockcopy8.S
@@ -795,7 +795,7 @@ function x265_count_nonzero_32_neon
     vmov            q2, q12
     vmov            q3, q14

-.loop:
+.Loop:
     vldm            r0!, {q8-q15}
     subs            r1, #1

@@ -817,7 +817,7 @@ function x265_count_nonzero_32_neon
     vadd.s8         q1, q10
     vadd.s8         q2, q12
     vadd.s8         q3, q14
-    bgt            .loop
+    bgt            .Loop

     // sum
     vadd.s8         q0, q1
diff --git a/source/common/arm/dct-a.S b/source/common/arm/dct-a.S
index 42b193bf8..5be8847e9 100644
--- a/source/common/arm/dct-a.S
+++ b/source/common/arm/dct-a.S
@@ -422,7 +422,7 @@ function x265_dct_16x16_neon
     mov lr, #4*16*2

     // DCT-1D
-.loop1:
+.Loop1:
     // Row[0-3]
     vld1.16 {q8-q9}, [r0, :64], r2      // q8  = [07 06 05 04 03 02 01 00], q9  = [0F 0E 0D 0C 0B 0A 09 08]
     vld1.16 {q10-q11}, [r0, :64], r2    // q10 = [17 16 15 14 13 12 11 10], q11 = [1F 1E 1D 1C 1B 1A 19 18]
@@ -628,7 +628,7 @@ function x265_dct_16x16_neon
     // loop into next process group
     sub r3, #3*4*16*2
     subs r12, #1
-    bgt .loop1
+    bgt .Loop1


     // DCT-2D
@@ -637,7 +637,7 @@ function x265_dct_16x16_neon
     mov r3, #16*2*2
     mov r12, #16/4                      // Process 4 rows every loop

-.loop2:
+.Loop2:
     vldm r2, {q8-q15}

     // d16 = [30 20 10 00]
@@ -887,7 +887,7 @@ function x265_dct_16x16_neon

     sub r1, #(17*16-4)*2
     subs r12, #1
-    bgt .loop2
+    bgt .Loop2

     add sp, #16*16*2
     vpop {q4-q7}
diff --git a/source/common/arm/ipfilter8.S b/source/common/arm/ipfilter8.S
index 8b7f5b3ca..b1ec6cc8b 100644
--- a/source/common/arm/ipfilter8.S
+++ b/source/common/arm/ipfilter8.S
@@ -372,7 +372,7 @@ function x265_filterPixelToShort_32x16_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #8
-.loop_filterP2S_32x16:
+.Loop_filterP2S_32x16:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -391,7 +391,7 @@ function x265_filterPixelToShort_32x16_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x16
+    bgt         .Loop_filterP2S_32x16
     bx          lr
 endfunc

@@ -402,7 +402,7 @@ function x265_filterPixelToShort_32x24_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #12
-.loop_filterP2S_32x24:
+.Loop_filterP2S_32x24:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -421,7 +421,7 @@ function x265_filterPixelToShort_32x24_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x24
+    bgt         .Loop_filterP2S_32x24
     bx          lr
 endfunc

@@ -432,7 +432,7 @@ function x265_filterPixelToShort_32x32_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #16
-.loop_filterP2S_32x32:
+.Loop_filterP2S_32x32:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -451,7 +451,7 @@ function x265_filterPixelToShort_32x32_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x32
+    bgt         .Loop_filterP2S_32x32
     bx          lr
 endfunc

@@ -462,7 +462,7 @@ function x265_filterPixelToShort_32x64_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #32
-.loop_filterP2S_32x64:
+.Loop_filterP2S_32x64:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0], r1
@@ -481,7 +481,7 @@ function x265_filterPixelToShort_32x64_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_32x64
+    bgt         .Loop_filterP2S_32x64
     bx          lr
 endfunc

@@ -493,7 +493,7 @@ function x265_filterPixelToShort_64x16_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #8
-.loop_filterP2S_64x16:
+.Loop_filterP2S_64x16:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -528,7 +528,7 @@ function x265_filterPixelToShort_64x16_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x16
+    bgt         .Loop_filterP2S_64x16
     bx          lr
 endfunc

@@ -540,7 +540,7 @@ function x265_filterPixelToShort_64x32_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #16
-.loop_filterP2S_64x32:
+.Loop_filterP2S_64x32:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -575,7 +575,7 @@ function x265_filterPixelToShort_64x32_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x32
+    bgt         .Loop_filterP2S_64x32
     bx          lr
 endfunc

@@ -587,7 +587,7 @@ function x265_filterPixelToShort_64x48_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #24
-.loop_filterP2S_64x48:
+.Loop_filterP2S_64x48:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -622,7 +622,7 @@ function x265_filterPixelToShort_64x48_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x48
+    bgt         .Loop_filterP2S_64x48
     bx          lr
 endfunc

@@ -634,7 +634,7 @@ function x265_filterPixelToShort_64x64_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #32
-.loop_filterP2S_64x64:
+.Loop_filterP2S_64x64:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -669,7 +669,7 @@ function x265_filterPixelToShort_64x64_neon
     vmla.s16    q3, q10, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_64x64
+    bgt         .Loop_filterP2S_64x64
     bx          lr
 endfunc

@@ -681,7 +681,7 @@ function x265_filterPixelToShort_48x64_neon
     vmov.u16    q1, #8192
     vneg.s16    q1, q1
     mov         r12, #32
-.loop_filterP2S_48x64:
+.Loop_filterP2S_48x64:
     subs        r12, #1
 .rept 2
     vld1.u8     {q9-q10}, [r0]!
@@ -709,7 +709,7 @@ function x265_filterPixelToShort_48x64_neon
     vmla.s16    q3, q9, q0
     vst1.16     {q2-q3}, [r2], r3
 .endr
-    bgt         .loop_filterP2S_48x64
+    bgt         .Loop_filterP2S_48x64
     bx          lr
 endfunc

@@ -756,7 +756,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     vmovl.u8    q2, d4
     vmovl.u8    q3, d6

-.loop_4x\h:
+.Loop_4x\h:
     // TODO: read extra 1 row for speed optimize, may made crash on OS X platform!
     vld1.u32    {d16[0]}, [r0], r1
     vld1.u32    {d16[1]}, [r0], r1
@@ -795,7 +795,7 @@ function x265_interp_8tap_vert_pp_4x\h\()_neon
     vst1.u32    {d18[1]}, [r2], r3

     subs        r12, #2
-    bne        .loop_4x4
+    bne        .Loop_4x4

     pop         {pc}
     .ltorg
@@ -945,13 +945,13 @@ LUMA_VPP_4xN 16

 .macro FILTER_VPP a b filterv

-.loop_\filterv\()_\a\()x\b:
+.Loop_\filterv\()_\a\()x\b:

     mov             r7, r2
     mov             r6, r0
     eor             r8, r8

-.loop_w8_\filterv\()_\a\()x\b:
+.Loop_w8_\filterv\()_\a\()x\b:

     add             r6, r0, r8

@@ -988,12 +988,12 @@ LUMA_VPP_4xN 16

     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_w8_\filterv\()_\a\()x\b
+    blt             .Loop_w8_\filterv\()_\a\()x\b

     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_\filterv\()_\a\()x\b
+    bne             .Loop_\filterv\()_\a\()x\b

 .endm

@@ -1063,7 +1063,7 @@ function x265_interp_8tap_vert_pp_12x16_neon
     sub             r0, r4

     mov             r4, #16
-.loop_vpp_12x16:
+.Loop_vpp_12x16:

     mov             r6, r0
     mov             r7, r2
@@ -1173,7 +1173,7 @@ function x265_interp_8tap_vert_pp_12x16_neon
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vpp_12x16
+    bne             .Loop_vpp_12x16

     pop             {r4, r5, r6, r7}
     bx              lr
@@ -1194,7 +1194,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon
     add             r12, #2048
     vdup.32         q8, r12
     mov             r4, #\h
-.loop_vsp_4x\h:
+.Loop_vsp_4x\h:
     movrel          r12, g_lumaFilter
     add             r12, r5
     mov             r6, r0
@@ -1266,7 +1266,7 @@ function x265_interp_8tap_vert_sp_4x\h\()_neon

     add             r0, r1
     subs            r4, #1
-    bne             .loop_vsp_4x\h
+    bne             .Loop_vsp_4x\h
     pop             {r4, r5, r6}
     bx              lr
     .ltorg
@@ -1369,13 +1369,13 @@ LUMA_VSP_4xN 16
 .macro FILTER_VSP a b filterv

     vpush           { q4 - q7}
-.loop_\filterv\()_\a\()x\b:
+.Loop_\filterv\()_\a\()x\b:

     mov             r7, r2
     mov             r6, r0
     eor             r8, r8

-.loop_w8_\filterv\()_\a\()x\b:
+.Loop_w8_\filterv\()_\a\()x\b:

     add             r6, r0, r8

@@ -1417,12 +1417,12 @@ LUMA_VSP_4xN 16
     mov             r12, #\a
     lsl             r12, #1
     cmp             r8, r12
-    blt             .loop_w8_\filterv\()_\a\()x\b
+    blt             .Loop_w8_\filterv\()_\a\()x\b

     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_\filterv\()_\a\()x\b
+    bne             .Loop_\filterv\()_\a\()x\b

     vpop            { q4 - q7}

@@ -1498,7 +1498,7 @@ function x265_interp_8tap_vert_sp_12x16_neon

     mov             r4, #16
     vpush           { q4 - q7}
-.loop1_12x16:
+.Loop1_12x16:

     mov             r6, r0
     mov             r7, r2
@@ -1612,7 +1612,7 @@ function x265_interp_8tap_vert_sp_12x16_neon
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop1_12x16
+    bne             .Loop1_12x16
     vpop            { q4 - q7}
     pop             {r4, r5, r6, r7}
     bx              lr
@@ -1632,7 +1632,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon
     vdup.32         q8, r4
     mov             r4, #\h

-.loop_vps_4x\h:
+.Loop_vps_4x\h:
     movrel          r12, g_lumaFilter
     add             r12, r5
     mov             r6, r0
@@ -1702,7 +1702,7 @@ function x265_interp_8tap_vert_ps_4x\h\()_neon

     add             r0, r1
     subs            r4, #1
-    bne             .loop_vps_4x\h
+    bne             .Loop_vps_4x\h

     pop             {r4, r5, r6}
     bx              lr
@@ -1717,13 +1717,13 @@ LUMA_VPS_4xN 16

 .macro FILTER_VPS a b filterv

-.loop_ps_\filterv\()_\a\()x\b:
+.Loop_ps_\filterv\()_\a\()x\b:

     mov             r7, r2
     mov             r6, r0
     eor             r8, r8

-.loop_ps_w8_\filterv\()_\a\()x\b:
+.Loop_ps_w8_\filterv\()_\a\()x\b:

     add             r6, r0, r8

@@ -1759,12 +1759,12 @@ LUMA_VPS_4xN 16

     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_ps_w8_\filterv\()_\a\()x\b
+    blt             .Loop_ps_w8_\filterv\()_\a\()x\b

     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_ps_\filterv\()_\a\()x\b
+    bne             .Loop_ps_\filterv\()_\a\()x\b

 .endm

@@ -1836,7 +1836,7 @@ function x265_interp_8tap_vert_ps_12x16_neon
     sub             r0, r4

     mov             r4, #16
-.loop_vps_12x16:
+.Loop_vps_12x16:

     mov             r6, r0
     mov             r7, r2
@@ -1942,7 +1942,7 @@ function x265_interp_8tap_vert_ps_12x16_neon
     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vps_12x16
+    bne             .Loop_vps_12x16

     pop             {r4, r5, r6, r7}
     bx              lr
@@ -2081,13 +2081,13 @@ endfunc

     vpush           {q4-q7}

-.loop_\filterv\()_\a\()x\b:
+.Loop_\filterv\()_\a\()x\b:

     mov             r7, r2
     mov             r6, r0
     eor             r8, r8

-.loop_w8_\filterv\()_\a\()x\b:
+.Loop_w8_\filterv\()_\a\()x\b:

     add             r6, r0, r8

@@ -2121,12 +2121,12 @@ endfunc

     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_w8_\filterv\()_\a\()x\b
+    blt             .Loop_w8_\filterv\()_\a\()x\b

     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_\filterv\()_\a\()x\b
+    bne             .Loop_\filterv\()_\a\()x\b
     vpop            {q4-q7}
 .endm

@@ -2217,13 +2217,13 @@ CHROMA_VPP 48 64

     vpush           {q4-q7}

-.loop_vps_\filterv\()_\a\()x\b:
+.Loop_vps_\filterv\()_\a\()x\b:

     mov             r7, r2
     mov             r6, r0
     eor             r8, r8

-.loop_vps_w8_\filterv\()_\a\()x\b:
+.Loop_vps_w8_\filterv\()_\a\()x\b:

     add             r6, r0, r8

@@ -2256,12 +2256,12 @@ CHROMA_VPP 48 64

     add             r8, #8
     cmp             r8, #\a
-    blt             .loop_vps_w8_\filterv\()_\a\()x\b
+    blt             .Loop_vps_w8_\filterv\()_\a\()x\b

     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vps_\filterv\()_\a\()x\b
+    bne             .Loop_vps_\filterv\()_\a\()x\b
     vpop            {q4-q7}
 .endm

@@ -2353,13 +2353,13 @@ CHROMA_VPS 48 64

     vpush           {q4-q7}

-.loop_vsp_\filterv\()_\a\()x\b:
+.Loop_vsp_\filterv\()_\a\()x\b:

     mov             r7, r2
     mov             r6, r0
     eor             r8, r8

-.loop_vsp_w8_\filterv\()_\a\()x\b:
+.Loop_vsp_w8_\filterv\()_\a\()x\b:

     add             r6, r0, r8

@@ -2392,12 +2392,12 @@ CHROMA_VPS 48 64
     mov             r12, #\a
     lsl             r12, #1
     cmp             r8, r12
-    blt             .loop_vsp_w8_\filterv\()_\a\()x\b
+    blt             .Loop_vsp_w8_\filterv\()_\a\()x\b

     add             r0, r1
     add             r2, r3
     subs            r4, #1
-    bne             .loop_vsp_\filterv\()_\a\()x\b
+    bne             .Loop_vsp_\filterv\()_\a\()x\b
     vpop            {q4-q7}
 .endm

diff --git a/source/common/arm/mc-a.S b/source/common/arm/mc-a.S
index b10e9e816..839d192cd 100644
--- a/source/common/arm/mc-a.S
+++ b/source/common/arm/mc-a.S
@@ -554,7 +554,7 @@ function x265_cpy2Dto1D_shr_16x16_neon
     vsri.s16        q1, #1
     vneg.s16        q0, q0
     mov             r3, #4
-.loop_cpy2Dto1D_shr_16:
+.Loop_cpy2Dto1D_shr_16:
     subs            r3, #1
 .rept 4
     vld1.s16        {q2-q3}, [r1], r2
@@ -564,7 +564,7 @@ function x265_cpy2Dto1D_shr_16x16_neon
     vshl.s16        q3, q0
     vst1.16         {q2-q3}, [r0]!
 .endr
-    bgt             .loop_cpy2Dto1D_shr_16
+    bgt             .Loop_cpy2Dto1D_shr_16
     bx              lr
 endfunc

@@ -577,7 +577,7 @@ function x265_cpy2Dto1D_shr_32x32_neon
     vsri.s16        q1, #1
     vneg.s16        q0, q0
     mov             r3, 16
-.loop_cpy2Dto1D_shr_32:
+.Loop_cpy2Dto1D_shr_32:
     subs            r3, #1
 .rept 2
     vld1.s16        {q2-q3}, [r1]!
@@ -593,7 +593,7 @@ function x265_cpy2Dto1D_shr_32x32_neon
     vst1.16         {q2-q3}, [r0]!
     vst1.16         {q8-q9}, [r0]!
 .endr
-    bgt             .loop_cpy2Dto1D_shr_32
+    bgt             .Loop_cpy2Dto1D_shr_32
     bx              lr
 endfunc

diff --git a/source/common/arm/pixel-util.S b/source/common/arm/pixel-util.S
index c26b17acc..67719c8e5 100644
--- a/source/common/arm/pixel-util.S
+++ b/source/common/arm/pixel-util.S
@@ -848,36 +848,36 @@ function x265_pixel_planecopy_cp_neon
     vdup.8          q2, r12
     sub             r5, #1

-.loop_h:
+.Loop_h:
     mov             r6, r0
     mov             r12, r2
     eor             r7, r7
-.loop_w:
+.Loop_w:
     vld1.u8         {q0}, [r6]!
     vshl.u8         q0, q0, q2
     vst1.u8         {q0}, [r12]!

     add             r7, #16
     cmp             r7, r4
-    blt             .loop_w
+    blt             .Loop_w

     add             r0, r1
     add             r2, r3

     subs             r5, #1
-    bgt             .loop_h
+    bgt             .Loop_h

 // handle last row
     mov             r5, r4
     lsr             r5, #3

-.loopW8:
+.LoopW8:
     vld1.u8         d0, [r0]!
     vshl.u8         d0, d0, d4
     vst1.u8         d0, [r2]!
     subs            r4, r4, #8
     subs            r5, #1
-    bgt             .loopW8
+    bgt             .LoopW8

     mov             r5,#8
     sub             r5, r4
@@ -1970,7 +1970,7 @@ function x265_quant_neon
     eor             r5, r5
     veor.s32        q12, q12

-.loop_quant:
+.Loop_quant:

     vld1.s16        d16, [r0]!
     vmovl.s16       q9, d16                // q9= coef[blockpos]
@@ -1999,7 +1999,7 @@ function x265_quant_neon
     vst1.s16        d16, [r3]!

     subs            r4, #1
-    bne             .loop_quant
+    bne             .Loop_quant

     vadd.u32        d8, d9
     vpadd.u32       d8, d8
@@ -2023,7 +2023,7 @@ function x265_nquant_neon
     eor             r4, r4
     veor.s32        q12, q12

-.loop_nquant:
+.Loop_nquant:

     vld1.s16        d16, [r0]!
     vmovl.s16       q9, d16                // q9= coef[blockpos]
@@ -2049,7 +2049,7 @@ function x265_nquant_neon
     vst1.s16        d17, [r2]!

     subs            r3, #1
-    bne             .loop_nquant
+    bne             .Loop_nquant

     vadd.u32        d8, d9
     vpadd.u32       d8, d8
@@ -2148,7 +2148,7 @@ function x265_pixel_sa8d_32x64_neon
     mov             r10, #4
     eor             r9, r9

-.loop_32:
+.Loop_32:

     sa8d_16x16 r4

@@ -2166,7 +2166,7 @@ function x265_pixel_sa8d_32x64_neon
     sub             r2,  r2,  #24

     subs            r10, #1
-    bgt            .loop_32
+    bgt            .Loop_32

     mov             r0, r9
     vpop            {d8-d11}
@@ -2183,7 +2183,7 @@ function x265_pixel_sa8d_64x64_neon
     mov             r10, #4
     eor             r9, r9

-.loop_1:
+.Loop_1:

     sa8d_16x16 r4

@@ -2217,7 +2217,7 @@ function x265_pixel_sa8d_64x64_neon
     sub             r2,  r2,  #56

     subs            r10, #1
-    bgt            .loop_1
+    bgt            .Loop_1

     mov             r0, r9
     vpop            {d8-d11}
diff --git a/source/common/arm/sad-a.S b/source/common/arm/sad-a.S
index 6faf35957..b5cbded89 100644
--- a/source/common/arm/sad-a.S
+++ b/source/common/arm/sad-a.S
@@ -103,7 +103,7 @@ function x265_pixel_sad_16x\h\()_neon
     vabal.u8        q9, d5, d7
     mov             r12, #(\h-2)/2

-.loop_16x\h:
+.Loop_16x\h:

     subs            r12, #1
     vld1.8          {q0}, [r0], r1
@@ -115,7 +115,7 @@ function x265_pixel_sad_16x\h\()_neon
     vabal.u8        q9, d1, d3
     vabal.u8        q8, d4, d6
     vabal.u8        q9, d5, d7
-    bne             .loop_16x\h
+    bne             .Loop_16x\h

     vadd.u16        q8, q8, q9
 .if \h == 64
@@ -147,7 +147,7 @@ function x265_pixel_sad_32x\h\()_neon
     veor.u8         q11, q11
     mov             r12, #\h/8

-.loop_32x\h:
+.Loop_32x\h:

     subs            r12, #1
 .rept 4
@@ -166,7 +166,7 @@ function x265_pixel_sad_32x\h\()_neon
     vabal.u8        q10, d26, d30
     vabal.u8        q11, d27, d31
 .endr
-    bne             .loop_32x\h
+    bne             .Loop_32x\h

     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
@@ -213,7 +213,7 @@ function x265_pixel_sad_64x\h\()_neon
     sub             r3, r12
     mov             r12, #\h/8

-.loop_64x\h:
+.Loop_64x\h:

     subs            r12, #1
 .rept 4
@@ -246,7 +246,7 @@ function x265_pixel_sad_64x\h\()_neon
     vabal.u8        q10, d26, d30
     vabal.u8        q11, d27, d31
 .endr
-    bne             .loop_64x\h
+    bne             .Loop_64x\h

     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
@@ -283,7 +283,7 @@ function x265_pixel_sad_24x32_neon
     sub             r3, #16
     mov             r12, #8

-.loop_24x32:
+.Loop_24x32:

     subs            r12, #1
 .rept 4
@@ -296,7 +296,7 @@ function x265_pixel_sad_24x32_neon
     vld1.8          {d1}, [r2], r3
     vabal.u8        q10, d0, d1
 .endr
-    bne             .loop_24x32
+    bne             .Loop_24x32

     vadd.u16        q8, q8, q9
     vadd.u16        d16, d16, d17
@@ -322,7 +322,7 @@ function x265_pixel_sad_48x64_neon
     sub             r3, #32
     mov             r12, #16

-.loop_48x64:
+.Loop_48x64:

     subs            r12, #1
 .rept 4
@@ -337,7 +337,7 @@ function x265_pixel_sad_48x64_neon
     vabal.u8        q14, d4, d20
     vabal.u8        q15, d5, d21
 .endr
-    bne             .loop_48x64
+    bne             .Loop_48x64

     vadd.u16        q3, q3, q11
     vadd.u16        d6, d6, d7
@@ -635,12 +635,12 @@ function x265_sad_x\x\()_16x\h\()_neon
     veor.u8         q15, q15
 .endif

-.loop_sad_x\x\()_16x\h:
+.Loop_sad_x\x\()_16x\h:
 .rept 8
     SAD_X_16 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_16x\h
+    bne             .Loop_sad_x\x\()_16x\h

     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
@@ -929,12 +929,12 @@ function x265_sad_x\x\()_64x\h\()_neon
     veor.u8         q14, q14
     veor.u8         q15, q15
 .endif
-.loop_sad_x\x\()_64x\h:
+.Loop_sad_x\x\()_64x\h:
 .rept 8
     SAD_X_64 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_64x\h
+    bne             .Loop_sad_x\x\()_64x\h

 .if \h <= 16
     vadd.u16        q8, q8, q9
@@ -1071,12 +1071,12 @@ function x265_sad_x\x\()_48x64_neon
     veor.u8         q15, q15
 .endif

-.loop_sad_x\x\()_48x64:
+.Loop_sad_x\x\()_48x64:
 .rept 8
     SAD_X_48 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_48x64
+    bne             .Loop_sad_x\x\()_48x64

     vpaddl.u16      q8, q8
     vpaddl.u16      q9, q9
@@ -1179,12 +1179,12 @@ function x265_sad_x\x\()_24x32_neon
     veor.u8         q15, q15
 .endif

-.loop_sad_x\x\()_24x32:
+.Loop_sad_x\x\()_24x32:
 .rept 8
     SAD_X_24 \x
 .endr
     subs            r6, #1
-    bne             .loop_sad_x\x\()_24x32
+    bne             .Loop_sad_x\x\()_24x32

     vadd.u16        q8, q8, q9
     vadd.u16        q10, q10, q11
diff --git a/source/common/arm/ssd-a.S b/source/common/arm/ssd-a.S
index bb91a0bcb..c00ab0023 100644
--- a/source/common/arm/ssd-a.S
+++ b/source/common/arm/ssd-a.S
@@ -121,7 +121,7 @@ function x265_pixel_sse_pp_32x32_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_sse_pp_32:
+.Loop_sse_pp_32:
     subs        r12, #1
 .rept 4
     vld1.64     {q8-q9}, [r0], r1
@@ -139,7 +139,7 @@ function x265_pixel_sse_pp_32x32_neon
     vmlal.s16   q0, d26, d26
     vmlal.s16   q1, d27, d27
 .endr
-    bne         .loop_sse_pp_32
+    bne         .Loop_sse_pp_32
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -154,7 +154,7 @@ function x265_pixel_sse_pp_64x64_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_sse_pp_64:
+.Loop_sse_pp_64:
     subs        r12, #1
 .rept 4
     vld1.64     {q8-q9}, [r0]!
@@ -187,7 +187,7 @@ function x265_pixel_sse_pp_64x64_neon
     vmlal.s16   q0, d26, d26
     vmlal.s16   q1, d27, d27
 .endr
-    bne         .loop_sse_pp_64
+    bne         .Loop_sse_pp_64
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -257,7 +257,7 @@ function x265_pixel_sse_ss_16x16_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_sse_ss_16:
+.Loop_sse_ss_16:
     subs        r12, #1
 .rept 4
     vld1.s16    {q8-q9}, [r0], r1
@@ -269,7 +269,7 @@ function x265_pixel_sse_ss_16x16_neon
     vmlal.s16   q0, d18, d18
     vmlal.s16   q1, d19, d19
 .endr
-    bne         .loop_sse_ss_16
+    bne         .Loop_sse_ss_16
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -286,7 +286,7 @@ function x265_pixel_sse_ss_32x32_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_sse_ss_32:
+.Loop_sse_ss_32:
     subs        r12, #1
 .rept 4
     vld1.s16    {q8-q9}, [r0]!
@@ -307,7 +307,7 @@ function x265_pixel_sse_ss_32x32_neon
     vmlal.s16   q0, d18, d18
     vmlal.s16   q1, d19, d19
 .endr
-    bne         .loop_sse_ss_32
+    bne         .Loop_sse_ss_32
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -324,7 +324,7 @@ function x265_pixel_sse_ss_64x64_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_sse_ss_64:
+.Loop_sse_ss_64:
     subs        r12, #1
 .rept 2
     vld1.s16    {q8-q9}, [r0]!
@@ -363,7 +363,7 @@ function x265_pixel_sse_ss_64x64_neon
     vmlal.s16   q0, d18, d18
     vmlal.s16   q1, d19, d19
 .endr
-    bne         .loop_sse_ss_64
+    bne         .Loop_sse_ss_64
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -417,7 +417,7 @@ function x265_pixel_ssd_s_16x16_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_ssd_s_16:
+.Loop_ssd_s_16:
     subs        r12, #1
 .rept 2
     vld1.s16    {q8-q9}, [r0], r1
@@ -431,7 +431,7 @@ function x265_pixel_ssd_s_16x16_neon
     vmlal.s16   q0, d22, d22
     vmlal.s16   q1, d23, d23
 .endr
-    bne         .loop_ssd_s_16
+    bne         .Loop_ssd_s_16
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
@@ -446,7 +446,7 @@ function x265_pixel_ssd_s_32x32_neon
     veor.u8     q0, q0
     veor.u8     q1, q1

-.loop_ssd_s_32:
+.Loop_ssd_s_32:
     subs        r12, #1
 .rept 4
     vld1.s16    {q8-q9}, [r0]!
@@ -460,7 +460,7 @@ function x265_pixel_ssd_s_32x32_neon
     vmlal.s16   q0, d22, d22
     vmlal.s16   q1, d23, d23
 .endr
-    bne         .loop_ssd_s_32
+    bne         .Loop_ssd_s_32
     vadd.s32    q0, q1
     vadd.s32    d0, d0, d1
     vpadd.s32   d0, d0, d0
--
2.42.1
