[x265] [PATCH ARM 4/6] AArch64: Improve IDCT16 cmp+beq -> cbz
Pavan Tarun Chakka Venkata
pavan.tarun at multicorewareinc.com
Thu Sep 12 13:41:06 UTC 2024
From ead939a542ba3bfa2da352f9f2f32fdec9bd0698 Mon Sep 17 00:00:00 2001
From: Min Chen <chenm003 at 163.com>
Date: Sun, 8 Sep 2024 09:23:53 -0700
Subject: [PATCH 4/6] AArch64: Improve IDCT16 cmp+beq -> cbz
---
source/common/aarch64/dct.S | 69 ++++++++++++++++++++-----------------
1 file changed, 37 insertions(+), 32 deletions(-)
diff --git a/source/common/aarch64/dct.S b/source/common/aarch64/dct.S
index 26a88ef76..959310b1f 100644
--- a/source/common/aarch64/dct.S
+++ b/source/common/aarch64/dct.S
@@ -35,8 +35,8 @@
.align 4
.text
-.set idct_shift_1, 7
-.set idct_shift_2, 12
+.set idct16_shift_1, 7
+.set idct16_shift_2, 12
.align 4
// NOTE: Hardcoded due to asm syntax issue, don't reorder!
@@ -157,8 +157,9 @@ function PFX(idct16_neon)
smlsl v8.4s, v17.4h, v1.h[3] // v8 = O6 = 25*[1]-70*[3]
smlsl v9.4s, v17.4h, v1.h[6] // v9 = O7 = 9*[1]-25*[3]
- cmp x7, #0
- beq 1f
+ //cmp x7, #0
+ //beq 1f
+ cbz x7, 1f
smlal v2.4s, v18.4h, v1.h[2] // v2 = O0 = 90*[1]+87*[3]+80*[5]
smlal v3.4s, v18.4h, v1.h[7] // v3 = O1 = 87*[1]+57*[3]+ 9*[5]
@@ -184,8 +185,9 @@ function PFX(idct16_neon)
ldr d18, [x0, #(13*16*2)]
ldr d19, [x0, #(15*16*2)]
- cmp x6, #0
- beq 1f
+ //cmp x6, #0
+ //beq 1f
+ cbz x6, 1f
smlal v24.4s, v20.4h, v0.h[0] // EEE0 = 64*[0]+64*[8]
smlsl v25.4s, v20.4h, v0.h[0] // EEE1 = 64*[0]-64*[8]
@@ -222,8 +224,9 @@ function PFX(idct16_neon)
add v30.4s, v23.4s, v31.4s // v30 = E3 = EE3+EO3
sub v31.4s, v23.4s, v31.4s // v31 = E4 = EE3-EO3
- cmp x6, #0
- beq 1f
+ //cmp x6, #0
+ //beq 1f
+ cbz x6, 1f
smlal v2.4s, v16.4h, v1.h[4] // v2 = O0 = 90*[1]+87*[3]+80*[5]+70*[7]+57*[9]
smlsl v3.4s, v16.4h, v1.h[2] // v3 = O1 = 87*[1]+57*[3]+ 9*[5]-43*[7]-80*[9]
@@ -244,8 +247,9 @@ function PFX(idct16_neon)
smlsl v9.4s, v17.4h, v1.h[2] // v9 = O7 = 9*[1]-25*[3]+43*[5]-57*[7]+70*[9]-80*[11]
1:
- cmp x7, #0
- beq 1f
+ //cmp x7, #0
+ //beq 1f
+ cbz x7, 1f
smlal v2.4s, v18.4h, v1.h[6] // v2 = O0 = 90*[1]+87*[3]+80*[5]+70*[7]+57*[9]+43*[11]+25*[13]
smlsl v3.4s, v18.4h, v1.h[3] // v3 = O1 = 87*[1]+57*[3]+ 9*[5]-43*[7]-80*[9]-90*[11]-70*[13]
@@ -274,14 +278,14 @@ function PFX(idct16_neon)
sub v21.4s, v28.4s, v4.4s // [13] = E2-O2
add v22.4s, v30.4s, v5.4s // [ 3] = E3+O3
sub v23.4s, v30.4s, v5.4s // [12] = E3-O3
- sqrshrn v16.4h, v16.4s, #idct_shift_1
- sqrshrn v17.4h, v17.4s, #idct_shift_1
- sqrshrn v18.4h, v18.4s, #idct_shift_1
- sqrshrn v19.4h, v19.4s, #idct_shift_1
- sqrshrn v20.4h, v20.4s, #idct_shift_1
- sqrshrn v21.4h, v21.4s, #idct_shift_1
- sqrshrn v22.4h, v22.4s, #idct_shift_1
- sqrshrn v23.4h, v23.4s, #idct_shift_1
+ sqrshrn v16.4h, v16.4s, #idct16_shift_1
+ sqrshrn v17.4h, v17.4s, #idct16_shift_1
+ sqrshrn v18.4h, v18.4s, #idct16_shift_1
+ sqrshrn v19.4h, v19.4s, #idct16_shift_1
+ sqrshrn v20.4h, v20.4s, #idct16_shift_1
+ sqrshrn v21.4h, v21.4s, #idct16_shift_1
+ sqrshrn v22.4h, v22.4s, #idct16_shift_1
+ sqrshrn v23.4h, v23.4s, #idct16_shift_1
str d16, [x5, #( 0*16*2)]
str d17, [x5, #(15*16*2)]
str d18, [x5, #( 1*16*2)]
@@ -299,14 +303,14 @@ function PFX(idct16_neon)
sub v21.4s, v27.4s, v8.4s // [ 9] = E6-O6
add v22.4s, v25.4s, v9.4s // [ 7] = E7+O7
sub v23.4s, v25.4s, v9.4s // [ 8] = E7-O7
- sqrshrn v16.4h, v16.4s, #idct_shift_1
- sqrshrn v17.4h, v17.4s, #idct_shift_1
- sqrshrn v18.4h, v18.4s, #idct_shift_1
- sqrshrn v19.4h, v19.4s, #idct_shift_1
- sqrshrn v20.4h, v20.4s, #idct_shift_1
- sqrshrn v21.4h, v21.4s, #idct_shift_1
- sqrshrn v22.4h, v22.4s, #idct_shift_1
- sqrshrn v23.4h, v23.4s, #idct_shift_1
+ sqrshrn v16.4h, v16.4s, #idct16_shift_1
+ sqrshrn v17.4h, v17.4s, #idct16_shift_1
+ sqrshrn v18.4h, v18.4s, #idct16_shift_1
+ sqrshrn v19.4h, v19.4s, #idct16_shift_1
+ sqrshrn v20.4h, v20.4s, #idct16_shift_1
+ sqrshrn v21.4h, v21.4s, #idct16_shift_1
+ sqrshrn v22.4h, v22.4s, #idct16_shift_1
+ sqrshrn v23.4h, v23.4s, #idct16_shift_1
str d16, [x5, #( 4*16*2)]
str d17, [x5, #(11*16*2)]
str d18, [x5, #( 5*16*2)]
@@ -399,8 +403,9 @@ function PFX(idct16_neon)
smull v29.4s, v31.4h, v16.4h // v29 = [O6]
smull v30.4s, v31.4h, v17.4h // v30 = [O7]
- cmp x6, #0
- beq 1f
+ //cmp x6, #0
+ //beq 1f
+ cbz x6, 1f
smlal2 v23.4s, v31.8h, v4.8h
smlal2 v24.4s, v31.8h, v5.8h
@@ -443,10 +448,10 @@ function PFX(idct16_neon)
tbl v26.16b, {v26.16b}, v18.16b // v26 = [7 6 5 4]
tbl v27.16b, {v27.16b}, v18.16b // v27 = [15 14 13 12]
- sqrshrn v20.4h, v25.4s, #idct_shift_2
- sqrshrn v21.4h, v26.4s, #idct_shift_2
- sqrshrn v22.4h, v28.4s, #idct_shift_2
- sqrshrn v23.4h, v27.4s, #idct_shift_2
+ sqrshrn v20.4h, v25.4s, #idct16_shift_2
+ sqrshrn v21.4h, v26.4s, #idct16_shift_2
+ sqrshrn v22.4h, v28.4s, #idct16_shift_2
+ sqrshrn v23.4h, v27.4s, #idct16_shift_2
stp d20, d21, [x1, #0]
stp d22, d23, [x1, #16]
--
2.36.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/e0b391fa/attachment.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0004-AArch64-Improve-IDCT16-cmp-beq-cbz.patch
Type: application/octet-stream
Size: 6225 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/e0b391fa/attachment.obj>
More information about the x265-devel mailing list