<div dir="ltr"><div class="gmail_default" style="font-family:tahoma,sans-serif">From ead939a542ba3bfa2da352f9f2f32fdec9bd0698 Mon Sep 17 00:00:00 2001<br>From: Min Chen &lt;<a href="mailto:chenm003@163.com">chenm003@163.com</a>&gt;<br>Date: Sun, 8 Sep 2024 09:23:53 -0700<br>Subject: [PATCH 4/6] AArch64: Improve IDCT16 cmp+beq -> cbz<br><br>---<br> source/common/aarch64/dct.S | 69 ++++++++++++++++++++-----------------<br> 1 file changed, 37 insertions(+), 32 deletions(-)<br><br>diff --git a/source/common/aarch64/dct.S b/source/common/aarch64/dct.S<br>index 26a88ef76..959310b1f 100644<br>--- a/source/common/aarch64/dct.S<br>+++ b/source/common/aarch64/dct.S<br>@@ -35,8 +35,8 @@<br> .align 4<br> <br> .text<br>-.set idct_shift_1, 7<br>-.set idct_shift_2, 12<br>+.set idct16_shift_1, 7<br>+.set idct16_shift_2, 12<br> <br> .align 4<br> // NOTE: Hardcoded due to asm syntax issue, don't reorder!<br>@@ -157,8 +157,9 @@ function PFX(idct16_neon)<br>     smlsl           v8.4s, v17.4h, v1.h[3]          // v8 = O6 = 25*[1]-70*[3]<br>     smlsl           v9.4s, v17.4h, v1.h[6]          // v9 = O7 =  9*[1]-25*[3]<br> <br>-    cmp             x7, #0<br>-    beq             1f<br>+    //cmp             x7, #0<br>+    //beq             1f<br>+    cbz             x7, 1f<br> <br>     smlal           v2.4s, v18.4h, v1.h[2]          // v2 = O0 = 90*[1]+87*[3]+80*[5]<br>     smlal           v3.4s, v18.4h, v1.h[7]          // v3 = O1 = 87*[1]+57*[3]+ 9*[5]<br>@@ -184,8 +185,9 @@ function PFX(idct16_neon)<br>     ldr             d18, [x0, #(13*16*2)]<br>     ldr             d19, [x0, #(15*16*2)]<br> <br>-    cmp             x6, #0<br>-    beq             1f<br>+    //cmp             x6, #0<br>+    //beq             1f<br>+    cbz             x6, 1f<br> <br>     smlal           v24.4s, v20.4h, v0.h[0]         // EEE0 = 64*[0]+64*[8]<br>     smlsl           v25.4s, v20.4h, v0.h[0]         // EEE1 = 64*[0]-64*[8]<br>@@ -222,8 +224,9 @@ function PFX(idct16_neon)<br>     add             v30.4s, 
v23.4s, v31.4s          // v30 = E3 = EE3+EO3<br>     sub             v31.4s, v23.4s, v31.4s          // v31 = E4 = EE3-EO3<br> <br>-    cmp             x6, #0<br>-    beq             1f<br>+    //cmp             x6, #0<br>+    //beq             1f<br>+    cbz             x6, 1f<br> <br>     smlal           v2.4s, v16.4h, v1.h[4]          // v2 = O0 = 90*[1]+87*[3]+80*[5]+70*[7]+57*[9]<br>     smlsl           v3.4s, v16.4h, v1.h[2]          // v3 = O1 = 87*[1]+57*[3]+ 9*[5]-43*[7]-80*[9]<br>@@ -244,8 +247,9 @@ function PFX(idct16_neon)<br>     smlsl           v9.4s, v17.4h, v1.h[2]          // v9 = O7 =  9*[1]-25*[3]+43*[5]-57*[7]+70*[9]-80*[11]<br> <br> 1:<br>-    cmp             x7, #0<br>-    beq             1f<br>+    //cmp             x7, #0<br>+    //beq             1f<br>+    cbz             x7, 1f<br> <br>     smlal           v2.4s, v18.4h, v1.h[6]          // v2 = O0 = 90*[1]+87*[3]+80*[5]+70*[7]+57*[9]+43*[11]+25*[13]<br>     smlsl           v3.4s, v18.4h, v1.h[3]          // v3 = O1 = 87*[1]+57*[3]+ 9*[5]-43*[7]-80*[9]-90*[11]-70*[13]<br>@@ -274,14 +278,14 @@ function PFX(idct16_neon)<br>     sub             v21.4s, v28.4s, v4.4s           // [13] = E2-O2<br>     add             v22.4s, v30.4s, v5.4s           // [ 3] = E3+O3<br>     sub             v23.4s, v30.4s, v5.4s           // [12] = E3-O3<br>-    sqrshrn         v16.4h, v16.4s, #idct_shift_1<br>-    sqrshrn         v17.4h, v17.4s, #idct_shift_1<br>-    sqrshrn         v18.4h, v18.4s, #idct_shift_1<br>-    sqrshrn         v19.4h, v19.4s, #idct_shift_1<br>-    sqrshrn         v20.4h, v20.4s, #idct_shift_1<br>-    sqrshrn         v21.4h, v21.4s, #idct_shift_1<br>-    sqrshrn         v22.4h, v22.4s, #idct_shift_1<br>-    sqrshrn         v23.4h, v23.4s, #idct_shift_1<br>+    sqrshrn         v16.4h, v16.4s, #idct16_shift_1<br>+    sqrshrn         v17.4h, v17.4s, #idct16_shift_1<br>+    sqrshrn         v18.4h, v18.4s, #idct16_shift_1<br>+    sqrshrn         v19.4h, v19.4s, #idct16_shift_1<br>+    
sqrshrn         v20.4h, v20.4s, #idct16_shift_1<br>+    sqrshrn         v21.4h, v21.4s, #idct16_shift_1<br>+    sqrshrn         v22.4h, v22.4s, #idct16_shift_1<br>+    sqrshrn         v23.4h, v23.4s, #idct16_shift_1<br>     str             d16, [x5, #( 0*16*2)]<br>     str             d17, [x5, #(15*16*2)]<br>     str             d18, [x5, #( 1*16*2)]<br>@@ -299,14 +303,14 @@ function PFX(idct16_neon)<br>     sub             v21.4s, v27.4s, v8.4s           // [ 9] = E6-O6<br>     add             v22.4s, v25.4s, v9.4s           // [ 7] = E7+O7<br>     sub             v23.4s, v25.4s, v9.4s           // [ 8] = E7-O7<br>-    sqrshrn         v16.4h, v16.4s, #idct_shift_1<br>-    sqrshrn         v17.4h, v17.4s, #idct_shift_1<br>-    sqrshrn         v18.4h, v18.4s, #idct_shift_1<br>-    sqrshrn         v19.4h, v19.4s, #idct_shift_1<br>-    sqrshrn         v20.4h, v20.4s, #idct_shift_1<br>-    sqrshrn         v21.4h, v21.4s, #idct_shift_1<br>-    sqrshrn         v22.4h, v22.4s, #idct_shift_1<br>-    sqrshrn         v23.4h, v23.4s, #idct_shift_1<br>+    sqrshrn         v16.4h, v16.4s, #idct16_shift_1<br>+    sqrshrn         v17.4h, v17.4s, #idct16_shift_1<br>+    sqrshrn         v18.4h, v18.4s, #idct16_shift_1<br>+    sqrshrn         v19.4h, v19.4s, #idct16_shift_1<br>+    sqrshrn         v20.4h, v20.4s, #idct16_shift_1<br>+    sqrshrn         v21.4h, v21.4s, #idct16_shift_1<br>+    sqrshrn         v22.4h, v22.4s, #idct16_shift_1<br>+    sqrshrn         v23.4h, v23.4s, #idct16_shift_1<br>     str             d16, [x5, #( 4*16*2)]<br>     str             d17, [x5, #(11*16*2)]<br>     str             d18, [x5, #( 5*16*2)]<br>@@ -399,8 +403,9 @@ function PFX(idct16_neon)<br>     smull   v29.4s, v31.4h, v16.4h                  // v29 = [O6]<br>     smull   v30.4s, v31.4h, v17.4h                  // v30 = [O7]<br> <br>-    cmp     x6, #0<br>-    beq     1f<br>+    //cmp     x6, #0<br>+    //beq     1f<br>+    cbz     x6, 1f<br> <br>     smlal2  v23.4s, v31.8h, v4.8h<br>     
smlal2  v24.4s, v31.8h, v5.8h<br>@@ -443,10 +448,10 @@ function PFX(idct16_neon)<br>     tbl     v26.16b, {v26.16b}, v18.16b             // v26 = [7 6 5 4]<br>     tbl     v27.16b, {v27.16b}, v18.16b             // v27 = [15 14 13 12]<br> <br>-    sqrshrn         v20.4h, v25.4s, #idct_shift_2<br>-    sqrshrn         v21.4h, v26.4s, #idct_shift_2<br>-    sqrshrn         v22.4h, v28.4s, #idct_shift_2<br>-    sqrshrn         v23.4h, v27.4s, #idct_shift_2<br>+    sqrshrn         v20.4h, v25.4s, #idct16_shift_2<br>+    sqrshrn         v21.4h, v26.4s, #idct16_shift_2<br>+    sqrshrn         v22.4h, v28.4s, #idct16_shift_2<br>+    sqrshrn         v23.4h, v27.4s, #idct16_shift_2<br>     stp             d20, d21, [x1, #0]<br>     stp             d22, d23, [x1, #16]<br> <br>-- <br>2.36.0.windows.1<br><br></div></div>