<div dir="ltr"><div class="gmail_default" style="font-family:tahoma,sans-serif">From 3068c5ff3f1c8173c0f7c1a20cf323dde19fffe5 Mon Sep 17 00:00:00 2001<br>From: Min Chen <<a href="mailto:chenm003@163.com">chenm003@163.com</a>><br>Date: Thu, 12 Sep 2024 11:11:55 +0000<br>Subject: [PATCH 5/6] AArch64: DCT16x16<br><br>---<br> source/common/aarch64/dct-prim.cpp |   3 +-<br> source/common/aarch64/dct.S        | 417 +++++++++++++++++++++++++++++<br> 2 files changed, 419 insertions(+), 1 deletion(-)<br><br>diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp<br>index 063dde845..8b523ceb0 100644<br>--- a/source/common/aarch64/dct-prim.cpp<br>+++ b/source/common/aarch64/dct-prim.cpp<br>@@ -14,6 +14,7 @@<br> #define X265_PRAGMA_UNROLL(n)<br> #endif<br> <br>+extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t srcStride);<br> extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst, intptr_t dstStride);<br> <br> namespace<br>@@ -1111,7 +1112,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)<br>     <a href="http://p.cu">p.cu</a>[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;<br>     <a href="http://p.cu">p.cu</a>[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;<br>     <a href="http://p.cu">p.cu</a>[BLOCK_8x8].dct   = dct8_neon;<br>-    <a href="http://p.cu">p.cu</a>[BLOCK_16x16].dct = dct16_neon;<br>+    <a href="http://p.cu">p.cu</a>[BLOCK_16x16].dct = PFX(dct16_neon);<br>     <a href="http://p.cu">p.cu</a>[BLOCK_32x32].dct = dct32_neon;<br>     <a href="http://p.cu">p.cu</a>[BLOCK_4x4].idct   = idct4_neon;<br>     <a href="http://p.cu">p.cu</a>[BLOCK_16x16].idct = PFX(idct16_neon);<br>diff --git a/source/common/aarch64/dct.S b/source/common/aarch64/dct.S<br>index 959310b1f..68cf8778b 100644<br>--- a/source/common/aarch64/dct.S<br>+++ b/source/common/aarch64/dct.S<br>@@ -38,6 +38,9 @@<br> .set idct16_shift_1, 7<br> .set idct16_shift_2, 12<br> <br>+.set dct16_shift_1, 3<br>+.set dct16_shift_2, 10<br>+<br> 
.align 4<br> // NOTE: Hardcoded due to asm syntax issue, don't reorder!<br> tbl_const_idct_0:<br>@@ -67,6 +70,36 @@ tbl_const_idct_0:<br> <br>     .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3  // v18<br> <br>+tbl_const_dct_0:<br>+    // EE<br>+    .hword 64,+64,+64,+64                   // v16<br>+    .hword 83,+36,-36,-83                   // v17<br>+    .hword 64,-64,-64,+64                   // v18<br>+    .hword 36,-83,+83,-36                   // v19<br>+<br>+    // EO<br>+    .hword 89,+75,+50,+18                   // v20<br>+    .hword 75,-18,-89,-50                   // v21<br>+    .hword 50,-89,+18,+75                   // v22<br>+    .hword 18,-50,+75,-89                   // v23<br>+<br>+    // O<br>+    .hword 90,+87,+80,+70,+57,+43,+25, +9   // v24<br>+    .hword 87,+57, +9,-43,-80,-90,-70,-25   // v25<br>+    .hword 80, +9,-70,-87,-25,+57,+90,+43   // v26<br>+    .hword 70,-43,-87, +9,+90,+25,-80,-57   // v27<br>+    .hword 57,-80,-25,+90, -9,-87,+43,+70   // v28<br>+    .hword 43,-90,+57,+25,-87,+70, +9,-80   // v29<br>+    .hword 25,-70,+90,-80,+43, +9,-57,+87   // v30<br>+    .hword  9,-25,+43,-57,+70,-80,+87,-90   // v31<br>+<br>+    .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1  // v0 (Pass1 tbl index: reverses 8 halfwords)<br>+//    .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9  // v1<br>+<br>+    .word 64, 83, 36, 89, 75, 50, 18,  0    // v0, v1 (Pass 2, 32-bit EE/EO coefficients)<br>+    .word 90, 87, 80, 70, 57, 43, 25,  9    // v2, v3 (Pass 2, 32-bit O coefficients)<br>+<br>+<br> // ***** idct 16x16 *****<br> // void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)<br> function PFX(idct16_neon)<br>@@ -464,3 +497,387 @@ function PFX(idct16_neon)<br>     ldp             d8, d9, [sp], #16<br>     ret<br> endfunc<br>+<br>+// NOTE(review): dct16_shift_1/dct16_shift_2 are fixed at 3/10 - presumably correct for 8-bit depth only; confirm for high bit depth builds<br>+// ***** dct 16x16: Pass1 transforms each source row into dst, Pass 2 transforms the dst columns in place *****<br>+// void dct16(const int16_t* src, int16_t* dst, intptr_t srcStride)<br>+function PFX(dct16_neon)<br>+// Register map<br>+// x0  = src (x0, x2 and x6 are reused as zero-test scratch in Pass 2)<br>+// x1  = dst<br>+// x2  = srcStride in int16_t elements (doubled to a byte stride before Pass1)<br>+// x6  = tbl_const_dct_0 cursor; w4 = loop counter; x5 = dst cursor<br>+<br>+    stp             d8, d9, 
[sp,#-16]!<br>+    stp             d10, d11, [sp,#-16]!<br>+    stp             d12, d13, [sp,#-16]!<br>+    stp             d14, d15, [sp,#-16]!<br>+<br>+    adr             x6, tbl_const_dct_0<br>+    ld4r            {v16.2d, v17.2d, v18.2d, v19.2d}, [x6], #32     // EE rows, each replicated into both 64-bit halves<br>+    ld4r            {v20.2d, v21.2d, v22.2d, v23.2d}, [x6], #32     // EO rows, replicated likewise<br>+    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x6], #64<br>+    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x6], #64<br>+    ldr             q0, [x6], #16                   // v0 = tbl byte indices; x6 now points at the .word table<br>+<br>+    add             x2, x2, x2                      // x2 = row stride in bytes<br>+    mov             x5, x1<br>+    mov             w4, #16<br>+<br>+5:  // Pass1<br>+    ld1             {v2.8h, v3.8h}, [x0], x2<br>+    tbl             v3.16b, {v3.16b}, v0.16b        // v3 = upper 8 samples, halfword order reversed<br>+<br>+    add             v4.8h, v2.8h, v3.8h             // v4 = E[07 06 05 04 03 02 01 00]<br>+    sub             v1.8h, v2.8h, v3.8h             // v1 = O[07 06 05 04 03 02 01 00]<br>+<br>+// EE0 = E0 + E7;<br>+// EO0 = E0 - E7;<br>+// EE1 = E1 + E6;<br>+// EO1 = E1 - E6;<br>+// EE2 = E2 + E5;<br>+// EO2 = E2 - E5;<br>+// EE3 = E3 + E4;<br>+// EO3 = E3 - E4;<br>+    tbl             v2.8b, {v4.16b}, v0.8b          // v2 = E[04 05 06 07]<br>+<br>+    add             v3.4h, v4.4h, v2.4h             // v3 = EE[03 02 01 00]<br>+    sub             v2.4h, v4.4h, v2.4h             // v2 = EO[03 02 01 00]<br>+<br>+// [ 0] = (64*EE0 + 64*EE1 + 64*EE2 + 64*EE3 + rnd) >> nShift;                              // v16<br>+// [ 4] = (83*EE0 + 36*EE1 - 36*EE2 - 83*EE3 + rnd) >> nShift;                              // v17<br>+// [ 8] = (64*EE0 - 64*EE1 - 64*EE2 + 64*EE3 + rnd) >> nShift;                              // v18<br>+// [12] = (36*EE0 - 83*EE1 + 83*EE2 - 36*EE3 + rnd) >> nShift;                              // v19<br>+<br>+// [ 2] = (89*EO0 + 75*EO1 + 50*EO2 + 18*EO3 + rnd) >> nShift;                              // v20<br>+// [ 6] = (75*EO0 - 18*EO1 - 89*EO2 - 50*EO3 + rnd) >> nShift;                              // v21<br>+// [10] = (50*EO0 - 89*EO1 
+ 18*EO2 + 75*EO3 + rnd) >> nShift;                              // v22<br>+// [14] = (18*EO0 - 50*EO1 + 75*EO2 - 89*EO3 + rnd) >> nShift;                              // v23<br>+<br>+// [ 1] = (90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 +  9*O7 + rnd) >> nShift;  // v24<br>+// [ 3] = (87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7 + rnd) >> nShift;  // v25<br>+// [ 5] = (80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7 + rnd) >> nShift;  // v26<br>+// [ 7] = (70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7 + rnd) >> nShift;  // v27<br>+// [ 9] = (57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6 + 70*O7 + rnd) >> nShift;  // v28<br>+// [11] = (43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6 - 80*O7 + rnd) >> nShift;  // v29<br>+// [13] = (25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6 + 87*O7 + rnd) >> nShift;  // v30<br>+// [15] = ( 9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7 + rnd) >> nShift;  // v31<br>+<br>+<br>+    smull           v4.4s, v1.4h, v24.4h            // v4  = [ 1]<br>+    smull           v5.4s, v1.4h, v25.4h            // v5  = [ 3]<br>+    smull           v6.4s, v1.4h, v26.4h            // v6  = [ 5]<br>+    smull           v7.4s, v1.4h, v27.4h            // v7  = [ 7]<br>+    smull           v8.4s, v1.4h, v28.4h            // v8  = [ 9]<br>+    smull           v9.4s, v1.4h, v29.4h            // v9  = [11]<br>+    smull           v10.4s, v1.4h, v30.4h           // v10 = [13]<br>+    smull           v11.4s, v1.4h, v31.4h           // v11 = [15]<br>+<br>+    smlal2          v4.4s, v1.8h, v24.8h            // v4  = [ 1]<br>+    smlal2          v5.4s, v1.8h, v25.8h            // v5  = [ 3]<br>+    smlal2          v6.4s, v1.8h, v26.8h            // v6  = [ 5]<br>+    smlal2          v7.4s, v1.8h, v27.8h            // v7  = [ 7]<br>+    smlal2          v8.4s, v1.8h, v28.8h            // v8  = [ 9]<br>+    smlal2          v9.4s, v1.8h, v29.8h            // v9  = 
[11]<br>+    smlal2          v10.4s, v1.8h, v30.8h           // v10 = [13]<br>+    smlal2          v11.4s, v1.8h, v31.8h           // v11 = [15]<br>+<br>+    smull           v12.4s, v3.4h, v16.4h           // v12 = [ 0]<br>+    smull           v13.4s, v2.4h, v20.4h           // v13 = [ 2]<br>+    smull           v14.4s, v3.4h, v17.4h           // v14 = [ 4]<br>+    smull           v15.4s, v2.4h, v21.4h           // v15 = [ 6]<br>+<br>+    addp            v4.4s, v12.4s, v4.4s            // v4 = [1 0]<br>+    addp            v5.4s, v13.4s, v5.4s            // v5 = [3 2]<br>+    addp            v6.4s, v14.4s, v6.4s            // v6 = [5 4]<br>+    addp            v7.4s, v15.4s, v7.4s            // v7 = [7 6]<br>+    addp            v4.4s, v4.4s, v5.4s             // v4 = [3 2 1 0]<br>+    addp            v5.4s, v6.4s, v7.4s             // v5 = [7 6 5 4]<br>+<br>+    smull           v12.4s, v3.4h, v18.4h           // v12 = [ 8]<br>+    smull           v13.4s, v2.4h, v22.4h           // v13 = [10]<br>+    smull           v14.4s, v3.4h, v19.4h           // v14 = [12]<br>+    smull           v15.4s, v2.4h, v23.4h           // v15 = [14]<br>+<br>+    sqrshrn         v4.4h, v4.4s, #dct16_shift_1<br>+    sqrshrn         v5.4h, v5.4s, #dct16_shift_1<br>+    stp             d4, d5, [x5], #16<br>+<br>+    addp            v6.4s, v12.4s, v8.4s            // v6 = [9 8]<br>+    addp            v7.4s, v13.4s, v9.4s            // v7 = [11 10]<br>+    addp            v8.4s, v14.4s, v10.4s           // v8 = [13 12]<br>+    addp            v9.4s, v15.4s, v11.4s           // v9 = [15 14]<br>+    addp            v6.4s, v6.4s, v7.4s             // v6 = [11 10 9 8]<br>+    addp            v7.4s, v8.4s, v9.4s             // v7 = [15 14 13 12]<br>+<br>+    sqrshrn         v6.4h, v6.4s, #dct16_shift_1<br>+    sqrshrn         v7.4h, v7.4s, #dct16_shift_1<br>+    stp             d6, d7, [x5], #16<br>+<br>+    sub             w4, w4, #1<br>+    cbnz            w4, 5b<br>+<br>+    ld1             
{v0.4s, v1.4s, v2.4s, v3.4s}, [x6]   // v0-v3 = 32-bit Pass 2 coefficients (overwrites the tbl index in v0)<br>+    mov             w4, #16<br>+    mov             x5, x1<br>+6:  // Pass 2<br>+<br>+    ldr             d16, [x5, #(16*2* 0)]<br>+    ldr             d17, [x5, #(16*2* 1)]<br>+    ldr             d18, [x5, #(16*2* 2)]<br>+    ldr             d19, [x5, #(16*2* 3)]<br>+    ldr             d20, [x5, #(16*2* 4)]<br>+    ldr             d21, [x5, #(16*2* 5)]<br>+    ldr             d22, [x5, #(16*2* 6)]<br>+    ldr             d23, [x5, #(16*2* 7)]<br>+    ldr             d24, [x5, #(16*2* 8)]<br>+    ldr             d25, [x5, #(16*2* 9)]<br>+    ldr             d26, [x5, #(16*2*10)]<br>+    ldr             d27, [x5, #(16*2*11)]<br>+    ldr             d28, [x5, #(16*2*12)]<br>+    ldr             d29, [x5, #(16*2*13)]<br>+    ldr             d30, [x5, #(16*2*14)]<br>+    ldr             d31, [x5, #(16*2*15)]<br>+<br>+    saddl           v4.4s, v16.4h, v31.4h           // v4  = E0<br>+    saddl           v5.4s, v17.4h, v30.4h           // v5  = E1<br>+    saddl           v6.4s, v18.4h, v29.4h           // v6  = E2<br>+    saddl           v7.4s, v19.4h, v28.4h           // v7  = E3<br>+    saddl           v8.4s, v20.4h, v27.4h           // v8  = E4<br>+    saddl           v9.4s, v21.4h, v26.4h           // v9  = E5<br>+    saddl           v10.4s, v22.4h, v25.4h          // v10 = E6<br>+    saddl           v11.4s, v23.4h, v24.4h          // v11 = E7<br>+<br>+// [ 1] = (90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 +  9*O7 + rnd) >> nShift;<br>+// [ 3] = (87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7 + rnd) >> nShift;<br>+// [ 5] = (80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7 + rnd) >> nShift;<br>+// [ 7] = (70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7 + rnd) >> nShift;<br>+// [ 9] = (57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6 + 70*O7 + rnd) >> nShift;<br>+// [11] = (43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6 - 80*O7 + rnd) >> nShift;<br>+// [13] = 
(25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6 + 87*O7 + rnd) >> nShift;<br>+// [15] = ( 9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7 + rnd) >> nShift;<br>+<br>+    ssubl           v16.4s, v16.4h, v31.4h          // v16 = O0<br>+    ssubl           v17.4s, v17.4h, v30.4h          // v17 = O1<br>+    ssubl           v18.4s, v18.4h, v29.4h          // v18 = O2<br>+    ssubl           v19.4s, v19.4h, v28.4h          // v19 = O3<br>+    ssubl           v20.4s, v20.4h, v27.4h          // v20 = O4<br>+    ssubl           v21.4s, v21.4h, v26.4h          // v21 = O5<br>+    ssubl           v22.4s, v22.4h, v25.4h          // v22 = O6<br>+    ssubl           v23.4s, v23.4h, v24.4h          // v23 = O7<br>+<br>+    orr             v24.16b, v18.16b, v19.16b<br>+    orr             v25.16b, v20.16b, v21.16b<br>+    orr             v26.16b, v22.16b, v23.16b<br>+    uqxtn           v24.4h, v24.4s<br>+    uqxtn           v25.4h, v25.4s<br>+    uqxtn           v26.4h, v26.4s<br>+    mov             x0, v24.d[0]                    // x0 = 0 iff O2 and O3 are zero in all 4 columns<br>+    mov             x2, v25.d[0]                    // x2 = 0 iff O4 and O5 are zero in all 4 columns<br>+    mov             x6, v26.d[0]                    // x6 = 0 iff O6 and O7 are zero in all 4 columns<br>+<br>+    mul             v24.4s, v16.4s, v2.s[0]         // v24 = [ 1] = 90*O0<br>+    mul             v25.4s, v16.4s, v2.s[1]         // v25 = [ 3] = 87*O0<br>+    mul             v26.4s, v16.4s, v2.s[2]         // v26 = [ 5] = 80*O0<br>+    mul             v27.4s, v16.4s, v2.s[3]         // v27 = [ 7] = 70*O0<br>+    mul             v28.4s, v16.4s, v3.s[0]         // v28 = [ 9] = 57*O0<br>+    mul             v29.4s, v16.4s, v3.s[1]         // v29 = [11] = 43*O0<br>+    mul             v30.4s, v16.4s, v3.s[2]         // v30 = [13] = 25*O0<br>+    mul             v31.4s, v16.4s, v3.s[3]         // v31 = [15] =  9*O0<br>+<br>+    mla             v24.4s, v17.4s, v2.s[1]         // v24 = [ 1] = 90*O0 + 87*O1<br>+    mla             v25.4s, v17.4s, v3.s[0]     
    // v25 = [ 3] = 87*O0 + 57*O1<br>+    mla             v26.4s, v17.4s, v3.s[3]         // v26 = [ 5] = 80*O0 +  9*O1<br>+    mls             v27.4s, v17.4s, v3.s[1]         // v27 = [ 7] = 70*O0 - 43*O1<br>+    mls             v28.4s, v17.4s, v2.s[2]         // v28 = [ 9] = 57*O0 - 80*O1<br>+    mls             v29.4s, v17.4s, v2.s[0]         // v29 = [11] = 43*O0 - 90*O1<br>+    mls             v30.4s, v17.4s, v2.s[3]         // v30 = [13] = 25*O0 - 70*O1<br>+    mls             v31.4s, v17.4s, v3.s[2]         // v31 = [15] =  9*O0 - 25*O1<br>+<br>+    cbz             x0, 1f                          // skip the O2/O3 terms when they are all zero<br>+<br>+    mla             v24.4s, v18.4s, v2.s[2]         // v24 = [ 1] = 90*O0 + 87*O1 + 80*O2<br>+    mla             v25.4s, v18.4s, v3.s[3]         // v25 = [ 3] = 87*O0 + 57*O1 +  9*O2<br>+    mls             v26.4s, v18.4s, v2.s[3]         // v26 = [ 5] = 80*O0 +  9*O1 - 70*O2<br>+    mls             v27.4s, v18.4s, v2.s[1]         // v27 = [ 7] = 70*O0 - 43*O1 - 87*O2<br>+    mls             v28.4s, v18.4s, v3.s[2]         // v28 = [ 9] = 57*O0 - 80*O1 - 25*O2<br>+    mla             v29.4s, v18.4s, v3.s[0]         // v29 = [11] = 43*O0 - 90*O1 + 57*O2<br>+    mla             v30.4s, v18.4s, v2.s[0]         // v30 = [13] = 25*O0 - 70*O1 + 90*O2<br>+    mla             v31.4s, v18.4s, v3.s[1]         // v31 = [15] =  9*O0 - 25*O1 + 43*O2<br>+<br>+    mla             v24.4s, v19.4s, v2.s[3]         // v24 = [ 1] = 90*O0 + 87*O1 + 80*O2 + 70*O3<br>+    mls             v25.4s, v19.4s, v3.s[1]         // v25 = [ 3] = 87*O0 + 57*O1 +  9*O2 - 43*O3<br>+    mls             v26.4s, v19.4s, v2.s[1]         // v26 = [ 5] = 80*O0 +  9*O1 - 70*O2 - 87*O3<br>+    mla             v27.4s, v19.4s, v3.s[3]         // v27 = [ 7] = 70*O0 - 43*O1 - 87*O2 +  9*O3<br>+    mla             v28.4s, v19.4s, v2.s[0]         // v28 = [ 9] = 57*O0 - 80*O1 - 25*O2 + 90*O3<br>+    mla             v29.4s, v19.4s, v3.s[2]         // v29 = [11] = 43*O0 - 90*O1 + 57*O2 + 25*O3<br>+    mls             v30.4s, v19.4s, 
v2.s[2]         // v30 = [13] = 25*O0 - 70*O1 + 90*O2 - 80*O3<br>+    mls             v31.4s, v19.4s, v3.s[0]         // v31 = [15] =  9*O0 - 25*O1 + 43*O2 - 57*O3<br>+<br>+1:<br>+    cbz             x2, 1f                          // skip the O4/O5 terms when they are all zero<br>+<br>+    mla             v24.4s, v20.4s, v3.s[0]         // v24 = [ 1] = 90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4<br>+    mls             v25.4s, v20.4s, v2.s[2]         // v25 = [ 3] = 87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4<br>+    mls             v26.4s, v20.4s, v3.s[2]         // v26 = [ 5] = 80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4<br>+    mla             v27.4s, v20.4s, v2.s[0]         // v27 = [ 7] = 70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4<br>+    mls             v28.4s, v20.4s, v3.s[3]         // v28 = [ 9] = 57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4<br>+    mls             v29.4s, v20.4s, v2.s[1]         // v29 = [11] = 43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4<br>+    mla             v30.4s, v20.4s, v3.s[1]         // v30 = [13] = 25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4<br>+    mla             v31.4s, v20.4s, v2.s[3]         // v31 = [15] =  9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4<br>+<br>+    mla             v24.4s, v21.4s, v3.s[1]         // v24 = [ 1] = 90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5<br>+    mls             v25.4s, v21.4s, v2.s[0]         // v25 = [ 3] = 87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5<br>+    mla             v26.4s, v21.4s, v3.s[0]         // v26 = [ 5] = 80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5<br>+    mla             v27.4s, v21.4s, v3.s[2]         // v27 = [ 7] = 70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5<br>+    mls             v28.4s, v21.4s, v2.s[1]         // v28 = [ 9] = 57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5<br>+    mla             v29.4s, v21.4s, v2.s[3]         // v29 = [11] = 43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5<br>+    mla             v30.4s, v21.4s, v3.s[3]         // v30 = [13] = 25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5<br>+    mls             v31.4s, v21.4s, v2.s[2]         // v31 = [15] =  
9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5<br>+<br>+1:<br>+    cbz             x6, 1f                          // skip the O6/O7 terms when they are all zero<br>+<br>+    mla             v24.4s, v22.4s, v3.s[2]         // v24 = [ 1] = 90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6<br>+    mls             v25.4s, v22.4s, v2.s[3]         // v25 = [ 3] = 87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6<br>+    mla             v26.4s, v22.4s, v2.s[0]         // v26 = [ 5] = 80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6<br>+    mls             v27.4s, v22.4s, v2.s[2]         // v27 = [ 7] = 70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6<br>+    mla             v28.4s, v22.4s, v3.s[1]         // v28 = [ 9] = 57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6<br>+    mla             v29.4s, v22.4s, v3.s[3]         // v29 = [11] = 43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6<br>+    mls             v30.4s, v22.4s, v3.s[0]         // v30 = [13] = 25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6<br>+    mla             v31.4s, v22.4s, v2.s[1]         // v31 = [15] =  9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6<br>+<br>+    mla             v24.4s, v23.4s, v3.s[3]         // v24 = [ 1] = 90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 +  9*O7<br>+    mls             v25.4s, v23.4s, v3.s[2]         // v25 = [ 3] = 87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7<br>+    mla             v26.4s, v23.4s, v3.s[1]         // v26 = [ 5] = 80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7<br>+    mls             v27.4s, v23.4s, v3.s[0]         // v27 = [ 7] = 70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7<br>+    mla             v28.4s, v23.4s, v2.s[3]         // v28 = [ 9] = 57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6 + 70*O7<br>+    mls             v29.4s, v23.4s, v2.s[2]         // v29 = [11] = 43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6 - 80*O7<br>+    mla             v30.4s, v23.4s, v2.s[1]         // v30 = [13] = 25*O0 - 70*O1 + 
90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6 + 87*O7<br>+    mls             v31.4s, v23.4s, v2.s[0]         // v31 = [15] =  9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7<br>+<br>+1:<br>+    sqrshrn         v24.4h, v24.4s, #dct16_shift_2 // [1]<br>+    sqrshrn         v25.4h, v25.4s, #dct16_shift_2 // [3]<br>+    sqrshrn         v26.4h, v26.4s, #dct16_shift_2 // [5]<br>+    sqrshrn         v27.4h, v27.4s, #dct16_shift_2 // [7]<br>+    sqrshrn         v28.4h, v28.4s, #dct16_shift_2 // [9]<br>+    sqrshrn         v29.4h, v29.4s, #dct16_shift_2 // [11]<br>+    sqrshrn         v30.4h, v30.4s, #dct16_shift_2 // [13]<br>+    sqrshrn         v31.4h, v31.4s, #dct16_shift_2 // [15]<br>+<br>+    str             d24, [x5, #(16*2* 1)]<br>+    str             d25, [x5, #(16*2* 3)]<br>+    str             d26, [x5, #(16*2* 5)]<br>+    str             d27, [x5, #(16*2* 7)]<br>+    str             d28, [x5, #(16*2* 9)]<br>+    str             d29, [x5, #(16*2*11)]<br>+    str             d30, [x5, #(16*2*13)]<br>+    str             d31, [x5, #(16*2*15)]<br>+<br>+// EE0 = E0 + E7;<br>+// EO0 = E0 - E7;<br>+// EE1 = E1 + E6;<br>+// EO1 = E1 - E6;<br>+// EE2 = E2 + E5;<br>+// EO2 = E2 - E5;<br>+// EE3 = E3 + E4;<br>+// EO3 = E3 - E4;<br>+    add             v16.4s, v4.4s, v11.4s           // v16 = EE0<br>+    sub             v17.4s, v4.4s, v11.4s           // v17 = EO0<br>+    add             v18.4s, v5.4s, v10.4s           // v18 = EE1<br>+    sub             v19.4s, v5.4s, v10.4s           // v19 = EO1<br>+    add             v20.4s, v6.4s, v9.4s            // v20 = EE2<br>+    sub             v21.4s, v6.4s, v9.4s            // v21 = EO2<br>+    add             v22.4s, v7.4s, v8.4s            // v22 = EE3<br>+    sub             v23.4s, v7.4s, v8.4s            // v23 = EO3<br>+<br>+// EEE0 = EE0 + EE3;<br>+// EEO0 = EE0 - EE3;<br>+// EEE1 = EE1 + EE2;<br>+// EEO1 = EE1 - EE2;<br>+<br>+    add             v24.4s, v16.4s, v22.4s          // v24 = EEE0<br>+    sub            
 v25.4s, v16.4s, v22.4s          // v25 = EEO0<br>+    add             v26.4s, v18.4s, v20.4s          // v26 = EEE1<br>+    sub             v27.4s, v18.4s, v20.4s          // v27 = EEO1<br>+<br>+    orr             v28.16b, v21.16b, v23.16b<br>+    uqxtn           v28.4h, v28.4s<br>+    mov             x0, v28.d[0]                    // x0 = 0 iff EO2 and EO3 are zero in all 4 columns<br>+<br>+// [ 0] = (64*EEE0 + 64*EEE1 + rnd) >> nShift;<br>+// [ 4] = (83*EEO0 + 36*EEO1 + rnd) >> nShift;<br>+// [ 8] = (64*EEE0 - 64*EEE1 + rnd) >> nShift;<br>+// [12] = (36*EEO0 - 83*EEO1 + rnd) >> nShift;<br>+<br>+    add             v28.4s, v24.4s, v26.4s          // [ 0] = EEE0+EEE1<br>+    mul             v29.4s, v25.4s, v0.s[1]         // [ 4] = 83*EEO0<br>+    sub             v30.4s, v24.4s, v26.4s          // [ 8] = EEE0-EEE1<br>+    mul             v31.4s, v25.4s, v0.s[2]         // [12] = 36*EEO0<br>+<br>+    shl             v28.4s, v28.4s, #6              // [ 0] = 64*EEE0 + 64*EEE1<br>+    mla             v29.4s, v27.4s, v0.s[2]         // [ 4] = 83*EEO0 + 36*EEO1<br>+    shl             v30.4s, v30.4s, #6              // [ 8] = 64*EEE0 - 64*EEE1<br>+    mls             v31.4s, v27.4s, v0.s[1]         // [12] = 36*EEO0 - 83*EEO1<br>+<br>+    sqrshrn         v28.4h, v28.4s, #dct16_shift_2  // [ 0]<br>+    sqrshrn         v29.4h, v29.4s, #dct16_shift_2  // [ 4]<br>+    sqrshrn         v30.4h, v30.4s, #dct16_shift_2  // [ 8]<br>+    sqrshrn         v31.4h, v31.4s, #dct16_shift_2  // [12]<br>+<br>+    str             d28, [x5, #(16*2* 0)]<br>+    str             d29, [x5, #(16*2* 4)]<br>+    str             d30, [x5, #(16*2* 8)]<br>+    str             d31, [x5, #(16*2*12)]<br>+<br>+// [ 2] = (89*EO0 + 75*EO1 + 50*EO2 + 18*EO3 + rnd) >> nShift;<br>+// [ 6] = (75*EO0 - 18*EO1 - 89*EO2 - 50*EO3 + rnd) >> nShift;<br>+// [10] = (50*EO0 - 89*EO1 + 18*EO2 + 75*EO3 + rnd) >> nShift;<br>+// [14] = (18*EO0 - 50*EO1 + 75*EO2 - 89*EO3 + rnd) >> nShift;<br>+<br>+    mul             v28.4s, v17.4s, v0.s[3]         
// [ 2] = 89*EO0<br>+    mul             v29.4s, v17.4s, v1.s[0]         // [ 6] = 75*EO0<br>+    mul             v30.4s, v17.4s, v1.s[1]         // [10] = 50*EO0<br>+    mul             v31.4s, v17.4s, v1.s[2]         // [14] = 18*EO0<br>+<br>+    mla             v28.4s, v19.4s, v1.s[0]         // [ 2] = 89*EO0 + 75*EO1<br>+    mls             v29.4s, v19.4s, v1.s[2]         // [ 6] = 75*EO0 - 18*EO1<br>+    mls             v30.4s, v19.4s, v0.s[3]         // [10] = 50*EO0 - 89*EO1<br>+    mls             v31.4s, v19.4s, v1.s[1]         // [14] = 18*EO0 - 50*EO1<br>+<br>+    cbz             x0, 1f                          // skip the EO2/EO3 terms when they are all zero<br>+<br>+    mla             v28.4s, v21.4s, v1.s[1]         // [ 2] = 89*EO0 + 75*EO1 + 50*EO2<br>+    mls             v29.4s, v21.4s, v0.s[3]         // [ 6] = 75*EO0 - 18*EO1 - 89*EO2<br>+    mla             v30.4s, v21.4s, v1.s[2]         // [10] = 50*EO0 - 89*EO1 + 18*EO2<br>+    mla             v31.4s, v21.4s, v1.s[0]         // [14] = 18*EO0 - 50*EO1 + 75*EO2<br>+<br>+    mla             v28.4s, v23.4s, v1.s[2]         // [ 2] = 89*EO0 + 75*EO1 + 50*EO2 + 18*EO3<br>+    mls             v29.4s, v23.4s, v1.s[1]         // [ 6] = 75*EO0 - 18*EO1 - 89*EO2 - 50*EO3<br>+    mla             v30.4s, v23.4s, v1.s[0]         // [10] = 50*EO0 - 89*EO1 + 18*EO2 + 75*EO3<br>+    mls             v31.4s, v23.4s, v0.s[3]         // [14] = 18*EO0 - 50*EO1 + 75*EO2 - 89*EO3<br>+<br>+1:<br>+<br>+    sqrshrn         v28.4h, v28.4s, #dct16_shift_2  // [ 2]<br>+    sqrshrn         v29.4h, v29.4s, #dct16_shift_2  // [ 6]<br>+    sqrshrn         v30.4h, v30.4s, #dct16_shift_2  // [10]<br>+    sqrshrn         v31.4h, v31.4s, #dct16_shift_2  // [14]<br>+<br>+    str             d28, [x5, #(16*2* 2)]<br>+    str             d29, [x5, #(16*2* 6)]<br>+    str             d30, [x5, #(16*2*10)]<br>+    str             d31, [x5, #(16*2*14)]<br>+<br>+    add             x5, x5, #(4*2)<br>+    sub             w4, w4, #4<br>+    cbnz            w4, 6b<br>+<br>+9:<br>+    ldp             d14, d15, 
[sp], #16<br>+    ldp             d12, d13, [sp], #16<br>+    ldp             d10, d11, [sp], #16<br>+    ldp             d8, d9, [sp], #16<br>+    ret<br>+endfunc<br>-- <br>2.36.0.windows.1<br><br></div></div>