[x265] [PATCH ARM 5/6] AArch64: DCT16x16

Pavan Tarun Chakka Venkata pavan.tarun at multicorewareinc.com
Thu Sep 12 13:42:14 UTC 2024


From 3068c5ff3f1c8173c0f7c1a20cf323dde19fffe5 Mon Sep 17 00:00:00 2001
From: Min Chen <chenm003 at 163.com>
Date: Thu, 12 Sep 2024 11:11:55 +0000
Subject: [PATCH 5/6] AArch64: DCT16x16

---
 source/common/aarch64/dct-prim.cpp |   3 +-
 source/common/aarch64/dct.S        | 417 +++++++++++++++++++++++++++++
 2 files changed, 419 insertions(+), 1 deletion(-)

diff --git a/source/common/aarch64/dct-prim.cpp b/source/common/aarch64/dct-prim.cpp
index 063dde845..8b523ceb0 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -14,6 +14,7 @@
 #define X265_PRAGMA_UNROLL(n)
 #endif

+extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t
srcStride);
 extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst,
intptr_t dstStride);

 namespace
@@ -1111,7 +1112,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
     p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;
     p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;
     p.cu[BLOCK_8x8].dct   = dct8_neon;
-    p.cu[BLOCK_16x16].dct = dct16_neon;
+    p.cu[BLOCK_16x16].dct = PFX(dct16_neon);
     p.cu[BLOCK_32x32].dct = dct32_neon;
     p.cu[BLOCK_4x4].idct   = idct4_neon;
     p.cu[BLOCK_16x16].idct = PFX(idct16_neon);
diff --git a/source/common/aarch64/dct.S b/source/common/aarch64/dct.S
index 959310b1f..68cf8778b 100644
--- a/source/common/aarch64/dct.S
+++ b/source/common/aarch64/dct.S
@@ -38,6 +38,9 @@
 .set idct16_shift_1, 7
 .set idct16_shift_2, 12

+.set dct16_shift_1, 3                   // rounding right-shift applied to dct16 pass-1 sums
+.set dct16_shift_2, 10                  // rounding right-shift applied to dct16 pass-2 sums; NOTE(review): 3/10 look 8-bit-depth specific - confirm
+
 .align 4
 // NOTE: Hardcoded due to asm syntax issue, don't reorder!
 tbl_const_idct_0:
@@ -67,6 +70,36 @@ tbl_const_idct_0:

     .byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3  // v18

+tbl_const_dct_0:                            // dct16 constants; dct16_neon reads them sequentially, so don't reorder
+    // EE: weights of EE0..EE3 (pass 1, 16-bit)
+    .hword 64,+64,+64,+64                   // v16 -> output [ 0]
+    .hword 83,+36,-36,-83                   // v17 -> output [ 4]
+    .hword 64,-64,-64,+64                   // v18 -> output [ 8]
+    .hword 36,-83,+83,-36                   // v19 -> output [12]
+
+    // EO: weights of EO0..EO3 (pass 1, 16-bit)
+    .hword 89,+75,+50,+18                   // v20 -> output [ 2]
+    .hword 75,-18,-89,-50                   // v21 -> output [ 6]
+    .hword 50,-89,+18,+75                   // v22 -> output [10]
+    .hword 18,-50,+75,-89                   // v23 -> output [14]
+
+    // O: weights of O0..O7 (pass 1, 16-bit)
+    .hword 90,+87,+80,+70,+57,+43,+25, +9   // v24 -> output [ 1]
+    .hword 87,+57, +9,-43,-80,-90,-70,-25   // v25 -> output [ 3]
+    .hword 80, +9,-70,-87,-25,+57,+90,+43   // v26 -> output [ 5]
+    .hword 70,-43,-87, +9,+90,+25,-80,-57   // v27 -> output [ 7]
+    .hword 57,-80,-25,+90, -9,-87,+43,+70   // v28 -> output [ 9]
+    .hword 43,-90,+57,+25,-87,+70, +9,-80   // v29 -> output [11]
+    .hword 25,-70,+90,-80,+43, +9,-57,+87   // v30 -> output [13]
+    .hword  9,-25,+43,-57,+70,-80,+87,-90   // v31 -> output [15]
+
+    .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1  // v0 (pass 1): tbl mask that reverses the order of eight 16-bit lanes
+//    .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9  // v1
+
+    .word 64, 83, 36, 89, 75, 50, 18,  0    // v0, v1 (pass 2): 32-bit even-part multipliers; the 64 and the trailing 0 are never referenced
+    .word 90, 87, 80, 70, 57, 43, 25,  9    // v2, v3 (pass 2): 32-bit odd-part multipliers
+
+
 // ***** idct 16x16 *****
 // void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
 function PFX(idct16_neon)
@@ -464,3 +497,387 @@ function PFX(idct16_neon)
     ldp             d8, d9, [sp], #16
     ret
 endfunc
+
+
+// ***** dct 16x16 *****
+// void dct16(const int16_t* src, int16_t* dst, intptr_t srcStride)
+function PFX(dct16_neon)
+// Register map (pass 1 transforms src rows into dst, pass 2 transforms dst columns in place)
+// x0  = src             (advanced one row per pass-1 iteration; reused as scratch in pass 2)
+// x1  = dst             (x5 = running dst cursor, w4 = loop counter)
+// x2  = srcStride       (doubled below to a byte stride; reused as scratch in pass 2)
+// x6  = tbl_const_dct_0 (table cursor; reused as scratch in pass 2 once all constants are loaded)
+
+    stp             d8, d9, [sp,#-16]!              // d8-d15 are callee-saved (AAPCS64) and are used below
+    stp             d10, d11, [sp,#-16]!
+    stp             d12, d13, [sp,#-16]!
+    stp             d14, d15, [sp,#-16]!
+
+    adr             x6, tbl_const_dct_0
+    ld4r            {v16.2d, v17.2d, v18.2d, v19.2d}, [x6], #32     // v16-v19 = EE rows, each replicated into both 64-bit halves
+    ld4r            {v20.2d, v21.2d, v22.2d, v23.2d}, [x6], #32     // v20-v23 = EO rows, replicated likewise
+    ld1             {v24.8h, v25.8h, v26.8h, v27.8h}, [x6], #64     // v24-v27 = O rows for outputs 1, 3, 5, 7
+    ld1             {v28.8h, v29.8h, v30.8h, v31.8h}, [x6], #64     // v28-v31 = O rows for outputs 9, 11, 13, 15
+    ldr             q0, [x6], #16                   // v0 = halfword-reversal tbl mask; x6 now points at the .word multipliers
+
+    add             x2, x2, x2                      // x2 = 2 * srcStride (int16_t elements -> bytes)
+    mov             x5, x1                          // x5 = pass-1 output cursor
+    mov             w4, #16                         // 16 source rows
+
+5:  // Pass 1: 16-point DCT of one source row per iteration, written to consecutive dst rows
+    ld1             {v2.8h, v3.8h}, [x0], x2        // v2 = src[0..7], v3 = src[8..15]; x0 += byte stride
+    tbl             v3.16b, {v3.16b}, v0.16b        // v3 = src[15..8] (halfword order reversed)
+
+    add             v4.8h, v2.8h, v3.8h             // v4 = E[07 06 05 04
03 02 01 00]
+    sub             v1.8h, v2.8h, v3.8h             // v1 = O[07 06 05 04
03 02 01 00]
+
+// EE0 = E0 + E7;
+// EO0 = E0 - E7;
+// EE1 = E1 + E6;
+// EO1 = E1 - E6;
+// EE2 = E2 + E5;
+// EO2 = E2 - E5;
+// EE3 = E3 + E4;
+// EO3 = E3 - E4;
+    tbl             v2.8b, {v4.16b}, v0.8b          // v2 = E[04 05 06 07]
+
+    add             v3.4h, v4.4h, v2.4h             // v3 = EE[03 02 01 00]
+    sub             v2.4h, v4.4h, v2.4h             // v2 = EO[03 02 01 00]
+
+// [ 0] = (64*EE0 + 64*EE1 + 64*EE2 + 64*EE3 + rnd) >> nShift;
                 // v16
+// [ 4] = (83*EE0 + 36*EE1 - 36*EE2 - 83*EE3 + rnd) >> nShift;
                 // v17
+// [ 8] = (64*EE0 - 64*EE1 - 64*EE2 + 64*EE3 + rnd) >> nShift;
                 // v18
+// [12] = (36*EE0 - 83*EE1 + 83*EE2 - 36*EE3 + rnd) >> nShift;
                 // v19
+
+// [ 2] = (89*EO0 + 75*EO1 + 50*EO2 + 18*EO3 + rnd) >> nShift;
                 // v20
+// [ 6] = (75*EO0 - 18*EO1 - 89*EO2 - 50*EO3 + rnd) >> nShift;
                 // v21
+// [10] = (50*EO0 - 89*EO1 + 18*EO2 + 75*EO3 + rnd) >> nShift;
                 // v22
+// [14] = (18*EO0 - 50*EO1 + 75*EO2 - 89*EO3 + rnd) >> nShift;
                 // v23
+
+// [ 1] = (90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 +  9*O7 +
rnd) >> nShift;  // v24
+// [ 3] = (87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7 +
rnd) >> nShift;  // v25
+// [ 5] = (80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7 +
rnd) >> nShift;  // v26
+// [ 7] = (70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7 +
rnd) >> nShift;  // v27
+// [ 9] = (57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6 + 70*O7 +
rnd) >> nShift;  // v28
+// [11] = (43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6 - 80*O7 +
rnd) >> nShift;  // v29
+// [13] = (25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6 + 87*O7 +
rnd) >> nShift;  // v30
+// [15] = ( 9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7 +
rnd) >> nShift;  // v31
+
+
+    smull           v4.4s, v1.4h, v24.4h            // v4  = [ 1] products for O0..O3 (one per lane)
+    smull           v5.4s, v1.4h, v25.4h            // v5  = [ 3]
+    smull           v6.4s, v1.4h, v26.4h            // v6  = [ 5]
+    smull           v7.4s, v1.4h, v27.4h            // v7  = [ 7]
+    smull           v8.4s, v1.4h, v28.4h            // v8  = [ 9]
+    smull           v9.4s, v1.4h, v29.4h            // v9  = [11]
+    smull           v10.4s, v1.4h, v30.4h           // v10 = [13]
+    smull           v11.4s, v1.4h, v31.4h           // v11 = [15]
+
+    smlal2          v4.4s, v1.8h, v24.8h            // v4  = [ 1] plus products for O4..O7 (still 4 partial sums)
+    smlal2          v5.4s, v1.8h, v25.8h            // v5  = [ 3]
+    smlal2          v6.4s, v1.8h, v26.8h            // v6  = [ 5]
+    smlal2          v7.4s, v1.8h, v27.8h            // v7  = [ 7]
+    smlal2          v8.4s, v1.8h, v28.8h            // v8  = [ 9]
+    smlal2          v9.4s, v1.8h, v29.8h            // v9  = [11]
+    smlal2          v10.4s, v1.8h, v30.8h           // v10 = [13]
+    smlal2          v11.4s, v1.8h, v31.8h           // v11 = [15]
+
+    smull           v12.4s, v3.4h, v16.4h           // v12 = [ 0] per-lane products
+    smull           v13.4s, v2.4h, v20.4h           // v13 = [ 2]
+    smull           v14.4s, v3.4h, v17.4h           // v14 = [ 4]
+    smull           v15.4s, v2.4h, v21.4h           // v15 = [ 6]
+
+    addp            v4.4s, v12.4s, v4.4s            // v4 = [1 0] (two half-sums each)
+    addp            v5.4s, v13.4s, v5.4s            // v5 = [3 2]
+    addp            v6.4s, v14.4s, v6.4s            // v6 = [5 4]
+    addp            v7.4s, v15.4s, v7.4s            // v7 = [7 6]
+    addp            v4.4s, v4.4s, v5.4s             // v4 = [3 2 1 0]
+    addp            v5.4s, v6.4s, v7.4s             // v5 = [7 6 5 4]
+
+    smull           v12.4s, v3.4h, v18.4h           // v12 = [ 8]
+    smull           v13.4s, v2.4h, v22.4h           // v13 = [10]
+    smull           v14.4s, v3.4h, v19.4h           // v14 = [12]
+    smull           v15.4s, v2.4h, v23.4h           // v15 = [14]
+
+    sqrshrn         v4.4h, v4.4s, #dct16_shift_1    // rounding shift right, saturating narrow to 16 bits
+    sqrshrn         v5.4h, v5.4s, #dct16_shift_1
+    stp             d4, d5, [x5], #16               // store coefficients 0..7 of this row
+
+    addp            v6.4s, v12.4s, v8.4s            // v6 = [9 8]
+    addp            v7.4s, v13.4s, v9.4s            // v7 = [11 10]
+    addp            v8.4s, v14.4s, v10.4s           // v8 = [13 12]
+    addp            v9.4s, v15.4s, v11.4s           // v9 = [15 14]
+    addp            v6.4s, v6.4s, v7.4s             // v6 = [11 10 9 8]
+    addp            v7.4s, v8.4s, v9.4s             // v7 = [15 14 13 12]
+
+    sqrshrn         v6.4h, v6.4s, #dct16_shift_1
+    sqrshrn         v7.4h, v7.4s, #dct16_shift_1
+    stp             d6, d7, [x5], #16               // store coefficients 8..15 of this row
+
+    sub             w4, w4, #1
+    cbnz            w4, 5b                          // next source row
+
+    ld1             {v0.4s, v1.4s, v2.4s, v3.4s}, [x6]              // v0-v3 = 32-bit multipliers (the .word rows of tbl_const_dct_0)
+    mov             w4, #16                         // 16 columns, 4 handled per iteration
+    mov             x5, x1                          // x5 = column cursor into dst
+6:  // Pass 2: 16-point DCT down 4 adjacent columns of dst per iteration, in place
+
+    ldr             d16, [x5, #(16*2* 0)]           // v16..v31 = rows 0..15 of these 4 columns (16*2 bytes per dst row)
+    ldr             d17, [x5, #(16*2* 1)]
+    ldr             d18, [x5, #(16*2* 2)]
+    ldr             d19, [x5, #(16*2* 3)]
+    ldr             d20, [x5, #(16*2* 4)]
+    ldr             d21, [x5, #(16*2* 5)]
+    ldr             d22, [x5, #(16*2* 6)]
+    ldr             d23, [x5, #(16*2* 7)]
+    ldr             d24, [x5, #(16*2* 8)]
+    ldr             d25, [x5, #(16*2* 9)]
+    ldr             d26, [x5, #(16*2*10)]
+    ldr             d27, [x5, #(16*2*11)]
+    ldr             d28, [x5, #(16*2*12)]
+    ldr             d29, [x5, #(16*2*13)]
+    ldr             d30, [x5, #(16*2*14)]
+    ldr             d31, [x5, #(16*2*15)]
+
+    saddl           v4.4s, v16.4h, v31.4h           // v4  = E0
+    saddl           v5.4s, v17.4h, v30.4h           // v5  = E1
+    saddl           v6.4s, v18.4h, v29.4h           // v6  = E2
+    saddl           v7.4s, v19.4h, v28.4h           // v7  = E3
+    saddl           v8.4s, v20.4h, v27.4h           // v8  = E4
+    saddl           v9.4s, v21.4h, v26.4h           // v9  = E5
+    saddl           v10.4s, v22.4h, v25.4h          // v10 = E6
+    saddl           v11.4s, v23.4h, v24.4h          // v11 = E7
+
+// [ 1] = (90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 +  9*O7 +
rnd) >> nShift;
+// [ 3] = (87*O0 + 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7 +
rnd) >> nShift;
+// [ 5] = (80*O0 +  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7 +
rnd) >> nShift;
+// [ 7] = (70*O0 - 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7 +
rnd) >> nShift;
+// [ 9] = (57*O0 - 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6 + 70*O7 +
rnd) >> nShift;
+// [11] = (43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6 - 80*O7 +
rnd) >> nShift;
+// [13] = (25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6 + 87*O7 +
rnd) >> nShift;
+// [15] = ( 9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7 +
rnd) >> nShift;
+
+    ssubl           v16.4s, v16.4h, v31.4h          // v16 = O0
+    ssubl           v17.4s, v17.4h, v30.4h          // v17 = O1
+    ssubl           v18.4s, v18.4h, v29.4h          // v18 = O2
+    ssubl           v19.4s, v19.4h, v28.4h          // v19 = O3
+    ssubl           v20.4s, v20.4h, v27.4h          // v20 = O4
+    ssubl           v21.4s, v21.4h, v26.4h          // v21 = O5
+    ssubl           v22.4s, v22.4h, v25.4h          // v22 = O6
+    ssubl           v23.4s, v23.4h, v24.4h          // v23 = O7
+
+    orr             v24.16b, v18.16b, v19.16b       // lane is nonzero if O2 or O3 is nonzero in that column
+    orr             v25.16b, v20.16b, v21.16b       // same for O4 | O5
+    orr             v26.16b, v22.16b, v23.16b       // same for O6 | O7
+    uqxtn           v24.4h, v24.4s                  // unsigned saturating narrow: a nonzero lane stays nonzero
+    uqxtn           v25.4h, v25.4s
+    uqxtn           v26.4h, v26.4s
+    mov             x0, v24.d[0]                    // x0 == 0 iff O2 and O3 are zero in all 4 columns
+    mov             x2, v25.d[0]                    // x2 == 0 iff O4 and O5 are zero in all 4 columns
+    mov             x6, v26.d[0]                    // x6 == 0 iff O6 and O7 are zero in all 4 columns
+
+    mul             v24.4s, v16.4s, v2.s[0]         // v24 = [ 1] = 90*O0
+    mul             v25.4s, v16.4s, v2.s[1]         // v25 = [ 3] = 87*O0
+    mul             v26.4s, v16.4s, v2.s[2]         // v26 = [ 5] = 80*O0
+    mul             v27.4s, v16.4s, v2.s[3]         // v27 = [ 7] = 70*O0
+    mul             v28.4s, v16.4s, v3.s[0]         // v28 = [ 9] = 57*O0
+    mul             v29.4s, v16.4s, v3.s[1]         // v29 = [11] = 43*O0
+    mul             v30.4s, v16.4s, v3.s[2]         // v30 = [13] = 25*O0
+    mul             v31.4s, v16.4s, v3.s[3]         // v31 = [15] =  9*O0
+
+    mla             v24.4s, v17.4s, v2.s[1]         // v24 = [ 1] = 90*O0
+ 87*O1
+    mla             v25.4s, v17.4s, v3.s[0]         // v25 = [ 3] = 87*O0
+ 57*O1
+    mla             v26.4s, v17.4s, v3.s[3]         // v26 = [ 5] = 80*O0
+  9*O1
+    mls             v27.4s, v17.4s, v3.s[1]         // v27 = [ 7] = 70*O0
- 43*O1
+    mls             v28.4s, v17.4s, v2.s[2]         // v28 = [ 9] = 57*O0
- 80*O1
+    mls             v29.4s, v17.4s, v2.s[0]         // v29 = [11] = 43*O0
- 90*O1
+    mls             v30.4s, v17.4s, v2.s[3]         // v30 = [13] = 25*O0
- 70*O1
+    mls             v31.4s, v17.4s, v3.s[2]         // v31 = [15] =  9*O0
- 25*O1
+
+    cbz             x0, 1f                          // O2 and O3 all zero: their terms add nothing, skip them
+
+    mla             v24.4s, v18.4s, v2.s[2]         // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2
+    mla             v25.4s, v18.4s, v3.s[3]         // v25 = [ 3] = 87*O0
+ 57*O1 +  9*O2
+    mls             v26.4s, v18.4s, v2.s[3]         // v26 = [ 5] = 80*O0
+  9*O1 - 70*O2
+    mls             v27.4s, v18.4s, v2.s[1]         // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2
+    mls             v28.4s, v18.4s, v3.s[2]         // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2
+    mla             v29.4s, v18.4s, v3.s[0]         // v29 = [11] = 43*O0
- 90*O1 + 57*O2
+    mla             v30.4s, v18.4s, v2.s[0]         // v30 = [13] = 25*O0
- 70*O1 + 90*O2
+    mla             v31.4s, v18.4s, v3.s[1]         // v31 = [15] =  9*O0
- 25*O1 + 43*O2
+
+    mla             v24.4s, v19.4s, v2.s[3]         // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3
+    mls             v25.4s, v19.4s, v3.s[1]         // v25 = [ 3] = 87*O0
+ 57*O1 +  9*O2 - 43*O3
+    mls             v26.4s, v19.4s, v2.s[1]         // v26 = [ 5] = 80*O0
+  9*O1 - 70*O2 - 87*O3
+    mla             v27.4s, v19.4s, v3.s[3]         // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 +  9*O3
+    mla             v28.4s, v19.4s, v2.s[0]         // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3
+    mla             v29.4s, v19.4s, v3.s[2]         // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3
+    mls             v30.4s, v19.4s, v2.s[2]         // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3
+    mls             v31.4s, v19.4s, v3.s[0]         // v31 = [15] =  9*O0
- 25*O1 + 43*O2 - 57*O3
+
+1:
+    cbz             x2, 1f                          // O4 and O5 all zero: skip their terms
+
+    mla             v24.4s, v20.4s, v3.s[0]         // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4
+    mls             v25.4s, v20.4s, v2.s[2]         // v25 = [ 3] = 87*O0
+ 57*O1 +  9*O2 - 43*O3 - 80*O4
+    mls             v26.4s, v20.4s, v3.s[2]         // v26 = [ 5] = 80*O0
+  9*O1 - 70*O2 - 87*O3 - 25*O4
+    mla             v27.4s, v20.4s, v2.s[0]         // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 +  9*O3 + 90*O4
+    mls             v28.4s, v20.4s, v3.s[3]         // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 -  9*O4
+    mls             v29.4s, v20.4s, v2.s[1]         // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4
+    mla             v30.4s, v20.4s, v3.s[1]         // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4
+    mla             v31.4s, v20.4s, v2.s[3]         // v31 = [15] =  9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4
+
+    mla             v24.4s, v21.4s, v3.s[1]         // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5
+    mls             v25.4s, v21.4s, v2.s[0]         // v25 = [ 3] = 87*O0
+ 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5
+    mla             v26.4s, v21.4s, v3.s[0]         // v26 = [ 5] = 80*O0
+  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5
+    mla             v27.4s, v21.4s, v3.s[2]         // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5
+    mls             v28.4s, v21.4s, v2.s[1]         // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5
+    mla             v29.4s, v21.4s, v2.s[3]         // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5
+    mla             v30.4s, v21.4s, v3.s[3]         // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5
+    mls             v31.4s, v21.4s, v2.s[2]         // v31 = [15] =  9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5
+
+1:
+    cbz             x6, 1f                          // O6 and O7 all zero: skip their terms
+
+    mla             v24.4s, v22.4s, v3.s[2]         // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6
+    mls             v25.4s, v22.4s, v2.s[3]         // v25 = [ 3] = 87*O0
+ 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6
+    mla             v26.4s, v22.4s, v2.s[0]         // v26 = [ 5] = 80*O0
+  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6
+    mls             v27.4s, v22.4s, v2.s[2]         // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6
+    mla             v28.4s, v22.4s, v3.s[1]         // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6
+    mla             v29.4s, v22.4s, v3.s[3]         // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6
+    mls             v30.4s, v22.4s, v3.s[0]         // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6
+    mla             v31.4s, v22.4s, v2.s[1]         // v31 = [15] =  9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6
+
+    mla             v24.4s, v23.4s, v3.s[3]         // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 +  9*O7
+    mls             v25.4s, v23.4s, v3.s[2]         // v25 = [ 3] = 87*O0
+ 57*O1 +  9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7
+    mla             v26.4s, v23.4s, v3.s[1]         // v26 = [ 5] = 80*O0
+  9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7
+    mls             v27.4s, v23.4s, v3.s[0]         // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 +  9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7
+    mla             v28.4s, v23.4s, v2.s[3]         // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 -  9*O4 - 87*O5 + 43*O6 + 70*O7
+    mls             v29.4s, v23.4s, v2.s[2]         // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 +  9*O6 - 80*O7
+    mla             v30.4s, v23.4s, v2.s[1]         // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4 +  9*O5 - 57*O6 + 87*O7
+    mls             v31.4s, v23.4s, v2.s[0]         // v31 = [15] =  9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7
+
+1:
+    sqrshrn         v24.4h, v24.4s, #dct16_shift_2 // [1]
+    sqrshrn         v25.4h, v25.4s, #dct16_shift_2 // [3]
+    sqrshrn         v26.4h, v26.4s, #dct16_shift_2 // [5]
+    sqrshrn         v27.4h, v27.4s, #dct16_shift_2 // [7]
+    sqrshrn         v28.4h, v28.4s, #dct16_shift_2 // [9]
+    sqrshrn         v29.4h, v29.4s, #dct16_shift_2 // [11]
+    sqrshrn         v30.4h, v30.4s, #dct16_shift_2 // [13]
+    sqrshrn         v31.4h, v31.4s, #dct16_shift_2 // [15]
+
+    str             d24, [x5, #(16*2* 1)]           // odd output rows overwrite dst; all 16 input rows were already loaded above
+    str             d25, [x5, #(16*2* 3)]
+    str             d26, [x5, #(16*2* 5)]
+    str             d27, [x5, #(16*2* 7)]
+    str             d28, [x5, #(16*2* 9)]
+    str             d29, [x5, #(16*2*11)]
+    str             d30, [x5, #(16*2*13)]
+    str             d31, [x5, #(16*2*15)]
+
+// EE0 = E0 + E7;
+// EO0 = E0 - E7;
+// EE1 = E1 + E6;
+// EO1 = E1 - E6;
+// EE2 = E2 + E5;
+// EO2 = E2 - E5;
+// EE3 = E3 + E4;
+// EO3 = E3 - E4;
+    add             v16.4s, v4.4s, v11.4s           // v16 = EE0
+    sub             v17.4s, v4.4s, v11.4s           // v17 = EO0
+    add             v18.4s, v5.4s, v10.4s           // v18 = EE1
+    sub             v19.4s, v5.4s, v10.4s           // v19 = EO1
+    add             v20.4s, v6.4s, v9.4s            // v20 = EE2
+    sub             v21.4s, v6.4s, v9.4s            // v21 = EO2
+    add             v22.4s, v7.4s, v8.4s            // v22 = EE3
+    sub             v23.4s, v7.4s, v8.4s            // v23 = EO3
+
+// EEE0 = EE0 + EE3;
+// EEO0 = EE0 - EE3;
+// EEE1 = EE1 + EE2;
+// EEO1 = EE1 - EE2;
+
+    add             v24.4s, v16.4s, v22.4s          // v24 = EEE0
+    sub             v25.4s, v16.4s, v22.4s          // v25 = EEO0
+    add             v26.4s, v18.4s, v20.4s          // v26 = EEE1
+    sub             v27.4s, v18.4s, v20.4s          // v27 = EEO1
+
+    orr             v28.16b, v21.16b, v23.16b       // lane is nonzero if EO2 or EO3 is nonzero in that column
+    uqxtn           v28.4h, v28.4s                  // unsigned saturating narrow: a nonzero lane stays nonzero
+    mov             x0, v28.d[0]                    // x0 == 0 iff EO2 and EO3 are zero in all 4 columns
+
+// [ 0] = (64*EEE0 + 64*EEE1 + rnd) >> nShift;
+// [ 4] = (83*EEO0 + 36*EEO1 + rnd) >> nShift;
+// [ 8] = (64*EEE0 - 64*EEE1 + rnd) >> nShift;
+// [12] = (36*EEO0 - 83*EEO1 + rnd) >> nShift;
+
+    add             v28.4s, v24.4s, v26.4s          // [ 0] = EEE0+EEE1
+    mul             v29.4s, v25.4s, v0.s[1]         // [ 4] = 83*EEO0
+    sub             v30.4s, v24.4s, v26.4s          // [ 8] = EEE0-EEE1
+    mul             v31.4s, v25.4s, v0.s[2]         // [12] = 36*EEO0
+
+    shl             v28.4s, v28.4s, #6              // [ 0] = 64*EEE0 +
64*EEE1
+    mla             v29.4s, v27.4s, v0.s[2]         // [ 4] = 83*EEO0 +
36*EEO1
+    shl             v30.4s, v30.4s, #6              // [ 8] = 64*EEE0 -
64*EEE1
+    mls             v31.4s, v27.4s, v0.s[1]         // [12] = 36*EEO0 -
83*EEO1
+
+    sqrshrn         v28.4h, v28.4s, #dct16_shift_2  // [ 0]
+    sqrshrn         v29.4h, v29.4s, #dct16_shift_2  // [ 4]
+    sqrshrn         v30.4h, v30.4s, #dct16_shift_2  // [ 8]
+    sqrshrn         v31.4h, v31.4s, #dct16_shift_2  // [12]
+
+    str             d28, [x5, #(16*2* 0)]           // output rows 0, 4, 8, 12 of these 4 columns
+    str             d29, [x5, #(16*2* 4)]
+    str             d30, [x5, #(16*2* 8)]
+    str             d31, [x5, #(16*2*12)]
+
+// [ 2] = (89*EO0 + 75*EO1 + 50*EO2 + 18*EO3 + rnd) >> nShift;
+// [ 6] = (75*EO0 - 18*EO1 - 89*EO2 - 50*EO3 + rnd) >> nShift;
+// [10] = (50*EO0 - 89*EO1 + 18*EO2 + 75*EO3 + rnd) >> nShift;
+// [14] = (18*EO0 - 50*EO1 + 75*EO2 - 89*EO3 + rnd) >> nShift;
+
+    mul             v28.4s, v17.4s, v0.s[3]         // [ 2] = 89*EO0
+    mul             v29.4s, v17.4s, v1.s[0]         // [ 6] = 75*EO0
+    mul             v30.4s, v17.4s, v1.s[1]         // [10] = 50*EO0
+    mul             v31.4s, v17.4s, v1.s[2]         // [14] = 18*EO0
+
+    mla             v28.4s, v19.4s, v1.s[0]         // [ 2] = 89*EO0 +
75*EO1
+    mls             v29.4s, v19.4s, v1.s[2]         // [ 6] = 75*EO0 -
18*EO1
+    mls             v30.4s, v19.4s, v0.s[3]         // [10] = 50*EO0 -
89*EO1
+    mls             v31.4s, v19.4s, v1.s[1]         // [14] = 18*EO0 -
50*EO1
+
+    cbz             x0, 1f                          // EO2 and EO3 all zero: skip their terms
+
+    mla             v28.4s, v21.4s, v1.s[1]         // [ 2] = 89*EO0 +
75*EO1 + 50*EO2
+    mls             v29.4s, v21.4s, v0.s[3]         // [ 6] = 75*EO0 -
18*EO1 - 89*EO2
+    mla             v30.4s, v21.4s, v1.s[2]         // [10] = 50*EO0 -
89*EO1 + 18*EO2
+    mla             v31.4s, v21.4s, v1.s[0]         // [14] = 18*EO0 -
50*EO1 + 75*EO2
+
+    mla             v28.4s, v23.4s, v1.s[2]         // [ 2] = 89*EO0 +
75*EO1 + 50*EO2 + 18*EO3
+    mls             v29.4s, v23.4s, v1.s[1]         // [ 6] = 75*EO0 -
18*EO1 - 89*EO2 - 50*EO3
+    mla             v30.4s, v23.4s, v1.s[0]         // [10] = 50*EO0 -
89*EO1 + 18*EO2 + 75*EO3
+    mls             v31.4s, v23.4s, v0.s[3]         // [14] = 18*EO0 -
50*EO1 + 75*EO2 - 89*EO3
+
+1:
+
+    sqrshrn         v28.4h, v28.4s, #dct16_shift_2  // [ 2]
+    sqrshrn         v29.4h, v29.4s, #dct16_shift_2  // [ 6]
+    sqrshrn         v30.4h, v30.4s, #dct16_shift_2  // [10]
+    sqrshrn         v31.4h, v31.4s, #dct16_shift_2  // [14]
+
+    str             d28, [x5, #(16*2* 2)]           // output rows 2, 6, 10, 14 of these 4 columns
+    str             d29, [x5, #(16*2* 6)]
+    str             d30, [x5, #(16*2*10)]
+    str             d31, [x5, #(16*2*14)]
+
+    add             x5, x5, #(4*2)                  // advance 4 columns (4 x int16_t)
+    sub             w4, w4, #4
+    cbnz            w4, 6b                          // loop until all 16 columns are done
+
+9:  // Epilogue: restore d8-d15 in reverse order of the saves at entry
+    ldp             d14, d15, [sp], #16
+    ldp             d12, d13, [sp], #16
+    ldp             d10, d11, [sp], #16
+    ldp             d8, d9, [sp], #16
+    ret
+endfunc
-- 
2.36.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/0e1e5686/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0005-AArch64-DCT16x16.patch
Type: application/octet-stream
Size: 24417 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/0e1e5686/attachment-0001.obj>


More information about the x265-devel mailing list