[x265] [PATCH ARM 5/6] AArch64: DCT16x16
Pavan Tarun Chakka Venkata
pavan.tarun at multicorewareinc.com
Thu Sep 12 13:42:14 UTC 2024
>From 3068c5ff3f1c8173c0f7c1a20cf323dde19fffe5 Mon Sep 17 00:00:00 2001
From: Min Chen <chenm003 at 163.com>
Date: Thu, 12 Sep 2024 11:11:55 +0000
Subject: [PATCH 5/6] AArch64: DCT16x16
---
source/common/aarch64/dct-prim.cpp | 3 +-
source/common/aarch64/dct.S | 417 +++++++++++++++++++++++++++++
2 files changed, 419 insertions(+), 1 deletion(-)
diff --git a/source/common/aarch64/dct-prim.cpp
b/source/common/aarch64/dct-prim.cpp
index 063dde845..8b523ceb0 100644
--- a/source/common/aarch64/dct-prim.cpp
+++ b/source/common/aarch64/dct-prim.cpp
@@ -14,6 +14,7 @@
#define X265_PRAGMA_UNROLL(n)
#endif
+extern "C" void PFX(dct16_neon)(const int16_t *src, int16_t *dst, intptr_t
srcStride);
extern "C" void PFX(idct16_neon)(const int16_t *src, int16_t *dst,
intptr_t dstStride);
namespace
@@ -1111,7 +1112,7 @@ void setupDCTPrimitives_neon(EncoderPrimitives &p)
p.cu[BLOCK_16x16].psyRdoQuant = psyRdoQuant_neon<4>;
p.cu[BLOCK_32x32].psyRdoQuant = psyRdoQuant_neon<5>;
p.cu[BLOCK_8x8].dct = dct8_neon;
- p.cu[BLOCK_16x16].dct = dct16_neon;
+ p.cu[BLOCK_16x16].dct = PFX(dct16_neon);
p.cu[BLOCK_32x32].dct = dct32_neon;
p.cu[BLOCK_4x4].idct = idct4_neon;
p.cu[BLOCK_16x16].idct = PFX(idct16_neon);
diff --git a/source/common/aarch64/dct.S b/source/common/aarch64/dct.S
index 959310b1f..68cf8778b 100644
--- a/source/common/aarch64/dct.S
+++ b/source/common/aarch64/dct.S
@@ -38,6 +38,9 @@
.set idct16_shift_1, 7
.set idct16_shift_2, 12
+.set dct16_shift_1, 3
+.set dct16_shift_2, 10
+
.align 4
// NOTE: Hardcoded due to asm syntax issue, don't reorder!
tbl_const_idct_0:
@@ -67,6 +70,36 @@ tbl_const_idct_0:
.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 // v18
+tbl_const_dct_0:
+ // EE: 4-tap rows for outputs 0, 4, 8, 12 (pass 1 loads them with ld4r)
+ .hword 64,+64,+64,+64 // v16
+ .hword 83,+36,-36,-83 // v17
+ .hword 64,-64,-64,+64 // v18
+ .hword 36,-83,+83,-36 // v19
+
+ // EO: 4-tap rows for outputs 2, 6, 10, 14 (pass 1 loads them with ld4r)
+ .hword 89,+75,+50,+18 // v20
+ .hword 75,-18,-89,-50 // v21
+ .hword 50,-89,+18,+75 // v22
+ .hword 18,-50,+75,-89 // v23
+
+ // O: 8-tap rows for the odd outputs 1, 3, ..., 15 (pass 1)
+ .hword 90,+87,+80,+70,+57,+43,+25, +9 // v24
+ .hword 87,+57, +9,-43,-80,-90,-70,-25 // v25
+ .hword 80, +9,-70,-87,-25,+57,+90,+43 // v26
+ .hword 70,-43,-87, +9,+90,+25,-80,-57 // v27
+ .hword 57,-80,-25,+90, -9,-87,+43,+70 // v28
+ .hword 43,-90,+57,+25,-87,+70, +9,-80 // v29
+ .hword 25,-70,+90,-80,+43, +9,-57,+87 // v30
+ .hword 9,-25,+43,-57,+70,-80,+87,-90 // v31
+
+ .byte 14, 15, 12, 13, 10, 11, 8, 9, 6, 7, 4, 5, 2, 3, 0, 1 // v0: tbl index vector that reverses the order of eight halfwords (pass 1)
+// .byte 6, 7, 4, 5, 2, 3, 0, 1, 14, 15, 12, 13, 10, 11, 8, 9 // v1 (disabled alternative, not assembled)
+
+ .word 64, 83, 36, 89, 75, 50, 18, 0 // v0, v1: 32-bit coefficient magnitudes used by lane in pass 2
+ .word 90, 87, 80, 70, 57, 43, 25, 9 // v2, v3: 32-bit odd-part magnitudes used by lane in pass 2
+
+
// ***** idct 16x16 *****
// void idct16(const int16_t* src, int16_t* dst, intptr_t dstStride)
function PFX(idct16_neon)
@@ -464,3 +497,387 @@ function PFX(idct16_neon)
ldp d8, d9, [sp], #16
ret
endfunc
+
+
+// ***** dct 16x16 *****
+// void dct16(const int16_t* src, int16_t* dst, intptr_t srcStride)
+function PFX(dct16_neon)
+// Register map (x0, x2 and x6 are reused as scratch for zero tests in pass 2)
+// x0 = src
+// x1 = dst
+// x2 = srcStride (doubled below before it is used as the per-row load increment)
+// x6 = tbl_const_dct_0 pointer, w4 = loop counter, x5 = dst cursor (x3 is unused)
+
+ stp d8, d9, [sp,#-16]! // save d8-d15: v8-v15 are written below and their low halves are callee-saved
+ stp d10, d11, [sp,#-16]!
+ stp d12, d13, [sp,#-16]!
+ stp d14, d15, [sp,#-16]!
+
+ adr x6, tbl_const_dct_0
+ ld4r {v16.2d, v17.2d, v18.2d, v19.2d}, [x6], #32 // v16-v19 = EE rows, each replicated into both 64-bit halves
+ ld4r {v20.2d, v21.2d, v22.2d, v23.2d}, [x6], #32 // v20-v23 = EO rows, replicated the same way
+ ld1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x6], #64 // v24-v27 = O rows for outputs 1, 3, 5, 7
+ ld1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x6], #64 // v28-v31 = O rows for outputs 9, 11, 13, 15
+ ldr q0, [x6], #16 // v0 = halfword-reversing tbl indices; x6 now points at the .word constants
+
+ add x2, x2, x2 // x2 *= 2: used as the byte post-increment of the 16-bit row loads
+ mov x5, x1 // x5 = write cursor into dst
+ mov w4, #16 // 16 source rows
+
+5: // Pass 1: one source row per iteration; its 16 outputs are stored contiguously at x5
+ ld1 {v2.8h, v3.8h}, [x0], x2 // v2 = src[0..7], v3 = src[8..15], then advance to the next row
+ tbl v3.16b, {v3.16b}, v0.16b // reverse the halfwords of v3 so lane i holds src[15-i]
+
+ add v4.8h, v2.8h, v3.8h // v4 = E[07 06 05 04
03 02 01 00]
+ sub v1.8h, v2.8h, v3.8h // v1 = O[07 06 05 04
03 02 01 00]
+
+// EE0 = E0 + E7;
+// EO0 = E0 - E7;
+// EE1 = E1 + E6;
+// EO1 = E1 - E6;
+// EE2 = E2 + E5;
+// EO2 = E2 - E5;
+// EE3 = E3 + E4;
+// EO3 = E3 - E4;
+ tbl v2.8b, {v4.16b}, v0.8b // v2 = E[04 05 06 07]
+
+ add v3.4h, v4.4h, v2.4h // v3 = EE[03 02 01 00]
+ sub v2.4h, v4.4h, v2.4h // v2 = EO[03 02 01 00]
+
+// [ 0] = (64*EE0 + 64*EE1 + 64*EE2 + 64*EE3 + rnd) >> nShift;
// v16
+// [ 4] = (83*EE0 + 36*EE1 - 36*EE2 - 83*EE3 + rnd) >> nShift;
// v17
+// [ 8] = (64*EE0 - 64*EE1 - 64*EE2 + 64*EE3 + rnd) >> nShift;
// v18
+// [12] = (36*EE0 - 83*EE1 + 83*EE2 - 36*EE3 + rnd) >> nShift;
// v19
+
+// [ 2] = (89*EO0 + 75*EO1 + 50*EO2 + 18*EO3 + rnd) >> nShift;
// v20
+// [ 6] = (75*EO0 - 18*EO1 - 89*EO2 - 50*EO3 + rnd) >> nShift;
// v21
+// [10] = (50*EO0 - 89*EO1 + 18*EO2 + 75*EO3 + rnd) >> nShift;
// v22
+// [14] = (18*EO0 - 50*EO1 + 75*EO2 - 89*EO3 + rnd) >> nShift;
// v23
+
+// [ 1] = (90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 + 9*O7 +
rnd) >> nShift; // v24
+// [ 3] = (87*O0 + 57*O1 + 9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7 +
rnd) >> nShift; // v25
+// [ 5] = (80*O0 + 9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7 +
rnd) >> nShift; // v26
+// [ 7] = (70*O0 - 43*O1 - 87*O2 + 9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7 +
rnd) >> nShift; // v27
+// [ 9] = (57*O0 - 80*O1 - 25*O2 + 90*O3 - 9*O4 - 87*O5 + 43*O6 + 70*O7 +
rnd) >> nShift; // v28
+// [11] = (43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 + 9*O6 - 80*O7 +
rnd) >> nShift; // v29
+// [13] = (25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 + 9*O5 - 57*O6 + 87*O7 +
rnd) >> nShift; // v30
+// [15] = ( 9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7 +
rnd) >> nShift; // v31
+
+
+ smull v4.4s, v1.4h, v24.4h // v4 = [ 1]
+ smull v5.4s, v1.4h, v25.4h // v5 = [ 3]
+ smull v6.4s, v1.4h, v26.4h // v6 = [ 5]
+ smull v7.4s, v1.4h, v27.4h // v7 = [ 7]
+ smull v8.4s, v1.4h, v28.4h // v8 = [ 9]
+ smull v9.4s, v1.4h, v29.4h // v9 = [11]
+ smull v10.4s, v1.4h, v30.4h // v10 = [13]
+ smull v11.4s, v1.4h, v31.4h // v11 = [15]
+
+ smlal2 v4.4s, v1.8h, v24.8h // v4 = [ 1]
+ smlal2 v5.4s, v1.8h, v25.8h // v5 = [ 3]
+ smlal2 v6.4s, v1.8h, v26.8h // v6 = [ 5]
+ smlal2 v7.4s, v1.8h, v27.8h // v7 = [ 7]
+ smlal2 v8.4s, v1.8h, v28.8h // v8 = [ 9]
+ smlal2 v9.4s, v1.8h, v29.8h // v9 = [11]
+ smlal2 v10.4s, v1.8h, v30.8h // v10 = [13]
+ smlal2 v11.4s, v1.8h, v31.8h // v11 = [15]
+
+ smull v12.4s, v3.4h, v16.4h // v12 = [ 0]
+ smull v13.4s, v2.4h, v20.4h // v13 = [ 2]
+ smull v14.4s, v3.4h, v17.4h // v14 = [ 4]
+ smull v15.4s, v2.4h, v21.4h // v15 = [ 6]
+
+ addp v4.4s, v12.4s, v4.4s // v4 = [1 0]
+ addp v5.4s, v13.4s, v5.4s // v5 = [3 2]
+ addp v6.4s, v14.4s, v6.4s // v6 = [5 4]
+ addp v7.4s, v15.4s, v7.4s // v7 = [7 6]
+ addp v4.4s, v4.4s, v5.4s // v4 = [3 2 1 0]
+ addp v5.4s, v6.4s, v7.4s // v5 = [7 6 5 4]
+
+ smull v12.4s, v3.4h, v18.4h // v12 = [ 8]
+ smull v13.4s, v2.4h, v22.4h // v13 = [10]
+ smull v14.4s, v3.4h, v19.4h // v14 = [12]
+ smull v15.4s, v2.4h, v23.4h // v15 = [14]
+
+ sqrshrn v4.4h, v4.4s, #dct16_shift_1 // rounding right shift, then saturating narrow to 16 bits
+ sqrshrn v5.4h, v5.4s, #dct16_shift_1
+ stp d4, d5, [x5], #16 // store outputs 0..7 of this row
+
+ addp v6.4s, v12.4s, v8.4s // v6 = [9 8]
+ addp v7.4s, v13.4s, v9.4s // v7 = [11 10]
+ addp v8.4s, v14.4s, v10.4s // v8 = [13 12]
+ addp v9.4s, v15.4s, v11.4s // v9 = [15 14]
+ addp v6.4s, v6.4s, v7.4s // v6 = [11 10 9 8]
+ addp v7.4s, v8.4s, v9.4s // v7 = [15 14 13 12]
+
+ sqrshrn v6.4h, v6.4s, #dct16_shift_1
+ sqrshrn v7.4h, v7.4s, #dct16_shift_1
+ stp d6, d7, [x5], #16 // store outputs 8..15 of this row
+
+ sub w4, w4, #1
+ cbnz w4, 5b
+
+ ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x6] // v0-v3 = 32-bit coefficient constants (the two .word rows of the table)
+ mov w4, #16 // 16 columns, consumed 4 per iteration
+ mov x5, x1 // x5 = first column group of the pass-1 result in dst
+6: // Pass 2: transforms dst in place down its columns, four columns per iteration
+
+ ldr d16, [x5, #(16*2* 0)] // d16..d31 = the same 4 columns taken from rows 0..15 (row pitch 16*2 bytes)
+ ldr d17, [x5, #(16*2* 1)]
+ ldr d18, [x5, #(16*2* 2)]
+ ldr d19, [x5, #(16*2* 3)]
+ ldr d20, [x5, #(16*2* 4)]
+ ldr d21, [x5, #(16*2* 5)]
+ ldr d22, [x5, #(16*2* 6)]
+ ldr d23, [x5, #(16*2* 7)]
+ ldr d24, [x5, #(16*2* 8)]
+ ldr d25, [x5, #(16*2* 9)]
+ ldr d26, [x5, #(16*2*10)]
+ ldr d27, [x5, #(16*2*11)]
+ ldr d28, [x5, #(16*2*12)]
+ ldr d29, [x5, #(16*2*13)]
+ ldr d30, [x5, #(16*2*14)]
+ ldr d31, [x5, #(16*2*15)]
+
+ saddl v4.4s, v16.4h, v31.4h // v4 = E0
+ saddl v5.4s, v17.4h, v30.4h // v5 = E1
+ saddl v6.4s, v18.4h, v29.4h // v6 = E2
+ saddl v7.4s, v19.4h, v28.4h // v7 = E3
+ saddl v8.4s, v20.4h, v27.4h // v8 = E4
+ saddl v9.4s, v21.4h, v26.4h // v9 = E5
+ saddl v10.4s, v22.4h, v25.4h // v10 = E6
+ saddl v11.4s, v23.4h, v24.4h // v11 = E7
+
+// [ 1] = (90*O0 + 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 + 9*O7 +
rnd) >> nShift;
+// [ 3] = (87*O0 + 57*O1 + 9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7 +
rnd) >> nShift;
+// [ 5] = (80*O0 + 9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7 +
rnd) >> nShift;
+// [ 7] = (70*O0 - 43*O1 - 87*O2 + 9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7 +
rnd) >> nShift;
+// [ 9] = (57*O0 - 80*O1 - 25*O2 + 90*O3 - 9*O4 - 87*O5 + 43*O6 + 70*O7 +
rnd) >> nShift;
+// [11] = (43*O0 - 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 + 9*O6 - 80*O7 +
rnd) >> nShift;
+// [13] = (25*O0 - 70*O1 + 90*O2 - 80*O3 + 43*O4 + 9*O5 - 57*O6 + 87*O7 +
rnd) >> nShift;
+// [15] = ( 9*O0 - 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7 +
rnd) >> nShift;
+
+ ssubl v16.4s, v16.4h, v31.4h // v16 = O0
+ ssubl v17.4s, v17.4h, v30.4h // v17 = O1
+ ssubl v18.4s, v18.4h, v29.4h // v18 = O2
+ ssubl v19.4s, v19.4h, v28.4h // v19 = O3
+ ssubl v20.4s, v20.4h, v27.4h // v20 = O4
+ ssubl v21.4s, v21.4h, v26.4h // v21 = O5
+ ssubl v22.4s, v22.4h, v25.4h // v22 = O6
+ ssubl v23.4s, v23.4h, v24.4h // v23 = O7
+
+ orr v24.16b, v18.16b, v19.16b // OR each pair so a single test covers both inputs
+ orr v25.16b, v20.16b, v21.16b
+ orr v26.16b, v22.16b, v23.16b
+ uqxtn v24.4h, v24.4s // unsigned saturating narrow: a lane stays zero only if it was zero
+ uqxtn v25.4h, v25.4s
+ uqxtn v26.4h, v26.4s
+ mov x0, v24.d[0] // x0 = 0 iff O2 and O3 are zero in all 4 columns
+ mov x2, v25.d[0] // x2 = 0 iff O4 and O5 are zero in all 4 columns
+ mov x6, v26.d[0] // x6 = 0 iff O6 and O7 are zero in all 4 columns
+
+ mul v24.4s, v16.4s, v2.s[0] // v24 = [ 1] = 90*O0
+ mul v25.4s, v16.4s, v2.s[1] // v25 = [ 3] = 87*O0
+ mul v26.4s, v16.4s, v2.s[2] // v26 = [ 5] = 80*O0
+ mul v27.4s, v16.4s, v2.s[3] // v27 = [ 7] = 70*O0
+ mul v28.4s, v16.4s, v3.s[0] // v28 = [ 9] = 57*O0
+ mul v29.4s, v16.4s, v3.s[1] // v29 = [11] = 43*O0
+ mul v30.4s, v16.4s, v3.s[2] // v30 = [13] = 25*O0
+ mul v31.4s, v16.4s, v3.s[3] // v31 = [15] = 9*O0
+
+ mla v24.4s, v17.4s, v2.s[1] // v24 = [ 1] = 90*O0
+ 87*O1
+ mla v25.4s, v17.4s, v3.s[0] // v25 = [ 3] = 87*O0
+ 57*O1
+ mla v26.4s, v17.4s, v3.s[3] // v26 = [ 5] = 80*O0
+ 9*O1
+ mls v27.4s, v17.4s, v3.s[1] // v27 = [ 7] = 70*O0
- 43*O1
+ mls v28.4s, v17.4s, v2.s[2] // v28 = [ 9] = 57*O0
- 80*O1
+ mls v29.4s, v17.4s, v2.s[0] // v29 = [11] = 43*O0
- 90*O1
+ mls v30.4s, v17.4s, v2.s[3] // v30 = [13] = 25*O0
- 70*O1
+ mls v31.4s, v17.4s, v3.s[2] // v31 = [15] = 9*O0
- 25*O1
+
+ cbz x0, 1f // skip the O2/O3 terms when they are all zero
+
+ mla v24.4s, v18.4s, v2.s[2] // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2
+ mla v25.4s, v18.4s, v3.s[3] // v25 = [ 3] = 87*O0
+ 57*O1 + 9*O2
+ mls v26.4s, v18.4s, v2.s[3] // v26 = [ 5] = 80*O0
+ 9*O1 - 70*O2
+ mls v27.4s, v18.4s, v2.s[1] // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2
+ mls v28.4s, v18.4s, v3.s[2] // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2
+ mla v29.4s, v18.4s, v3.s[0] // v29 = [11] = 43*O0
- 90*O1 + 57*O2
+ mla v30.4s, v18.4s, v2.s[0] // v30 = [13] = 25*O0
- 70*O1 + 90*O2
+ mla v31.4s, v18.4s, v3.s[1] // v31 = [15] = 9*O0
- 25*O1 + 43*O2
+
+ mla v24.4s, v19.4s, v2.s[3] // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3
+ mls v25.4s, v19.4s, v3.s[1] // v25 = [ 3] = 87*O0
+ 57*O1 + 9*O2 - 43*O3
+ mls v26.4s, v19.4s, v2.s[1] // v26 = [ 5] = 80*O0
+ 9*O1 - 70*O2 - 87*O3
+ mla v27.4s, v19.4s, v3.s[3] // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 + 9*O3
+ mla v28.4s, v19.4s, v2.s[0] // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3
+ mla v29.4s, v19.4s, v3.s[2] // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3
+ mls v30.4s, v19.4s, v2.s[2] // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3
+ mls v31.4s, v19.4s, v3.s[0] // v31 = [15] = 9*O0
- 25*O1 + 43*O2 - 57*O3
+
+1:
+ cbz x2, 1f // skip the O4/O5 terms when they are all zero
+
+ mla v24.4s, v20.4s, v3.s[0] // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4
+ mls v25.4s, v20.4s, v2.s[2] // v25 = [ 3] = 87*O0
+ 57*O1 + 9*O2 - 43*O3 - 80*O4
+ mls v26.4s, v20.4s, v3.s[2] // v26 = [ 5] = 80*O0
+ 9*O1 - 70*O2 - 87*O3 - 25*O4
+ mla v27.4s, v20.4s, v2.s[0] // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 + 9*O3 + 90*O4
+ mls v28.4s, v20.4s, v3.s[3] // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 - 9*O4
+ mls v29.4s, v20.4s, v2.s[1] // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4
+ mla v30.4s, v20.4s, v3.s[1] // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4
+ mla v31.4s, v20.4s, v2.s[3] // v31 = [15] = 9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4
+
+ mla v24.4s, v21.4s, v3.s[1] // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5
+ mls v25.4s, v21.4s, v2.s[0] // v25 = [ 3] = 87*O0
+ 57*O1 + 9*O2 - 43*O3 - 80*O4 - 90*O5
+ mla v26.4s, v21.4s, v3.s[0] // v26 = [ 5] = 80*O0
+ 9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5
+ mla v27.4s, v21.4s, v3.s[2] // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 + 9*O3 + 90*O4 + 25*O5
+ mls v28.4s, v21.4s, v2.s[1] // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 - 9*O4 - 87*O5
+ mla v29.4s, v21.4s, v2.s[3] // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5
+ mla v30.4s, v21.4s, v3.s[3] // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4 + 9*O5
+ mls v31.4s, v21.4s, v2.s[2] // v31 = [15] = 9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5
+
+1:
+ cbz x6, 1f // skip the O6/O7 terms when they are all zero
+
+ mla v24.4s, v22.4s, v3.s[2] // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6
+ mls v25.4s, v22.4s, v2.s[3] // v25 = [ 3] = 87*O0
+ 57*O1 + 9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6
+ mla v26.4s, v22.4s, v2.s[0] // v26 = [ 5] = 80*O0
+ 9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6
+ mls v27.4s, v22.4s, v2.s[2] // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 + 9*O3 + 90*O4 + 25*O5 - 80*O6
+ mla v28.4s, v22.4s, v3.s[1] // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 - 9*O4 - 87*O5 + 43*O6
+ mla v29.4s, v22.4s, v3.s[3] // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 + 9*O6
+ mls v30.4s, v22.4s, v3.s[0] // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4 + 9*O5 - 57*O6
+ mla v31.4s, v22.4s, v2.s[1] // v31 = [15] = 9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6
+
+ mla v24.4s, v23.4s, v3.s[3] // v24 = [ 1] = 90*O0
+ 87*O1 + 80*O2 + 70*O3 + 57*O4 + 43*O5 + 25*O6 + 9*O7
+ mls v25.4s, v23.4s, v3.s[2] // v25 = [ 3] = 87*O0
+ 57*O1 + 9*O2 - 43*O3 - 80*O4 - 90*O5 - 70*O6 - 25*O7
+ mla v26.4s, v23.4s, v3.s[1] // v26 = [ 5] = 80*O0
+ 9*O1 - 70*O2 - 87*O3 - 25*O4 + 57*O5 + 90*O6 + 43*O7
+ mls v27.4s, v23.4s, v3.s[0] // v27 = [ 7] = 70*O0
- 43*O1 - 87*O2 + 9*O3 + 90*O4 + 25*O5 - 80*O6 - 57*O7
+ mla v28.4s, v23.4s, v2.s[3] // v28 = [ 9] = 57*O0
- 80*O1 - 25*O2 + 90*O3 - 9*O4 - 87*O5 + 43*O6 + 70*O7
+ mls v29.4s, v23.4s, v2.s[2] // v29 = [11] = 43*O0
- 90*O1 + 57*O2 + 25*O3 - 87*O4 + 70*O5 + 9*O6 - 80*O7
+ mla v30.4s, v23.4s, v2.s[1] // v30 = [13] = 25*O0
- 70*O1 + 90*O2 - 80*O3 + 43*O4 + 9*O5 - 57*O6 + 87*O7
+ mls v31.4s, v23.4s, v2.s[0] // v31 = [15] = 9*O0
- 25*O1 + 43*O2 - 57*O3 + 70*O4 - 80*O5 + 87*O6 - 90*O7
+
+1:
+ sqrshrn v24.4h, v24.4s, #dct16_shift_2 // [1]
+ sqrshrn v25.4h, v25.4s, #dct16_shift_2 // [3]
+ sqrshrn v26.4h, v26.4s, #dct16_shift_2 // [5]
+ sqrshrn v27.4h, v27.4s, #dct16_shift_2 // [7]
+ sqrshrn v28.4h, v28.4s, #dct16_shift_2 // [9]
+ sqrshrn v29.4h, v29.4s, #dct16_shift_2 // [11]
+ sqrshrn v30.4h, v30.4s, #dct16_shift_2 // [13]
+ sqrshrn v31.4h, v31.4s, #dct16_shift_2 // [15]
+
+ str d24, [x5, #(16*2* 1)] // odd outputs overwrite the rows already loaded into registers
+ str d25, [x5, #(16*2* 3)]
+ str d26, [x5, #(16*2* 5)]
+ str d27, [x5, #(16*2* 7)]
+ str d28, [x5, #(16*2* 9)]
+ str d29, [x5, #(16*2*11)]
+ str d30, [x5, #(16*2*13)]
+ str d31, [x5, #(16*2*15)]
+
+// EE0 = E0 + E7;
+// EO0 = E0 - E7;
+// EE1 = E1 + E6;
+// EO1 = E1 - E6;
+// EE2 = E2 + E5;
+// EO2 = E2 - E5;
+// EE3 = E3 + E4;
+// EO3 = E3 - E4;
+ add v16.4s, v4.4s, v11.4s // v16 = EE0
+ sub v17.4s, v4.4s, v11.4s // v17 = EO0
+ add v18.4s, v5.4s, v10.4s // v18 = EE1
+ sub v19.4s, v5.4s, v10.4s // v19 = EO1
+ add v20.4s, v6.4s, v9.4s // v20 = EE2
+ sub v21.4s, v6.4s, v9.4s // v21 = EO2
+ add v22.4s, v7.4s, v8.4s // v22 = EE3
+ sub v23.4s, v7.4s, v8.4s // v23 = EO3
+
+// EEE0 = EE0 + EE3;
+// EEO0 = EE0 - EE3;
+// EEE1 = EE1 + EE2;
+// EEO1 = EE1 - EE2;
+
+ add v24.4s, v16.4s, v22.4s // v24 = EEE0
+ sub v25.4s, v16.4s, v22.4s // v25 = EEO0
+ add v26.4s, v18.4s, v20.4s // v26 = EEE1
+ sub v27.4s, v18.4s, v20.4s // v27 = EEO1
+
+ orr v28.16b, v21.16b, v23.16b
+ uqxtn v28.4h, v28.4s
+ mov x0, v28.d[0] // x0 = 0 iff EO2 and EO3 are zero in all 4 columns
+
+// [ 0] = (64*EEE0 + 64*EEE1 + rnd) >> nShift;
+// [ 4] = (83*EEO0 + 36*EEO1 + rnd) >> nShift;
+// [ 8] = (64*EEE0 - 64*EEE1 + rnd) >> nShift;
+// [12] = (36*EEO0 - 83*EEO1 + rnd) >> nShift;
+
+ add v28.4s, v24.4s, v26.4s // [ 0] = EEE0+EEE1
+ mul v29.4s, v25.4s, v0.s[1] // [ 4] = 83*EEO0
+ sub v30.4s, v24.4s, v26.4s // [ 8] = EEE0-EEE1
+ mul v31.4s, v25.4s, v0.s[2] // [12] = 36*EEO0
+
+ shl v28.4s, v28.4s, #6 // [ 0] = 64*EEE0 +
64*EEE1
+ mla v29.4s, v27.4s, v0.s[2] // [ 4] = 83*EEO0 +
36*EEO1
+ shl v30.4s, v30.4s, #6 // [ 8] = 64*EEE0 -
64*EEE1
+ mls v31.4s, v27.4s, v0.s[1] // [12] = 36*EEO0 -
83*EEO1
+
+ sqrshrn v28.4h, v28.4s, #dct16_shift_2 // [ 0]
+ sqrshrn v29.4h, v29.4s, #dct16_shift_2 // [ 4]
+ sqrshrn v30.4h, v30.4s, #dct16_shift_2 // [ 8]
+ sqrshrn v31.4h, v31.4s, #dct16_shift_2 // [12]
+
+ str d28, [x5, #(16*2* 0)]
+ str d29, [x5, #(16*2* 4)]
+ str d30, [x5, #(16*2* 8)]
+ str d31, [x5, #(16*2*12)]
+
+// [ 2] = (89*EO0 + 75*EO1 + 50*EO2 + 18*EO3 + rnd) >> nShift;
+// [ 6] = (75*EO0 - 18*EO1 - 89*EO2 - 50*EO3 + rnd) >> nShift;
+// [10] = (50*EO0 - 89*EO1 + 18*EO2 + 75*EO3 + rnd) >> nShift;
+// [14] = (18*EO0 - 50*EO1 + 75*EO2 - 89*EO3 + rnd) >> nShift;
+
+ mul v28.4s, v17.4s, v0.s[3] // [ 2] = 89*EO0
+ mul v29.4s, v17.4s, v1.s[0] // [ 6] = 75*EO0
+ mul v30.4s, v17.4s, v1.s[1] // [10] = 50*EO0
+ mul v31.4s, v17.4s, v1.s[2] // [14] = 18*EO0
+
+ mla v28.4s, v19.4s, v1.s[0] // [ 2] = 89*EO0 +
75*EO1
+ mls v29.4s, v19.4s, v1.s[2] // [ 6] = 75*EO0 -
18*EO1
+ mls v30.4s, v19.4s, v0.s[3] // [10] = 50*EO0 -
89*EO1
+ mls v31.4s, v19.4s, v1.s[1] // [14] = 18*EO0 -
50*EO1
+
+ cbz x0, 1f // skip the EO2/EO3 terms when they are all zero
+
+ mla v28.4s, v21.4s, v1.s[1] // [ 2] = 89*EO0 +
75*EO1 + 50*EO2
+ mls v29.4s, v21.4s, v0.s[3] // [ 6] = 75*EO0 -
18*EO1 - 89*EO2
+ mla v30.4s, v21.4s, v1.s[2] // [10] = 50*EO0 -
89*EO1 + 18*EO2
+ mla v31.4s, v21.4s, v1.s[0] // [14] = 18*EO0 -
50*EO1 + 75*EO2
+
+ mla v28.4s, v23.4s, v1.s[2] // [ 2] = 89*EO0 +
75*EO1 + 50*EO2 + 18*EO3
+ mls v29.4s, v23.4s, v1.s[1] // [ 6] = 75*EO0 -
18*EO1 - 89*EO2 - 50*EO3
+ mla v30.4s, v23.4s, v1.s[0] // [10] = 50*EO0 -
89*EO1 + 18*EO2 + 75*EO3
+ mls v31.4s, v23.4s, v0.s[3] // [14] = 18*EO0 -
50*EO1 + 75*EO2 - 89*EO3
+
+1:
+
+ sqrshrn v28.4h, v28.4s, #dct16_shift_2 // [ 2]
+ sqrshrn v29.4h, v29.4s, #dct16_shift_2 // [ 6]
+ sqrshrn v30.4h, v30.4s, #dct16_shift_2 // [10]
+ sqrshrn v31.4h, v31.4s, #dct16_shift_2 // [14]
+
+ str d28, [x5, #(16*2* 2)]
+ str d29, [x5, #(16*2* 6)]
+ str d30, [x5, #(16*2*10)]
+ str d31, [x5, #(16*2*14)]
+
+ add x5, x5, #(4*2) // advance to the next group of 4 columns
+ sub w4, w4, #4
+ cbnz w4, 6b
+
+9: // epilogue (this label is not referenced inside the function)
+ ldp d14, d15, [sp], #16 // restore d8-d15 in reverse order of the saves
+ ldp d12, d13, [sp], #16
+ ldp d10, d11, [sp], #16
+ ldp d8, d9, [sp], #16
+ ret
+endfunc
--
2.36.0.windows.1
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/0e1e5686/attachment-0001.htm>
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0005-AArch64-DCT16x16.patch
Type: application/octet-stream
Size: 24417 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240912/0e1e5686/attachment-0001.obj>
More information about the x265-devel
mailing list