[x265] [PATCH v2 6/8] AArch64: Refactor setup of optimised assembly primitives
Hari Limaye
hari.limaye at arm.com
Tue Jul 30 15:46:00 UTC 2024
Refactor the AArch64 assembly primitives setup functions so that each
one only adds the optimised primitives that use instructions specific
to its own architecture extension.
As well as removing duplication and deleting some now-unnecessary
macros, this change means the test harness now correctly tests each
architecture extension separately, and it simplifies the addition of
new architecture features.
---
source/common/aarch64/asm-primitives.cpp | 787 +----------------------
1 file changed, 13 insertions(+), 774 deletions(-)
diff --git a/source/common/aarch64/asm-primitives.cpp b/source/common/aarch64/asm-primitives.cpp
index fd6332786..b1e6b817b 100644
--- a/source/common/aarch64/asm-primitives.cpp
+++ b/source/common/aarch64/asm-primitives.cpp
@@ -39,15 +39,9 @@ extern "C" {
p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu)
-#define LUMA_TU_TYPED_NEON(prim, fncdef, fname) \
- p.cu[BLOCK_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.cu[BLOCK_8x8].prim = fncdef PFX(fname ## _8x8_ ## neon); \
- p.cu[BLOCK_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
- p.cu[BLOCK_64x64].prim = fncdef PFX(fname ## _64x64_ ## neon)
#define LUMA_TU_TYPED_CAN_USE_SVE(prim, fncdef, fname) \
p.cu[BLOCK_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve)
#define ALL_LUMA_TU(prim, fname, cpu) ALL_LUMA_TU_TYPED(prim, , fname, cpu)
-#define LUMA_TU_NEON(prim, fname) LUMA_TU_TYPED_NEON(prim, , fname)
#define LUMA_TU_CAN_USE_SVE(prim, fname) LUMA_TU_TYPED_CAN_USE_SVE(prim, , fname)
#define ALL_LUMA_PU_TYPED(prim, fncdef, fname, cpu) \
@@ -103,23 +97,6 @@ extern "C" {
p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
-#define LUMA_PU_TYPED_NEON_1(prim, fncdef, fname) \
- p.pu[LUMA_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.pu[LUMA_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.pu[LUMA_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon); \
- p.pu[LUMA_12x16].prim = fncdef PFX(fname ## _12x16_ ## neon); \
- p.pu[LUMA_8x8].prim = fncdef PFX(fname ## _8x8_ ## neon); \
- p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
- p.pu[LUMA_8x4].prim = fncdef PFX(fname ## _8x4_ ## neon); \
- p.pu[LUMA_16x8].prim = fncdef PFX(fname ## _16x8_ ## neon); \
- p.pu[LUMA_8x16].prim = fncdef PFX(fname ## _8x16_ ## neon); \
- p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
- p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
- p.pu[LUMA_16x4].prim = fncdef PFX(fname ## _16x4_ ## neon); \
- p.pu[LUMA_24x32].prim = fncdef PFX(fname ## _24x32_ ## neon); \
- p.pu[LUMA_8x32].prim = fncdef PFX(fname ## _8x32_ ## neon); \
- p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## neon); \
- p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon)
#define LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## sve); \
@@ -130,20 +107,6 @@ extern "C" {
p.pu[LUMA_32x8].prim = fncdef PFX(fname ## _32x8_ ## sve); \
p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## sve); \
p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## sve)
-#define LUMA_PU_TYPED_NEON_2(prim, fncdef, fname) \
- p.pu[LUMA_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.pu[LUMA_8x4].prim = fncdef PFX(fname ## _8x4_ ## neon); \
- p.pu[LUMA_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.pu[LUMA_8x8].prim = fncdef PFX(fname ## _8x8_ ## neon); \
- p.pu[LUMA_16x8].prim = fncdef PFX(fname ## _16x8_ ## neon); \
- p.pu[LUMA_8x16].prim = fncdef PFX(fname ## _8x16_ ## neon); \
- p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
- p.pu[LUMA_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
- p.pu[LUMA_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
- p.pu[LUMA_16x4].prim = fncdef PFX(fname ## _16x4_ ## neon); \
- p.pu[LUMA_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon); \
- p.pu[LUMA_8x32].prim = fncdef PFX(fname ## _8x32_ ## neon); \
- p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon)
#define LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, fncdef, fname, cpu) \
p.pu[LUMA_32x32].prim = fncdef PFX(fname ## _32x32_ ## cpu); \
p.pu[LUMA_64x64].prim = fncdef PFX(fname ## _64x64_ ## cpu); \
@@ -157,10 +120,6 @@ extern "C" {
p.pu[LUMA_64x48].prim = fncdef PFX(fname ## _64x48_ ## cpu); \
p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu)
-#define LUMA_PU_TYPED_NEON_3(prim, fncdef, fname) \
- p.pu[LUMA_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.pu[LUMA_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.pu[LUMA_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon)
#define LUMA_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
p.pu[LUMA_8x8].prim = fncdef PFX(fname ## _8x8_ ## sve2); \
p.pu[LUMA_16x16].prim = fncdef PFX(fname ## _16x16_ ## sve2); \
@@ -184,22 +143,6 @@ extern "C" {
p.pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## sve2); \
p.pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## sve2); \
p.pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## sve2)
-#define LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
- p.pu[LUMA_4x4].prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
- p.pu[LUMA_8x8].prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
- p.pu[LUMA_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
- p.pu[LUMA_8x4].prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
- p.pu[LUMA_4x8].prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
- p.pu[LUMA_16x8].prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
- p.pu[LUMA_8x16].prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
- p.pu[LUMA_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
- p.pu[LUMA_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
- p.pu[LUMA_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
- p.pu[LUMA_16x4].prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
- p.pu[LUMA_4x16].prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
- p.pu[LUMA_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
- p.pu[LUMA_8x32].prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
- p.pu[LUMA_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
#define LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
p.pu[LUMA_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
p.pu[LUMA_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
@@ -214,13 +157,9 @@ extern "C" {
#define ALL_LUMA_PU(prim, fname, cpu) ALL_LUMA_PU_TYPED(prim, , fname, cpu)
#define LUMA_PU_MULTIPLE_ARCHS_1(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_1(prim, , fname, cpu)
#define LUMA_PU_MULTIPLE_ARCHS_2(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_2(prim, , fname, cpu)
-#define LUMA_PU_NEON_1(prim, fname) LUMA_PU_TYPED_NEON_1(prim, , fname)
#define LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
-#define LUMA_PU_NEON_2(prim, fname) LUMA_PU_TYPED_NEON_2(prim, , fname)
#define LUMA_PU_MULTIPLE_ARCHS_3(prim, fname, cpu) LUMA_PU_TYPED_MULTIPLE_ARCHS_3(prim, , fname, cpu)
-#define LUMA_PU_NEON_3(prim, fname) LUMA_PU_TYPED_NEON_3(prim, , fname)
#define LUMA_PU_CAN_USE_SVE2(prim, fname) LUMA_PU_TYPED_CAN_USE_SVE2(prim, , fname)
-#define LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
#define LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) LUMA_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
@@ -276,37 +215,9 @@ extern "C" {
p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef PFX(fname ## _32x8_ ## cpu); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef PFX(fname ## _8x32_ ## cpu)
-#define CHROMA_420_PU_TYPED_NEON_1(prim, fncdef, fname) \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim = fncdef PFX(fname ## _4x2_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_6x8].prim = fncdef PFX(fname ## _6x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(fname ## _12x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(fname ## _32x24_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef PFX(fname ## _32x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef PFX(fname ## _8x32_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef PFX(fname ## _8x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim = fncdef PFX(fname ## _2x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim = fncdef PFX(fname ## _8x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim = fncdef PFX(fname ## _16x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim = fncdef PFX(fname ## _8x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim = fncdef PFX(fname ## _8x6_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim = fncdef PFX(fname ## _8x2_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim = fncdef PFX(fname ## _2x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(fname ## _16x12_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim = fncdef PFX(fname ## _16x4_ ## neon)
#define CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve)
-#define CHROMA_420_PU_TYPED_NEON_2(prim, fncdef, fname) \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x2].prim = fncdef PFX(fname ## _4x2_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon)
#define CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, fncdef, fname, cpu) \
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef PFX(fname ## _8x8_ ## cpu); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(fname ## _16x16_ ## cpu); \
@@ -328,23 +239,6 @@ extern "C" {
p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(fname ## _24x32_ ## cpu); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef PFX(fname ## _32x8_ ## cpu); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef PFX(fname ## _8x32_ ## cpu)
-#define CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, fncdef) \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x6].prim = fncdef PFX(filterPixelToShort ## _8x6_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x2].prim = fncdef PFX(filterPixelToShort ## _8x2_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon)
#define CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x4].prim = fncdef PFX(filterPixelToShort ## _2x4_ ## sve); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_2x8].prim = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
@@ -355,11 +249,8 @@ extern "C" {
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].prim = fncdef PFX(filterPixelToShort ## _32x24_ ## sve); \
p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].prim = fncdef PFX(filterPixelToShort ## _32x8_ ## sve)
#define ALL_CHROMA_420_PU(prim, fname, cpu) ALL_CHROMA_420_PU_TYPED(prim, , fname, cpu)
-#define CHROMA_420_PU_NEON_1(prim, fname) CHROMA_420_PU_TYPED_NEON_1(prim, , fname)
#define CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_420_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
-#define CHROMA_420_PU_NEON_2(prim, fname) CHROMA_420_PU_TYPED_NEON_2(prim, , fname)
#define CHROMA_420_PU_MULTIPLE_ARCHS(prim, fname, cpu) CHROMA_420_PU_TYPED_MULTIPLE_ARCHS(prim, , fname, cpu)
-#define CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(prim) CHROMA_420_PU_TYPED_FILTER_PIXEL_TO_SHORT_NEON(prim, )
#define CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_420_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
@@ -411,37 +302,11 @@ extern "C" {
p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## cpu); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## cpu); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim = fncdef PFX(fname ## _8x64_ ## cpu)
-#define CHROMA_422_PU_TYPED_NEON_1(prim, fncdef, fname) \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_6x16].prim = fncdef PFX(fname ## _6x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(fname ## _12x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim = fncdef PFX(fname ## _4x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim = fncdef PFX(fname ## _8x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim = fncdef PFX(fname ## _2x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim = fncdef PFX(fname ## _8x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(fname ## _16x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim = fncdef PFX(fname ## _8x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(fname ## _16x64_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim = fncdef PFX(fname ## _8x12_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim = fncdef PFX(fname ## _8x4_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim = fncdef PFX(fname ## _2x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(fname ## _16x24_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim = fncdef PFX(fname ## _16x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim = fncdef PFX(fname ## _8x64_ ## neon)
#define CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fncdef, fname) \
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].prim = fncdef PFX(fname ## _32x64_ ## sve); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].prim = fncdef PFX(fname ## _32x32_ ## sve); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].prim = fncdef PFX(fname ## _32x48_ ## sve); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve)
-#define CHROMA_422_PU_TYPED_NEON_2(prim, fncdef, fname) \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim = fncdef PFX(fname ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim = fncdef PFX(fname ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim = fncdef PFX(fname ## _4x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim = fncdef PFX(fname ## _4x32_ ## neon)
#define CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, fncdef, fname) \
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim = fncdef PFX(fname ## _8x16_ ## sve2); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(fname ## _16x32_ ## sve2); \
@@ -463,24 +328,6 @@ extern "C" {
p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(fname ## _24x64_ ## sve2); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].prim = fncdef PFX(fname ## _32x16_ ## sve2); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim = fncdef PFX(fname ## _8x64_ ## sve2)
-#define CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].prim = fncdef PFX(filterPixelToShort ## _8x12_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].prim = fncdef PFX(filterPixelToShort ## _16x24_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].prim = fncdef PFX(filterPixelToShort ## _12x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].prim = fncdef PFX(filterPixelToShort ## _4x32_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].prim = fncdef PFX(filterPixelToShort ## _24x64_ ## neon); \
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].prim = fncdef PFX(filterPixelToShort ## _8x64_ ## neon)
#define CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x8].prim = fncdef PFX(filterPixelToShort ## _2x8_ ## sve); \
p.chroma[X265_CSP_I422].pu[CHROMA_422_2x16].prim = fncdef PFX(filterPixelToShort ## _2x16_ ## sve); \
@@ -492,9 +339,7 @@ extern "C" {
#define ALL_CHROMA_422_PU(prim, fname, cpu) ALL_CHROMA_422_PU_TYPED(prim, , fname, cpu)
#define CHROMA_422_PU_NEON_1(prim, fname) CHROMA_422_PU_TYPED_NEON_1(prim, , fname)
#define CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(prim, , fname)
-#define CHROMA_422_PU_NEON_2(prim, fname) CHROMA_422_PU_TYPED_NEON_2(prim, , fname)
#define CHROMA_422_PU_CAN_USE_SVE2(prim, fname) CHROMA_422_PU_TYPED_CAN_USE_SVE2(prim, , fname)
-#define CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
#define CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_422_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
#define ALL_CHROMA_444_PU_TYPED(prim, fncdef, fname, cpu) \
@@ -523,22 +368,6 @@ extern "C" {
p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef PFX(fname ## _48x64_ ## cpu); \
p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef PFX(fname ## _64x16_ ## cpu); \
p.chroma[X265_CSP_I444].pu[LUMA_16x64].prim = fncdef PFX(fname ## _16x64_ ## cpu)
-#define CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
- p.chroma[X265_CSP_I444].pu[LUMA_4x4].prim = fncdef PFX(filterPixelToShort ## _4x4_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_8x8].prim = fncdef PFX(filterPixelToShort ## _8x8_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_16x16].prim = fncdef PFX(filterPixelToShort ## _16x16_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_8x4].prim = fncdef PFX(filterPixelToShort ## _8x4_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_4x8].prim = fncdef PFX(filterPixelToShort ## _4x8_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_16x8].prim = fncdef PFX(filterPixelToShort ## _16x8_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_8x16].prim = fncdef PFX(filterPixelToShort ## _8x16_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_16x32].prim = fncdef PFX(filterPixelToShort ## _16x32_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_16x12].prim = fncdef PFX(filterPixelToShort ## _16x12_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_12x16].prim = fncdef PFX(filterPixelToShort ## _12x16_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_16x4].prim = fncdef PFX(filterPixelToShort ## _16x4_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_4x16].prim = fncdef PFX(filterPixelToShort ## _4x16_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_24x32].prim = fncdef PFX(filterPixelToShort ## _24x32_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_8x32].prim = fncdef PFX(filterPixelToShort ## _8x32_ ## neon); \
- p.chroma[X265_CSP_I444].pu[LUMA_16x64].prim = fncdef PFX(filterPixelToShort ## _16x64_ ## neon)
#define CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, fncdef) \
p.chroma[X265_CSP_I444].pu[LUMA_32x32].prim = fncdef PFX(filterPixelToShort ## _32x32_ ## sve); \
p.chroma[X265_CSP_I444].pu[LUMA_32x16].prim = fncdef PFX(filterPixelToShort ## _32x16_ ## sve); \
@@ -551,7 +380,6 @@ extern "C" {
p.chroma[X265_CSP_I444].pu[LUMA_64x16].prim = fncdef PFX(filterPixelToShort ## _64x16_ ## sve); \
p.chroma[X265_CSP_I444].pu[LUMA_48x64].prim = fncdef PFX(filterPixelToShort ## _48x64_ ## sve)
#define ALL_CHROMA_444_PU(prim, fname, cpu) ALL_CHROMA_444_PU_TYPED(prim, , fname, cpu)
-#define CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_NEON_FILTER_PIXEL_TO_SHORT(prim, )
#define CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(prim) CHROMA_444_PU_TYPED_SVE_FILTER_PIXEL_TO_SHORT(prim, )
#define ALL_CHROMA_420_VERT_FILTERS(cpu) \
@@ -560,9 +388,6 @@ extern "C" {
ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, cpu); \
ALL_CHROMA_420_4x4_PU(filter_vss, interp_4tap_vert_ss, cpu)
-#define CHROMA_420_VERT_FILTERS_NEON() \
- ALL_CHROMA_420_4x4_PU(filter_vsp, interp_4tap_vert_sp, neon)
-
#define CHROMA_420_VERT_FILTERS_CAN_USE_SVE2() \
ALL_CHROMA_420_4x4_PU(filter_vpp, interp_4tap_vert_pp, sve2); \
ALL_CHROMA_420_4x4_PU(filter_vps, interp_4tap_vert_ps, sve2); \
@@ -1078,73 +903,33 @@ void setupNeonPrimitives(EncoderPrimitives &p)
#if defined(HAVE_SVE2) || defined(HAVE_SVE)
void setupSvePrimitives(EncoderPrimitives &p)
{
- CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2s[NONALIGNED]);
CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2s[ALIGNED]);
LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[ALIGNED]);
- CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2s[ALIGNED]);
CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
#if !HIGH_BIT_DEPTH
- ALL_LUMA_PU(luma_vpp, interp_8tap_vert_pp, neon);
- ALL_LUMA_PU(luma_vsp, interp_8tap_vert_sp, neon);
- ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, neon);
- ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
- ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
- ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, neon);
- ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
- ALL_CHROMA_420_VERT_FILTERS(neon);
- CHROMA_422_VERT_FILTERS_NEON();
- CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(neon);
- ALL_CHROMA_444_VERT_FILTERS(neon);
- ALL_CHROMA_420_FILTERS(neon);
- ALL_CHROMA_422_FILTERS(neon);
- ALL_CHROMA_444_FILTERS(neon);
-
-
// Blockcopy_pp
- LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- p.cu[BLOCK_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
- p.cu[BLOCK_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
- p.cu[BLOCK_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_sve);
p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_sve);
#endif // !HIGH_BIT_DEPTH
// Blockcopy_ss
- p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
- p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_sve);
// Blockcopy_ps
- p.cu[BLOCK_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
- p.cu[BLOCK_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_sve);
p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_sve);
p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_sve);
@@ -1154,21 +939,14 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
- p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
// chroma blockcopy_ss
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss = PFX(blockcopy_ss_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss = PFX(blockcopy_ss_8x16_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_sve);
// chroma blockcopy_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_sve);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_sve);
p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps = PFX(blockcopy_ps_4x8_sve);
@@ -1187,510 +965,97 @@ void setupSvePrimitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_sve);
// Block_fill
- LUMA_TU_NEON(blockfill_s[ALIGNED], blockfill_s);
LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
- LUMA_TU_NEON(blockfill_s[NONALIGNED], blockfill_s);
LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
- // copy_count
- p.cu[BLOCK_4x4].copy_cnt = PFX(copy_cnt_4_neon);
- p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_neon);
- p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_neon);
- p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_neon);
-
- // count nonzero
- p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);
- p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);
- p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
- p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
-
// cpy2Dto1D_shl
- p.cu[BLOCK_4x4].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
// cpy2Dto1D_shr
- p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
- p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
// cpy1Dto2D_shl
- p.cu[BLOCK_4x4].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_16x16_sve);
p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_32x32_sve);
p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_64x64_sve);
- p.cu[BLOCK_4x4].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_sve);
p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_sve);
p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_sve);
// cpy1Dto2D_shr
- p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
#if !HIGH_BIT_DEPTH
- // pixel_avg_pp
- ALL_LUMA_PU(pixelavg_pp[NONALIGNED], pixel_avg_pp, neon);
- ALL_LUMA_PU(pixelavg_pp[ALIGNED], pixel_avg_pp, neon);
-
- // addAvg
- ALL_LUMA_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_LUMA_PU(addAvg[ALIGNED], addAvg, neon);
- ALL_CHROMA_420_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_CHROMA_422_PU(addAvg[NONALIGNED], addAvg, neon);
- ALL_CHROMA_420_PU(addAvg[ALIGNED], addAvg, neon);
- ALL_CHROMA_422_PU(addAvg[ALIGNED], addAvg, neon);
-
- // sad
- ALL_LUMA_PU(sad, pixel_sad, neon);
- ALL_LUMA_PU(sad_x3, sad_x3, neon);
- ALL_LUMA_PU(sad_x4, sad_x4, neon);
-
// sse_pp
p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
- p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
- p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
- p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
- p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_sse_pp_32x32_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = PFX(pixel_sse_pp_4x8_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_neon);
-
- // sse_ss
- p.cu[BLOCK_4x4].sse_ss = PFX(pixel_sse_ss_4x4_neon);
- p.cu[BLOCK_8x8].sse_ss = PFX(pixel_sse_ss_8x8_neon);
- p.cu[BLOCK_16x16].sse_ss = PFX(pixel_sse_ss_16x16_neon);
- p.cu[BLOCK_32x32].sse_ss = PFX(pixel_sse_ss_32x32_neon);
- p.cu[BLOCK_64x64].sse_ss = PFX(pixel_sse_ss_64x64_neon);
-
- // ssd_s
- p.cu[BLOCK_4x4].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_4x4_neon);
- p.cu[BLOCK_8x8].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_8x8_neon);
- p.cu[BLOCK_16x16].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_16x16_neon);
- p.cu[BLOCK_32x32].ssd_s[NONALIGNED] = PFX(pixel_ssd_s_32x32_neon);
-
- p.cu[BLOCK_4x4].ssd_s[ALIGNED] = PFX(pixel_ssd_s_4x4_neon);
- p.cu[BLOCK_8x8].ssd_s[ALIGNED] = PFX(pixel_ssd_s_8x8_neon);
- p.cu[BLOCK_16x16].ssd_s[ALIGNED] = PFX(pixel_ssd_s_16x16_neon);
- p.cu[BLOCK_32x32].ssd_s[ALIGNED] = PFX(pixel_ssd_s_32x32_neon);
-
- // pixel_var
- p.cu[BLOCK_8x8].var = PFX(pixel_var_8x8_neon);
- p.cu[BLOCK_16x16].var = PFX(pixel_var_16x16_neon);
- p.cu[BLOCK_32x32].var = PFX(pixel_var_32x32_neon);
- p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_neon);
-
- // calc_Residual
- p.cu[BLOCK_4x4].calcresidual[NONALIGNED] = PFX(getResidual4_neon);
- p.cu[BLOCK_8x8].calcresidual[NONALIGNED] = PFX(getResidual8_neon);
- p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_neon);
- p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_neon);
-
- p.cu[BLOCK_4x4].calcresidual[ALIGNED] = PFX(getResidual4_neon);
- p.cu[BLOCK_8x8].calcresidual[ALIGNED] = PFX(getResidual8_neon);
- p.cu[BLOCK_16x16].calcresidual[ALIGNED] = PFX(getResidual16_neon);
- p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual32_neon);
-
- // pixel_sub_ps
- p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
- p.cu[BLOCK_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
- p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
- p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
- p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_neon);
- // chroma sub_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps = PFX(pixel_sub_ps_4x8_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_neon);
-
- // pixel_add_ps
- p.cu[BLOCK_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.cu[BLOCK_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.cu[BLOCK_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.cu[BLOCK_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.cu[BLOCK_64x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_64x64_neon);
-
- p.cu[BLOCK_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.cu[BLOCK_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.cu[BLOCK_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.cu[BLOCK_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.cu[BLOCK_64x64].add_ps[ALIGNED] = PFX(pixel_add_ps_64x64_neon);
-
- // chroma add_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[NONALIGNED] = PFX(pixel_add_ps_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[NONALIGNED] = PFX(pixel_add_ps_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[NONALIGNED] = PFX(pixel_add_ps_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[NONALIGNED] = PFX(pixel_add_ps_32x64_neon);
-
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].add_ps[ALIGNED] = PFX(pixel_add_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].add_ps[ALIGNED] = PFX(pixel_add_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].add_ps[ALIGNED] = PFX(pixel_add_ps_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].add_ps[ALIGNED] = PFX(pixel_add_ps_32x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].add_ps[ALIGNED] = PFX(pixel_add_ps_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].add_ps[ALIGNED] = PFX(pixel_add_ps_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_neon);
-
- //scale2D_64to32
- p.scale2D_64to32 = PFX(scale2D_64to32_neon);
-
- // scale1D_128to64
- p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_neon);
- p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_neon);
-
- // planecopy
- p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
// satd
p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_neon);
p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sve);
- p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_neon);
p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_neon);
p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sve);
- p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_neon);
- p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_neon);
- p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_neon);
- p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
- p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_neon);
- p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_neon);
- p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_neon);
- p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_neon);
- p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_neon);
p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sve);
- p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_neon);
- p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_neon);
- p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_neon);
p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_sve);
p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_neon);
// sa8d
p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sve);
- p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_neon);
- p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-
- // dequant_scaling
- p.dequant_scaling = PFX(dequant_scaling_neon);
- p.dequant_normal = PFX(dequant_normal_neon);
-
- // ssim_4x4x2_core
- p.ssim_4x4x2_core = PFX(ssim_4x4x2_core_neon);
-
- // ssimDist
- p.cu[BLOCK_4x4].ssimDist = PFX(ssimDist4_neon);
- p.cu[BLOCK_8x8].ssimDist = PFX(ssimDist8_neon);
- p.cu[BLOCK_16x16].ssimDist = PFX(ssimDist16_neon);
- p.cu[BLOCK_32x32].ssimDist = PFX(ssimDist32_neon);
- p.cu[BLOCK_64x64].ssimDist = PFX(ssimDist64_neon);
-
- // normFact
- p.cu[BLOCK_8x8].normFact = PFX(normFact8_neon);
- p.cu[BLOCK_16x16].normFact = PFX(normFact16_neon);
- p.cu[BLOCK_32x32].normFact = PFX(normFact32_neon);
- p.cu[BLOCK_64x64].normFact = PFX(normFact64_neon);
-
- // psy_cost_pp
- p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
-
- p.weight_pp = PFX(weight_pp_neon);
-#if !defined(__APPLE__)
- p.scanPosLast = PFX(scanPosLast_neon);
-#endif
- p.costCoeffNxN = PFX(costCoeffNxN_neon);
#endif
// quant
p.quant = PFX(quant_sve);
- p.nquant = PFX(nquant_neon);
}
#endif
#if defined(HAVE_SVE2)
+#if !HIGH_BIT_DEPTH
void setupSve2Primitives(EncoderPrimitives &p)
{
- CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2s[NONALIGNED]);
- CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2s[ALIGNED]);
- LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[ALIGNED]);
- CHROMA_420_PU_FILTER_PIXEL_TO_SHORT_NEON(p2s[ALIGNED]);
- CHROMA_420_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[ALIGNED]);
- CHROMA_422_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- CHROMA_422_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- CHROMA_444_PU_NEON_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- CHROMA_444_PU_SVE_FILTER_PIXEL_TO_SHORT(p2s[NONALIGNED]);
- LUMA_PU_NEON_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
- LUMA_PU_SVE_FILTER_PIXEL_TO_SHORT(convert_p2s[NONALIGNED]);
-
-#if !HIGH_BIT_DEPTH
- LUMA_PU_MULTIPLE_ARCHS_1(luma_vpp, interp_8tap_vert_pp, neon);
LUMA_PU_MULTIPLE_ARCHS_2(luma_vpp, interp_8tap_vert_pp, sve2);
LUMA_PU_MULTIPLE_ARCHS_1(luma_vsp, interp_8tap_vert_sp, sve2);
- LUMA_PU_MULTIPLE_ARCHS_2(luma_vsp, interp_8tap_vert_sp, neon);
ALL_LUMA_PU(luma_vps, interp_8tap_vert_ps, sve2);
- ALL_LUMA_PU(luma_hpp, interp_horiz_pp, neon);
- ALL_LUMA_PU(luma_hps, interp_horiz_ps, neon);
ALL_LUMA_PU(luma_vss, interp_8tap_vert_ss, sve2);
- ALL_LUMA_PU_T(luma_hvpp, interp_8tap_hv_pp_cpu);
- CHROMA_420_VERT_FILTERS_NEON();
CHROMA_420_VERT_FILTERS_CAN_USE_SVE2();
- CHROMA_422_VERT_FILTERS_NEON();
CHROMA_422_VERT_FILTERS_CAN_USE_SVE2(sve2);
- CHROMA_444_VERT_FILTERS_NEON();
CHROMA_444_VERT_FILTERS_CAN_USE_SVE2();
- CHROMA_420_FILTERS_NEON();
CHROMA_420_FILTERS_CAN_USE_SVE2();
- CHROMA_422_FILTERS_NEON();
CHROMA_422_FILTERS_CAN_USE_SVE2();
- CHROMA_444_FILTERS_NEON();
CHROMA_444_FILTERS_CAN_USE_SVE2();
- // Blockcopy_pp
- LUMA_PU_NEON_1(copy_pp, blockcopy_pp);
- LUMA_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- CHROMA_420_PU_NEON_1(copy_pp, blockcopy_pp);
- CHROMA_420_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- CHROMA_422_PU_NEON_1(copy_pp, blockcopy_pp);
- CHROMA_422_PU_CAN_USE_SVE_EXCEPT_FILTER_PIXEL_TO_SHORT(copy_pp, blockcopy_pp);
- p.cu[BLOCK_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
- p.cu[BLOCK_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
- p.cu[BLOCK_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
- p.cu[BLOCK_32x32].copy_pp = PFX(blockcopy_pp_32x32_sve);
- p.cu[BLOCK_64x64].copy_pp = PFX(blockcopy_pp_64x64_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_pp = PFX(blockcopy_pp_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_pp = PFX(blockcopy_pp_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_pp = PFX(blockcopy_pp_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_pp = PFX(blockcopy_pp_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_pp = PFX(blockcopy_pp_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_pp = PFX(blockcopy_pp_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_pp = PFX(blockcopy_pp_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_pp = PFX(blockcopy_pp_32x64_sve);
-
-#endif // !HIGH_BIT_DEPTH
-
- // Blockcopy_ss
- p.cu[BLOCK_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
- p.cu[BLOCK_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
- p.cu[BLOCK_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
- p.cu[BLOCK_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
- p.cu[BLOCK_64x64].copy_ss = PFX(blockcopy_ss_64x64_sve);
-
- // Blockcopy_ps
- p.cu[BLOCK_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
- p.cu[BLOCK_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
- p.cu[BLOCK_16x16].copy_ps = PFX(blockcopy_ps_16x16_sve);
- p.cu[BLOCK_32x32].copy_ps = PFX(blockcopy_ps_32x32_sve);
- p.cu[BLOCK_64x64].copy_ps = PFX(blockcopy_ps_64x64_sve);
-
- // Blockcopy_sp
- p.cu[BLOCK_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
- p.cu[BLOCK_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
- p.cu[BLOCK_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
- p.cu[BLOCK_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
- p.cu[BLOCK_64x64].copy_sp = PFX(blockcopy_sp_64x64_neon);
-
- // chroma blockcopy_ss
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ss = PFX(blockcopy_ss_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ss = PFX(blockcopy_ss_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ss = PFX(blockcopy_ss_16x16_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ss = PFX(blockcopy_ss_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ss = PFX(blockcopy_ss_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ss = PFX(blockcopy_ss_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ss = PFX(blockcopy_ss_16x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ss = PFX(blockcopy_ss_32x64_sve);
-
- // chroma blockcopy_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_ps = PFX(blockcopy_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_ps = PFX(blockcopy_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_ps = PFX(blockcopy_ps_16x16_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_ps = PFX(blockcopy_ps_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_ps = PFX(blockcopy_ps_4x8_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_ps = PFX(blockcopy_ps_8x16_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_ps = PFX(blockcopy_ps_16x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_ps = PFX(blockcopy_ps_32x64_sve);
-
- // chroma blockcopy_sp
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].copy_sp = PFX(blockcopy_sp_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].copy_sp = PFX(blockcopy_sp_8x8_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].copy_sp = PFX(blockcopy_sp_16x16_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].copy_sp = PFX(blockcopy_sp_32x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].copy_sp = PFX(blockcopy_sp_4x8_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].copy_sp = PFX(blockcopy_sp_8x16_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].copy_sp = PFX(blockcopy_sp_16x32_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].copy_sp = PFX(blockcopy_sp_32x64_sve);
-
- // Block_fill
- LUMA_TU_NEON(blockfill_s[ALIGNED], blockfill_s);
- LUMA_TU_CAN_USE_SVE(blockfill_s[ALIGNED], blockfill_s);
- LUMA_TU_NEON(blockfill_s[NONALIGNED], blockfill_s);
- LUMA_TU_CAN_USE_SVE(blockfill_s[NONALIGNED], blockfill_s);
-
- // copy_count
- p.cu[BLOCK_4x4].copy_cnt = PFX(copy_cnt_4_neon);
- p.cu[BLOCK_8x8].copy_cnt = PFX(copy_cnt_8_neon);
- p.cu[BLOCK_16x16].copy_cnt = PFX(copy_cnt_16_neon);
- p.cu[BLOCK_32x32].copy_cnt = PFX(copy_cnt_32_neon);
-
- // count nonzero
- p.cu[BLOCK_4x4].count_nonzero = PFX(count_nonzero_4_neon);
- p.cu[BLOCK_8x8].count_nonzero = PFX(count_nonzero_8_neon);
- p.cu[BLOCK_16x16].count_nonzero = PFX(count_nonzero_16_neon);
- p.cu[BLOCK_32x32].count_nonzero = PFX(count_nonzero_32_neon);
-
- // cpy2Dto1D_shl
- p.cu[BLOCK_4x4].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_8x8_neon);
- p.cu[BLOCK_16x16].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_16x16_sve);
- p.cu[BLOCK_32x32].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_32x32_sve);
- p.cu[BLOCK_64x64].cpy2Dto1D_shl = PFX(cpy2Dto1D_shl_64x64_sve);
-
- // cpy2Dto1D_shr
- p.cu[BLOCK_4x4].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_4x4_neon);
- p.cu[BLOCK_8x8].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_8x8_neon);
- p.cu[BLOCK_16x16].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_16x16_sve);
- p.cu[BLOCK_32x32].cpy2Dto1D_shr = PFX(cpy2Dto1D_shr_32x32_sve);
-
- // cpy1Dto2D_shl
- p.cu[BLOCK_4x4].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
- p.cu[BLOCK_16x16].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_16x16_sve);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_32x32_sve);
- p.cu[BLOCK_64x64].cpy1Dto2D_shl[ALIGNED] = PFX(cpy1Dto2D_shl_64x64_sve);
-
- p.cu[BLOCK_4x4].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_8x8_neon);
- p.cu[BLOCK_16x16].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_16x16_sve);
- p.cu[BLOCK_32x32].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_32x32_sve);
- p.cu[BLOCK_64x64].cpy1Dto2D_shl[NONALIGNED] = PFX(cpy1Dto2D_shl_64x64_sve);
-
- // cpy1Dto2D_shr
- p.cu[BLOCK_4x4].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_4x4_neon);
- p.cu[BLOCK_8x8].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_8x8_neon);
- p.cu[BLOCK_16x16].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_16x16_sve);
- p.cu[BLOCK_32x32].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_32x32_sve);
- p.cu[BLOCK_64x64].cpy1Dto2D_shr = PFX(cpy1Dto2D_shr_64x64_sve);
-
-#if !HIGH_BIT_DEPTH
// pixel_avg_pp
- LUMA_PU_NEON_2(pixelavg_pp[NONALIGNED], pixel_avg_pp);
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[NONALIGNED], pixel_avg_pp, sve2);
- LUMA_PU_NEON_2(pixelavg_pp[ALIGNED], pixel_avg_pp);
LUMA_PU_MULTIPLE_ARCHS_3(pixelavg_pp[ALIGNED], pixel_avg_pp, sve2);
// addAvg
- LUMA_PU_NEON_3(addAvg[NONALIGNED], addAvg);
LUMA_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
- LUMA_PU_NEON_3(addAvg[ALIGNED], addAvg);
LUMA_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- CHROMA_420_PU_NEON_2(addAvg[NONALIGNED], addAvg);
CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[NONALIGNED], addAvg, sve2);
- CHROMA_420_PU_NEON_2(addAvg[ALIGNED], addAvg);
CHROMA_420_PU_MULTIPLE_ARCHS(addAvg[ALIGNED], addAvg, sve2);
- CHROMA_422_PU_NEON_2(addAvg[NONALIGNED], addAvg);
CHROMA_422_PU_CAN_USE_SVE2(addAvg[NONALIGNED], addAvg);
- CHROMA_422_PU_NEON_2(addAvg[ALIGNED], addAvg);
CHROMA_422_PU_CAN_USE_SVE2(addAvg[ALIGNED], addAvg);
- // sad
- ALL_LUMA_PU(sad, pixel_sad, neon);
- ALL_LUMA_PU(sad_x3, sad_x3, neon);
- ALL_LUMA_PU(sad_x4, sad_x4, neon);
-
// sse_pp
- p.cu[BLOCK_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
- p.cu[BLOCK_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
- p.cu[BLOCK_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
p.cu[BLOCK_32x32].sse_pp = PFX(pixel_sse_pp_32x32_sve2);
p.cu[BLOCK_64x64].sse_pp = PFX(pixel_sse_pp_64x64_sve2);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sse_pp = PFX(pixel_sse_pp_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sse_pp = PFX(pixel_sse_pp_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sse_pp = PFX(pixel_sse_pp_16x16_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sse_pp = PFX(pixel_sse_pp_32x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sse_pp = PFX(pixel_sse_pp_4x8_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sse_pp = PFX(pixel_sse_pp_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sse_pp = PFX(pixel_sse_pp_16x32_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sse_pp = PFX(pixel_sse_pp_32x64_sve2);
// sse_ss
@@ -1718,31 +1083,18 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.cu[BLOCK_64x64].var = PFX(pixel_var_64x64_sve2);
// calc_Residual
- p.cu[BLOCK_4x4].calcresidual[NONALIGNED] = PFX(getResidual4_neon);
- p.cu[BLOCK_8x8].calcresidual[NONALIGNED] = PFX(getResidual8_neon);
p.cu[BLOCK_16x16].calcresidual[NONALIGNED] = PFX(getResidual16_sve2);
p.cu[BLOCK_32x32].calcresidual[NONALIGNED] = PFX(getResidual32_sve2);
- p.cu[BLOCK_4x4].calcresidual[ALIGNED] = PFX(getResidual4_neon);
- p.cu[BLOCK_8x8].calcresidual[ALIGNED] = PFX(getResidual8_neon);
p.cu[BLOCK_16x16].calcresidual[ALIGNED] = PFX(getResidual16_sve2);
p.cu[BLOCK_32x32].calcresidual[ALIGNED] = PFX(getResidual32_sve2);
// pixel_sub_ps
- p.cu[BLOCK_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
- p.cu[BLOCK_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
- p.cu[BLOCK_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
p.cu[BLOCK_32x32].sub_ps = PFX(pixel_sub_ps_32x32_sve2);
p.cu[BLOCK_64x64].sub_ps = PFX(pixel_sub_ps_64x64_sve2);
// chroma sub_ps
- p.chroma[X265_CSP_I420].cu[BLOCK_420_4x4].sub_ps = PFX(pixel_sub_ps_4x4_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_8x8].sub_ps = PFX(pixel_sub_ps_8x8_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_420_16x16].sub_ps = PFX(pixel_sub_ps_16x16_neon);
p.chroma[X265_CSP_I420].cu[BLOCK_420_32x32].sub_ps = PFX(pixel_sub_ps_32x32_sve2);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_4x8].sub_ps = PFX(pixel_sub_ps_4x8_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sub_ps = PFX(pixel_sub_ps_8x16_sve);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sub_ps = PFX(pixel_sub_ps_16x32_neon);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sub_ps = PFX(pixel_sub_ps_32x64_sve2);
// pixel_add_ps
@@ -1777,98 +1129,10 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].add_ps[ALIGNED] = PFX(pixel_add_ps_16x32_sve2);
p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].add_ps[ALIGNED] = PFX(pixel_add_ps_32x64_sve2);
- //scale2D_64to32
- p.scale2D_64to32 = PFX(scale2D_64to32_neon);
-
// scale1D_128to64
p.scale1D_128to64[NONALIGNED] = PFX(scale1D_128to64_sve2);
p.scale1D_128to64[ALIGNED] = PFX(scale1D_128to64_sve2);
- // planecopy
- p.planecopy_cp = PFX(pixel_planecopy_cp_neon);
-
- // satd
- p.pu[LUMA_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.pu[LUMA_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.pu[LUMA_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.pu[LUMA_32x32].satd = PFX(pixel_satd_32x32_sve);
- p.pu[LUMA_64x64].satd = PFX(pixel_satd_64x64_neon);
- p.pu[LUMA_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.pu[LUMA_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.pu[LUMA_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.pu[LUMA_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.pu[LUMA_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.pu[LUMA_32x16].satd = PFX(pixel_satd_32x16_sve);
- p.pu[LUMA_64x32].satd = PFX(pixel_satd_64x32_neon);
- p.pu[LUMA_32x64].satd = PFX(pixel_satd_32x64_neon);
- p.pu[LUMA_16x12].satd = PFX(pixel_satd_16x12_neon);
- p.pu[LUMA_12x16].satd = PFX(pixel_satd_12x16_neon);
- p.pu[LUMA_16x4].satd = PFX(pixel_satd_16x4_neon);
- p.pu[LUMA_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.pu[LUMA_32x24].satd = PFX(pixel_satd_32x24_neon);
- p.pu[LUMA_24x32].satd = PFX(pixel_satd_24x32_neon);
- p.pu[LUMA_32x8].satd = PFX(pixel_satd_32x8_neon);
- p.pu[LUMA_8x32].satd = PFX(pixel_satd_8x32_neon);
- p.pu[LUMA_64x48].satd = PFX(pixel_satd_64x48_sve);
- p.pu[LUMA_48x64].satd = PFX(pixel_satd_48x64_neon);
- p.pu[LUMA_64x16].satd = PFX(pixel_satd_64x16_neon);
- p.pu[LUMA_16x64].satd = PFX(pixel_satd_16x64_neon);
-
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x12].satd = PFX(pixel_satd_16x12_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_12x16].satd = PFX(pixel_satd_12x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_16x4].satd = PFX(pixel_satd_16x4_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x24].satd = PFX(pixel_satd_32x24_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_24x32].satd = PFX(pixel_satd_24x32_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_32x8].satd = PFX(pixel_satd_32x8_neon);
- p.chroma[X265_CSP_I420].pu[CHROMA_420_8x32].satd = PFX(pixel_satd_8x32_neon);
-
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x8].satd = PFX(pixel_satd_4x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x16].satd = PFX(pixel_satd_8x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x32].satd = PFX(pixel_satd_16x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x64].satd = PFX(pixel_satd_32x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x4].satd = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x8].satd = PFX(pixel_satd_8x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x16].satd = PFX(pixel_satd_4x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x16].satd = PFX(pixel_satd_16x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x32].satd = PFX(pixel_satd_8x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x32].satd = PFX(pixel_satd_32x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x64].satd = PFX(pixel_satd_16x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x12].satd = PFX(pixel_satd_8x12_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x4].satd = PFX(pixel_satd_8x4_sve);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x24].satd = PFX(pixel_satd_16x24_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_12x32].satd = PFX(pixel_satd_12x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_16x8].satd = PFX(pixel_satd_16x8_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_4x32].satd = PFX(pixel_satd_4x32_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x48].satd = PFX(pixel_satd_32x48_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_24x64].satd = PFX(pixel_satd_24x64_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_32x16].satd = PFX(pixel_satd_32x16_neon);
- p.chroma[X265_CSP_I422].pu[CHROMA_422_8x64].satd = PFX(pixel_satd_8x64_neon);
-
- // sa8d
- p.cu[BLOCK_4x4].sa8d = PFX(pixel_satd_4x4_sve);
- p.cu[BLOCK_8x8].sa8d = PFX(pixel_sa8d_8x8_neon);
- p.cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_8x8].sa8d = PFX(pixel_satd_4x4_sve);
- p.chroma[X265_CSP_I420].cu[BLOCK_16x16].sa8d = PFX(pixel_sa8d_16x16_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_32x32].sa8d = PFX(pixel_sa8d_32x32_neon);
- p.chroma[X265_CSP_I420].cu[BLOCK_64x64].sa8d = PFX(pixel_sa8d_64x64_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_8x16].sa8d = PFX(pixel_sa8d_8x16_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_16x32].sa8d = PFX(pixel_sa8d_16x32_neon);
- p.chroma[X265_CSP_I422].cu[BLOCK_422_32x64].sa8d = PFX(pixel_sa8d_32x64_neon);
-
// dequant_scaling
p.dequant_scaling = PFX(dequant_scaling_sve2);
p.dequant_normal = PFX(dequant_normal_sve2);
@@ -1888,57 +1152,32 @@ void setupSve2Primitives(EncoderPrimitives &p)
p.cu[BLOCK_16x16].normFact = PFX(normFact16_sve2);
p.cu[BLOCK_32x32].normFact = PFX(normFact32_sve2);
p.cu[BLOCK_64x64].normFact = PFX(normFact64_sve2);
-
- // psy_cost_pp
- p.cu[BLOCK_4x4].psy_cost_pp = PFX(psyCost_4x4_neon);
-
- p.weight_pp = PFX(weight_pp_neon);
-#if !defined(__APPLE__)
- p.scanPosLast = PFX(scanPosLast_neon);
-#endif
- p.costCoeffNxN = PFX(costCoeffNxN_neon);
-#endif
-
- // quant
- p.quant = PFX(quant_sve);
- p.nquant = PFX(nquant_neon);
}
-#endif
+#else // !HIGH_BIT_DEPTH
+void setupSve2Primitives(EncoderPrimitives &)
+{
+}
+#endif // !HIGH_BIT_DEPTH
+#endif // defined(HAVE_SVE2)
void setupAssemblyPrimitives(EncoderPrimitives &p, int cpuMask)
{
-
-#ifdef HAVE_SVE2
- if (cpuMask & X265_CPU_SVE2)
- {
- setupSve2Primitives(p);
- }
- else if (cpuMask & X265_CPU_SVE)
- {
- setupSvePrimitives(p);
- }
- else if (cpuMask & X265_CPU_NEON)
+ if (cpuMask & X265_CPU_NEON)
{
setupNeonPrimitives(p);
}
-
-#elif defined(HAVE_SVE)
+#ifdef HAVE_SVE
if (cpuMask & X265_CPU_SVE)
{
setupSvePrimitives(p);
}
- else if (cpuMask & X265_CPU_NEON)
- {
- setupNeonPrimitives(p);
- }
-
-#else
- if (cpuMask & X265_CPU_NEON)
+#endif
+#ifdef HAVE_SVE2
+ if (cpuMask & X265_CPU_SVE2)
{
- setupNeonPrimitives(p);
+ setupSve2Primitives(p);
}
#endif
-
}
void setupIntrinsicPrimitives(EncoderPrimitives &p, int cpuMask)
--
2.42.1
-------------- next part --------------
A non-text attachment was scrubbed...
Name: v2-0006-AArch64-Refactor-setup-of-optimised-assembly-prim.patch
Type: text/x-patch
Size: 69109 bytes
Desc: not available
URL: <http://mailman.videolan.org/pipermail/x265-devel/attachments/20240730/60de3624/attachment-0001.bin>
More information about the x265-devel mailing list