[x264-devel] [PATCH 2/2] aarch64: Update the var2 functions to the new signature
Martin Storsjö
martin at martin.st
Mon May 29 11:13:03 CEST 2017
The existing functions could easily be reused for the new signature by
simply calling them twice, once for each of the two 8-pixel-wide blocks.
That would give the following cycle counts from checkasm:
var2_8x8_c: 4110
var2_8x8_neon: 1505
var2_8x16_c: 8019
var2_8x16_neon: 2545
However, merging both passes into a single function is faster still,
bringing the cycle counts down to:
var2_8x8_neon: 1205
var2_8x16_neon: 2327
---
common/aarch64/pixel-a.S | 72 +++++++++++++++++++++++++++---------------------
common/aarch64/pixel.h | 4 +--
common/pixel.c | 4 +--
3 files changed, 44 insertions(+), 36 deletions(-)
diff --git a/common/aarch64/pixel-a.S b/common/aarch64/pixel-a.S
index 48209b2..047d3db 100644
--- a/common/aarch64/pixel-a.S
+++ b/common/aarch64/pixel-a.S
@@ -569,57 +569,65 @@ endfunc
.macro pixel_var2_8 h
function x264_pixel_var2_8x\h\()_neon, export=1
- ld1 {v16.8b}, [x0], x1
- ld1 {v18.8b}, [x2], x3
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x2], x3
- mov x5, \h - 4
- usubl v6.8h, v16.8b, v18.8b
- usubl v7.8h, v17.8b, v19.8b
- ld1 {v16.8b}, [x0], x1
- ld1 {v18.8b}, [x2], x3
- smull v2.4s, v6.4h, v6.4h
- smull2 v3.4s, v6.8h, v6.8h
- add v0.8h, v6.8h, v7.8h
- smlal v2.4s, v7.4h, v7.4h
- smlal2 v3.4s, v7.8h, v7.8h
+ mov x3, #16
+ ld1 {v16.8b}, [x0], #8
+ ld1 {v18.8b}, [x1], x3
+ ld1 {v17.8b}, [x0], #8
+ ld1 {v19.8b}, [x1], x3
+ mov x5, \h - 2
+ usubl v0.8h, v16.8b, v18.8b
+ usubl v1.8h, v17.8b, v19.8b
+ ld1 {v16.8b}, [x0], #8
+ ld1 {v18.8b}, [x1], x3
+ smull v2.4s, v0.4h, v0.4h
+ smull2 v3.4s, v0.8h, v0.8h
+ smull v4.4s, v1.4h, v1.4h
+ smull2 v5.4s, v1.8h, v1.8h
usubl v6.8h, v16.8b, v18.8b
-1: subs x5, x5, #2
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x2], x3
+1: subs x5, x5, #1
+ ld1 {v17.8b}, [x0], #8
+ ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
- ld1 {v16.8b}, [x0], x1
- ld1 {v18.8b}, [x2], x3
- smlal v2.4s, v7.4h, v7.4h
- smlal2 v3.4s, v7.8h, v7.8h
+ ld1 {v16.8b}, [x0], #8
+ ld1 {v18.8b}, [x1], x3
+ smlal v4.4s, v7.4h, v7.4h
+ smlal2 v5.4s, v7.8h, v7.8h
usubl v6.8h, v16.8b, v18.8b
- add v0.8h, v0.8h, v7.8h
+ add v1.8h, v1.8h, v7.8h
b.gt 1b
- ld1 {v17.8b}, [x0], x1
- ld1 {v19.8b}, [x2], x3
+ ld1 {v17.8b}, [x0], #8
+ ld1 {v19.8b}, [x1], x3
smlal v2.4s, v6.4h, v6.4h
smlal2 v3.4s, v6.8h, v6.8h
usubl v7.8h, v17.8b, v19.8b
add v0.8h, v0.8h, v6.8h
- smlal v2.4s, v7.4h, v7.4h
- add v0.8h, v0.8h, v7.8h
- smlal2 v3.4s, v7.8h, v7.8h
+ smlal v4.4s, v7.4h, v7.4h
+ add v1.8h, v1.8h, v7.8h
+ smlal2 v5.4s, v7.8h, v7.8h
saddlv s0, v0.8h
+ saddlv s1, v1.8h
add v2.4s, v2.4s, v3.4s
+ add v4.4s, v4.4s, v5.4s
mov w0, v0.s[0]
- addv s1, v2.4s
- sxtw x0, w0
mov w1, v1.s[0]
- mul x0, x0, x0
- str w1, [x4]
- sub x0, x1, x0, lsr # 6 + (\h >> 4)
+ addv s2, v2.4s
+ addv s4, v4.4s
+ mul w0, w0, w0
+ mul w1, w1, w1
+ mov w3, v2.s[0]
+ mov w4, v4.s[0]
+ sub w0, w3, w0, lsr # 6 + (\h >> 4)
+ sub w1, w4, w1, lsr # 6 + (\h >> 4)
+ str w3, [x2]
+ add w0, w0, w1
+ str w4, [x2, #4]
ret
endfunc
diff --git a/common/aarch64/pixel.h b/common/aarch64/pixel.h
index 8a7b83e..5206a0c 100644
--- a/common/aarch64/pixel.h
+++ b/common/aarch64/pixel.h
@@ -61,8 +61,8 @@ uint64_t x264_pixel_sa8d_satd_16x16_neon( uint8_t *, intptr_t, uint8_t *, intptr
uint64_t x264_pixel_var_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_8x16_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_var_16x16_neon( uint8_t *, intptr_t );
-int x264_pixel_var2_8x8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
-int x264_pixel_var2_8x16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int * );
+int x264_pixel_var2_8x8_neon ( uint8_t *, uint8_t *, int * );
+int x264_pixel_var2_8x16_neon( uint8_t *, uint8_t *, int * );
uint64_t x264_pixel_hadamard_ac_8x8_neon ( uint8_t *, intptr_t );
uint64_t x264_pixel_hadamard_ac_8x16_neon ( uint8_t *, intptr_t );
diff --git a/common/pixel.c b/common/pixel.c
index 7bedc9d..6d33921 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -1442,8 +1442,8 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_neon;
pixf->var[PIXEL_8x16] = x264_pixel_var_8x16_neon;
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_neon;
- //pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
- //pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+ pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
+ pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
pixf->vsad = x264_pixel_vsad_neon;
pixf->asd8 = x264_pixel_asd8_neon;
--
2.7.4
More information about the x264-devel
mailing list