[x264-devel] [PATCH 11/23] aarch64: NEON asm for integral init
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:39 CET 2014
integral_init4h_neon and integral_init8h_neon are 3-4 times faster than
C. integral_init8v_neon is 6 times faster and integral_init4v_neon is 10
times faster.
---
common/aarch64/mc-a.S | 121 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/mc-c.c | 9 ++++
2 files changed, 130 insertions(+)
diff --git a/common/aarch64/mc-a.S b/common/aarch64/mc-a.S
index 351317e..83652f2 100644
--- a/common/aarch64/mc-a.S
+++ b/common/aarch64/mc-a.S
@@ -1363,3 +1363,124 @@ function x264_store_interleave_chroma_neon, export=1
ret
endfunc
+
+.macro integral4h p1, p2 // v0.8h = v5.8h + sum of 4 consecutive bytes at each of the 8 byte positions of p1 (p2 supplies the following bytes)
+ ext v1.8b, \p1\().8b, \p2\().8b, #1 // v1 = bytes 1..8 of the p1:p2 pair
+ ext v2.8b, \p1\().8b, \p2\().8b, #2 // v2 = bytes 2..9
+ ext v3.8b, \p1\().8b, \p2\().8b, #3 // v3 = bytes 3..10
+ uaddl v0.8h, \p1\().8b, v1.8b // widening add: b[x] + b[x+1] as 16-bit lanes
+ uaddl v4.8h, v2.8b, v3.8b // b[x+2] + b[x+3]
+ add v0.8h, v0.8h, v4.8h // v0 = 4-byte horizontal sums
+ add v0.8h, v0.8h, v5.8h // implicit input: the caller must have loaded v5 beforehand; v1-v4 are clobbered
+.endm
+
+function integral_init4h_neon, export=1 // x0 = 16-bit sum row (written), x1 = 8-bit pixel row (read), x2 = stride, counted in elements
+ sub x3, x0, x2, lsl #1 // x3 = sum row above: x2 16-bit elements, i.e. 2*x2 bytes, before x0
+ ld1 {v6.8b,v7.8b}, [x1], #16 // prime v6/v7 with the first 16 pixels
+1:
+ subs x2, x2, #16 // 16 sums are produced per iteration; flags are consumed by b.gt below
+ ld1 {v5.8h}, [x3], #16 // v5 = 8 sums of the row above (implicit input of integral4h)
+ integral4h v6, v7 // v0 = 4-pixel sums for the 8 positions in v6, plus v5
+ ld1 {v6.8b}, [x1], #8 // v6 is consumed, refill it with the next 8 pixels
+ ld1 {v5.8h}, [x3], #16 // next 8 sums of the row above
+ st1 {v0.8h}, [x0], #16
+ integral4h v7, v6 // second half: v7 is now the low part, fresh v6 the high part
+ ld1 {v7.8b}, [x1], #8 // refill v7 so the v6/v7 order holds again at the loop top
+ st1 {v0.8h}, [x0], #16
+ b.gt 1b // no instruction since the subs modifies the flags
+ ret
+endfunc
+
+.macro integral8h p1, p2, s // v0.8h = s.8h + sum of 8 consecutive bytes at each of the 8 byte positions of p1 (p2 supplies the following bytes)
+ ext v1.8b, \p1\().8b, \p2\().8b, #1 // v1..v7 = the p1:p2 byte pair shifted by 1..7 bytes
+ ext v2.8b, \p1\().8b, \p2\().8b, #2
+ ext v3.8b, \p1\().8b, \p2\().8b, #3
+ ext v4.8b, \p1\().8b, \p2\().8b, #4
+ ext v5.8b, \p1\().8b, \p2\().8b, #5
+ ext v6.8b, \p1\().8b, \p2\().8b, #6
+ ext v7.8b, \p1\().8b, \p2\().8b, #7
+ uaddl v0.8h, \p1\().8b, v1.8b // widening pairwise adds into 16-bit lanes: b[x] + b[x+1]
+ uaddl v2.8h, v2.8b, v3.8b // b[x+2] + b[x+3]
+ uaddl v4.8h, v4.8b, v5.8b // b[x+4] + b[x+5]
+ uaddl v6.8h, v6.8b, v7.8b // b[x+6] + b[x+7]
+ add v0.8h, v0.8h, v2.8h // reduce the four partial sums as a tree
+ add v4.8h, v4.8h, v6.8h
+ add v0.8h, v0.8h, v4.8h // v0 = 8-byte horizontal sums
+ add v0.8h, v0.8h, \s\().8h // add the caller-supplied 16-bit vector; v1-v7 are clobbered, so p1, p2 and s must not be v0-v7
+.endm
+
+function integral_init8h_neon, export=1 // x0 = 16-bit sum row (written), x1 = 8-bit pixel row (read), x2 = stride, counted in elements
+ sub x3, x0, x2, lsl #1 // x3 = sum row above: x2 16-bit elements, i.e. 2*x2 bytes, before x0
+ ld1 {v16.8b,v17.8b}, [x1], #16 // prime v16/v17 with the first 16 pixels (v0-v7 are scratch for integral8h)
+1:
+ subs x2, x2, #16 // 16 sums are produced per iteration; flags are consumed by b.gt below
+ ld1 {v18.8h}, [x3], #16 // v18 = 8 sums of the row above
+ integral8h v16, v17, v18 // v0 = 8-pixel sums for the 8 positions in v16, plus v18
+ ld1 {v16.8b}, [x1], #8 // v16 is consumed, refill it with the next 8 pixels
+ ld1 {v18.8h}, [x3], #16 // next 8 sums of the row above
+ st1 {v0.8h}, [x0], #16
+ integral8h v17, v16, v18 // second half: v17 is now the low part, fresh v16 the high part
+ ld1 {v17.8b}, [x1], #8 // refill v17 so the v16/v17 order holds again at the loop top
+ st1 {v0.8h}, [x0], #16
+ b.gt 1b // no instruction since the subs modifies the flags
+ ret
+endfunc
+
+function integral_init4v_neon, export=1 // x0 = 16-bit buffer updated in place, x1 = second 16-bit output buffer, x2 = stride in 16-bit elements
+ mov x3, x0 // x3 = read pointer for the row at x0 (x0 itself is the write pointer)
+ add x4, x0, x2, lsl #3 // x4 = x0 + 8*x2 bytes (presumably 4 rows of 16-bit elements below)
+ add x8, x0, x2, lsl #4 // x8 = x0 + 16*x2 bytes (presumably 8 rows below)
+ sub x2, x2, #8 // element count to process = x2 - 8
+ ld1 {v20.8h,v21.8h,v22.8h}, [x3], #48 // prime 24 elements from the x0 row
+ ld1 {v16.8h,v17.8h,v18.8h}, [x8], #48 // prime 24 elements from the x8 row
+1:
+ subs x2, x2, #16 // 16 elements per iteration; flags are consumed by b.gt at the bottom
+ ld1 {v24.8h,v25.8h}, [x4], #32 // 16 elements from the x4 row
+ ext v0.16b, v20.16b, v21.16b, #8 // 8 bytes = 4 halfwords: v0/v1 = x0 row shifted by 4 elements
+ ext v1.16b, v21.16b, v22.16b, #8
+ ext v2.16b, v16.16b, v17.16b, #8 // v2/v3 = x8 row shifted by 4 elements
+ ext v3.16b, v17.16b, v18.16b, #8
+ sub v24.8h, v24.8h, v20.8h // first output: x4 row minus x0 row
+ sub v25.8h, v25.8h, v21.8h
+ add v0.8h, v0.8h, v20.8h // v0/v1 = top[x] + top[x+4]
+ add v1.8h, v1.8h, v21.8h
+ add v2.8h, v2.8h, v16.8h // v2/v3 = bottom[x] + bottom[x+4]
+ add v3.8h, v3.8h, v17.8h
+ st1 {v24.8h}, [x1], #16 // first output goes to the x1 buffer
+ st1 {v25.8h}, [x1], #16
+ mov v20.16b, v22.16b // carry the last loaded vector over as the start of the next iteration
+ mov v16.16b, v18.16b
+ sub v0.8h, v2.8h, v0.8h // second output: bottom pair sum minus top pair sum
+ sub v1.8h, v3.8h, v1.8h
+ ld1 {v21.8h,v22.8h}, [x3], #32 // fetch the next 16 elements of both rows before x0 is overwritten below
+ ld1 {v17.8h,v18.8h}, [x8], #32
+ st1 {v0.8h}, [x0], #16 // second output overwrites the x0 row in place
+ st1 {v1.8h}, [x0], #16
+ b.gt 1b
+2: // label is not referenced within this function
+ ret
+endfunc
+
+function integral_init8v_neon, export=1 // x0 = 16-bit buffer updated in place, x1 = stride in 16-bit elements
+ add x2, x0, x1, lsl #4 // x2 = x0 + 16*x1 bytes (presumably 8 rows of 16-bit elements below)
+ sub x1, x1, #8 // element count to process = x1 - 8
+ ands x3, x1, #16 - 1 // only the flags are used: is the count a multiple of 16?
+ b.eq 1f // yes: go straight to the 16-wide loop
+ subs x1, x1, #8 // no: handle one 8-element step first (assumes the count is a multiple of 8 - TODO confirm)
+ ld1 {v0.8h}, [x0] // no post-increment: x0 is advanced by the store below
+ ld1 {v2.8h}, [x2], #16
+ sub v4.8h, v2.8h, v0.8h // out = lower row - current row
+ st1 {v4.8h}, [x0], #16
+ b.le 2f // nothing left after the 8-element step
+1: // NOTE(review): entered even when the count is 0, so one 16-element iteration always runs on this path
+ subs x1, x1, #16 // 16 elements per iteration; flags are consumed by b.gt at the bottom
+ ld1 {v0.8h,v1.8h}, [x0] // no post-increment: x0 is advanced by the two stores below
+ ld1 {v2.8h,v3.8h}, [x2], #32
+ sub v4.8h, v2.8h, v0.8h // out = lower row - current row
+ sub v5.8h, v3.8h, v1.8h
+ st1 {v4.8h}, [x0], #16
+ st1 {v5.8h}, [x0], #16
+ b.gt 1b
+2:
+ ret
+endfunc
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index 73f6df9..37fc354 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -89,6 +89,10 @@ void x264_mc_copy_w8_neon ( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_copy_w16_neon( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
void x264_mc_chroma_neon( uint8_t *, uint8_t *, intptr_t, uint8_t *, intptr_t, int, int, int, int );
+void integral_init4h_neon( uint16_t *, uint8_t *, intptr_t );
+void integral_init4v_neon( uint16_t *, uint16_t *, intptr_t );
+void integral_init8h_neon( uint16_t *, uint8_t *, intptr_t );
+void integral_init8v_neon( uint16_t *, intptr_t );
void x264_frame_init_lowres_core_neon( uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, intptr_t, intptr_t, int, int );
#if !HIGH_BIT_DEPTH
@@ -245,5 +249,10 @@ void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
pf->get_ref = get_ref_neon;
pf->hpel_filter = x264_hpel_filter_neon;
pf->frame_init_lowres_core = x264_frame_init_lowres_core_neon;
+
+ pf->integral_init4h = integral_init4h_neon;
+ pf->integral_init8h = integral_init8h_neon;
+ pf->integral_init4v = integral_init4v_neon;
+ pf->integral_init8v = integral_init8v_neon;
#endif // !HIGH_BIT_DEPTH
}
--
2.1.3
More information about the x264-devel
mailing list