[x264-devel] [PATCH 03/23] aarch64: NEON asm for missing x264_zigzag_* functions
Janne Grunau
janne-x264 at jannau.net
Thu Nov 27 08:56:31 CET 2014
zigzag_scan_4x4_field_neon, zigzag_sub_4x4_field_neon,
zigzag_sub_4x4ac_field_neon, zigzag_sub_4x4_frame_neon,
zigzag_sub_4x4ac_frame_neon more than 2 times faster
zigzag_scan_8x8_frame_neon, zigzag_scan_8x8_field_neon,
zigzag_sub_8x8_field_neon, zigzag_sub_8x8_frame_neon 4-5 times faster
zigzag_interleave_8x8_cavlc_neon 6 times faster
---
common/aarch64/dct-a.S | 327 +++++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/dct.h | 13 ++
common/dct.c | 24 +++-
3 files changed, 362 insertions(+), 2 deletions(-)
diff --git a/common/aarch64/dct-a.S b/common/aarch64/dct-a.S
index 7b54fbd..e33d067 100644
--- a/common/aarch64/dct-a.S
+++ b/common/aarch64/dct-a.S
@@ -32,6 +32,25 @@ const scan4x4_frame, align=4
.byte 26,27, 28,29, 22,23, 30,31
endconst
+// Field (interlaced) zigzag scan order for a 4x4 block, expressed as byte
+// indices for TBL: each pair of bytes selects one int16 coefficient.
+// Only the first eight coefficients differ from raster order.
+const scan4x4_field, align=4
+.byte 0,1, 2,3, 8,9, 4,5
+.byte 6,7, 10,11, 12,13, 14,15
+endconst
+
+// Frame zigzag order for zigzag_sub_4x4*_frame: one byte index per pixel,
+// used by TBL to gather fenc/fdec bytes directly in scan order.
+const sub4x4_frame, align=4
+.byte 0, 1, 4, 8
+.byte 5, 2, 3, 6
+.byte 9, 12, 13, 10
+.byte 7, 11, 14, 15
+endconst
+
+// Field zigzag order for zigzag_sub_4x4*_field: one byte index per pixel,
+// used by TBL to gather fenc/fdec bytes directly in scan order.
+const sub4x4_field, align=4
+.byte 0, 4, 1, 8
+.byte 12, 5, 9, 13
+.byte 2, 6, 10, 14
+.byte 3, 7, 11, 15
+endconst
+
// sum = a + (b>>shift) sub = (a>>shift) - b
.macro SUMSUB_SHR shift sum sub a b t0 t1
sshr \t0, \b, #\shift
@@ -655,6 +674,35 @@ function x264_sub8x8_dct_dc_neon, export=1
ret
endfunc
+// void zigzag_interleave_8x8_cavlc( dctcoef *dst, dctcoef *src, uint8_t *nnz )
+// x0 = dst (64 int16 coeffs), x1 = src (64 int16 coeffs), x2 = nnz
+// De-interleaves the 8x8 coefficients into four 4x4 groups for CAVLC and
+// stores a 0/1 nonzero flag per 16-coefficient group into nnz.
+function x264_zigzag_interleave_8x8_cavlc_neon, export=1
+ mov x3, #7
+ movi v31.4s, #1
+ // ld4 de-interleaves every 4th coefficient into v0..v3 / v4..v7
+ ld4 {v0.8h,v1.8h,v2.8h,v3.8h}, [x1], #64
+ ld4 {v4.8h,v5.8h,v6.8h,v7.8h}, [x1], #64
+ // reduce each group to per-lane maxima; signed levels viewed as unsigned,
+ // so any nonzero level (positive or negative) yields a nonzero max
+ umax v16.8h, v0.8h, v4.8h
+ umax v17.8h, v1.8h, v5.8h
+ umax v18.8h, v2.8h, v6.8h
+ umax v19.8h, v3.8h, v7.8h
+ st1 {v0.8h}, [x0], #16
+ st1 {v4.8h}, [x0], #16
+ umaxp v16.8h, v16.8h, v17.8h
+ umaxp v18.8h, v18.8h, v19.8h
+ st1 {v1.8h}, [x0], #16
+ st1 {v5.8h}, [x0], #16
+ umaxp v16.8h, v16.8h, v18.8h
+ st1 {v2.8h}, [x0], #16
+ st1 {v6.8h}, [x0], #16
+ // nonzero test must be unsigned >= 1 (cmhs), not > 1 (cmhi): cmhi would
+ // miss a group whose only nonzero level is +1 (32-bit lane value == 1)
+ cmhs v16.4s, v16.4s, v31.4s
+ st1 {v3.8h}, [x0], #16
+ and v16.16b, v16.16b, v31.16b
+ st1 {v7.8h}, [x0], #16
+ // nnz flags land at byte offsets 0, 1, 8, 9 (x3 = 7 skips to next row)
+ st1 {v16.b}[0], [x2], #1
+ st1 {v16.b}[4], [x2], x3
+ st1 {v16.b}[8], [x2], #1
+ st1 {v16.b}[12], [x2]
+ ret
+endfunc
+
function x264_zigzag_scan_4x4_frame_neon, export=1
movrel x2, scan4x4_frame
ld1 {v0.16b,v1.16b}, [x1]
@@ -664,3 +712,282 @@ function x264_zigzag_scan_4x4_frame_neon, export=1
st1 {v2.16b,v3.16b}, [x0]
ret
endfunc
+
+// int zigzag_sub_4x4[ac]_{frame,field}( dctcoef level[16], const pixel *p_src,
+//                                       pixel *p_dst [, dctcoef *dc] )
+// Gathers fenc (x1) and fdec (x2) pixels in zigzag order via TBL, stores
+// level[] = fenc - fdec, copies the fenc block over fdec, and returns
+// nonzero iff any output level is nonzero.  The \ac variant extracts
+// level[0] to *dc (x3) and zeroes it in level[].
+.macro zigzag_sub_4x4 f ac
+function x264_zigzag_sub_4x4\ac\()_\f\()_neon, export=1
+ mov x9, #FENC_STRIDE
+ mov x4, #FDEC_STRIDE
+ movrel x5, sub4x4_\f
+ mov x6, x2
+ ld1 {v0.s}[0], [x1], x9
+ ld1 {v0.s}[1], [x1], x9
+ ld1 {v0.s}[2], [x1], x9
+ ld1 {v0.s}[3], [x1], x9
+ ld1 {v16.16b}, [x5]
+ ld1 {v1.s}[0], [x2], x4
+ ld1 {v1.s}[1], [x2], x4
+ ld1 {v1.s}[2], [x2], x4
+ ld1 {v1.s}[3], [x2], x4
+ // v2/v3 = fenc/fdec pixels permuted into zigzag order
+ tbl v2.16b, {v0.16b}, v16.16b
+ tbl v3.16b, {v1.16b}, v16.16b
+ // copy reconstructed source rows into fdec (interleaved with the math)
+ st1 {v0.s}[0], [x6], x4
+ usubl v4.8h, v2.8b, v3.8b
+.ifc \ac, ac
+ dup h7, v4.h[0]
+ ins v4.h[0], wzr
+ fmov w5, s7
+ strh w5, [x3]
+ .endif
+ usubl2 v5.8h, v2.16b, v3.16b
+ st1 {v0.s}[1], [x6], x4
+ // any nonzero level survives the unsigned max reduction
+ umax v6.8h, v4.8h, v5.8h
+ umaxv h6, v6.8h
+ st1 {v0.s}[2], [x6], x4
+ fmov w7, s6
+ st1 {v0.s}[3], [x6], x4
+ cmp w7, #0
+ st1 {v4.8h,v5.8h}, [x0]
+ cset w0, ne
+ ret
+endfunc
+.endm
+
+zigzag_sub_4x4 field
+zigzag_sub_4x4 field, ac
+zigzag_sub_4x4 frame
+zigzag_sub_4x4 frame, ac
+
+// void zigzag_scan_4x4_field( int16_t level[16], int16_t dct[16] )
+// x0 = level, x1 = dct
+function x264_zigzag_scan_4x4_field_neon, export=1
+ movrel x2, scan4x4_field
+ ld1 {v0.8h,v1.8h}, [x1]
+ ld1 {v16.16b}, [x2]
+ // only the first 8 coefficients need permuting; the last 8 (v1) already
+ // match the field scan order and are stored through unchanged
+ tbl v0.16b, {v0.16b}, v16.16b
+ st1 {v0.8h,v1.8h}, [x0]
+ ret
+endfunc
+
+// void zigzag_scan_8x8_frame( int16_t level[64], int16_t dct[64] )
+// x0 = level, x1 = dct
+// TBL can index at most four source registers (64 bytes), so each output
+// vector uses a sliding 4-register window; scan positions that fall outside
+// a window return 0 from TBL and are patched afterwards with mov.
+function x264_zigzag_scan_8x8_frame_neon, export=1
+ movrel x2, scan8x8_frame
+ ld1 {v0.8h,v1.8h}, [x1], #32
+ ld1 {v2.8h,v3.8h}, [x1], #32
+ ld1 {v4.8h,v5.8h}, [x1], #32
+ ld1 {v6.8h,v7.8h}, [x1]
+ ld1 {v16.16b,v17.16b}, [x2], #32
+ ld1 {v18.16b,v19.16b}, [x2], #32
+ ld1 {v20.16b,v21.16b}, [x2], #32
+ ld1 {v22.16b,v23.16b}, [x2], #32
+ tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
+ tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
+ tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
+ tbl v27.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v19.16b
+ tbl v28.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v20.16b
+ tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v21.16b
+ tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v22.16b
+ tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v23.16b
+ // patch the lanes whose source coefficient lies outside the TBL window
+ mov v25.h[6], v4.h[0]
+ mov v25.h[7], v5.h[0]
+ mov v26.h[0], v4.h[1]
+ mov v27.h[4], v7.h[0]
+ mov v28.h[7], v4.h[4]
+ mov v29.h[7], v3.h[6]
+ mov v30.h[0], v2.h[7]
+ mov v30.h[1], v3.h[7]
+ st1 {v24.8h,v25.8h}, [x0], #32
+ st1 {v26.8h,v27.8h}, [x0], #32
+ st1 {v28.8h,v29.8h}, [x0], #32
+ st1 {v30.8h,v31.8h}, [x0]
+ ret
+endfunc
+
+// Z(z) expands one int16 coefficient into its two TBL byte indices;
+// T(x,y) maps a (row,col) scan position to that coefficient.
+#define Z(z) 2*(z), 2*(z)+1
+#define T(x,y) Z(x*8+y)
+const scan8x8_frame, align=5
+ .byte T(0,0), T(1,0), T(0,1), T(0,2)
+ .byte T(1,1), T(2,0), T(3,0), T(2,1)
+ .byte T(1,2), T(0,3), T(0,4), T(1,3)
+ .byte T(2,2), T(3,1), T(4,0), T(5,0)
+ .byte T(4,1), T(3,2), T(2,3), T(1,4)
+ .byte T(0,5), T(0,6), T(1,5), T(2,4)
+// T is redefined with a row bias (x-3, x-0, x-4) so indices stay within the
+// 64-byte window of the 4-register TBL that consumes each table section.
+#undef T
+#define T(x,y) Z((x-3)*8+y)
+ .byte T(3,3), T(4,2), T(5,1), T(6,0)
+ .byte T(7,0), T(6,1), T(5,2), T(4,3)
+#undef T
+#define T(x,y) Z((x-0)*8+y)
+ .byte T(3,4), T(2,5), T(1,6), T(0,7)
+ .byte T(1,7), T(2,6), T(3,5), T(4,4)
+#undef T
+#define T(x,y) Z((x-4)*8+y)
+ .byte T(5,3), T(6,2), T(7,1), T(7,2)
+ .byte T(6,3), T(5,4), T(4,5), T(3,6)
+ .byte T(2,7), T(3,7), T(4,6), T(5,5)
+ .byte T(6,4), T(7,3), T(7,4), T(6,5)
+ .byte T(5,6), T(4,7), T(5,7), T(6,6)
+ .byte T(7,5), T(7,6), T(6,7), T(7,7)
+endconst
+
+// void zigzag_scan_8x8_field( int16_t level[64], int16_t dct[64] )
+// x0 = level, x1 = dct
+// Each TBL uses a sliding register window over the source coefficients;
+// the final 8 outputs need no permutation and are assembled with ext.
+function x264_zigzag_scan_8x8_field_neon, export=1
+ movrel x2, scan8x8_field
+ ld1 {v0.8h,v1.8h}, [x1], #32
+ ld1 {v2.8h,v3.8h}, [x1], #32
+ ld1 {v4.8h,v5.8h}, [x1], #32
+ ld1 {v6.8h,v7.8h}, [x1]
+ ld1 {v16.16b,v17.16b}, [x2], #32
+ ld1 {v18.16b,v19.16b}, [x2], #32
+ ld1 {v20.16b,v21.16b}, [x2], #32
+ ld1 {v22.16b}, [x2]
+ ext v31.16b, v7.16b, v7.16b, #4
+ tbl v24.16b, {v0.16b,v1.16b}, v16.16b
+ tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
+ tbl v26.16b, {v1.16b,v2.16b,v3.16b,v4.16b}, v18.16b
+ tbl v27.16b, {v2.16b,v3.16b,v4.16b,v5.16b}, v19.16b
+ tbl v28.16b, {v3.16b,v4.16b,v5.16b,v6.16b}, v20.16b
+ tbl v29.16b, {v4.16b,v5.16b,v6.16b}, v21.16b
+ tbl v30.16b, {v5.16b,v6.16b,v7.16b}, v22.16b
+ ext v31.16b, v6.16b, v31.16b, #12
+ st1 {v24.8h,v25.8h}, [x0], #32
+ st1 {v26.8h,v27.8h}, [x0], #32
+ st1 {v28.8h,v29.8h}, [x0], #32
+ st1 {v30.8h,v31.8h}, [x0]
+ ret
+endfunc
+
+// int zigzag_sub_8x8_{frame,field}( dctcoef level[64], const pixel *p_src,
+//                                   pixel *p_dst )
+// Gathers fenc (x1) and fdec (x2) pixels in zigzag order via TBL, stores
+// level[] = fenc - fdec (64 coefficients), copies the fenc block over fdec,
+// and returns nonzero iff any output level is nonzero.
+.macro zigzag_sub8x8 f
+function x264_zigzag_sub_8x8_\f\()_neon, export=1
+ movrel x4, sub8x8_\f
+ mov x5, #FENC_STRIDE
+ mov x6, #FDEC_STRIDE
+ mov x7, x2
+ ld1 {v0.d}[0], [x1], x5
+ ld1 {v0.d}[1], [x1], x5
+ ld1 {v1.d}[0], [x1], x5
+ ld1 {v1.d}[1], [x1], x5
+ ld1 {v2.d}[0], [x1], x5
+ ld1 {v2.d}[1], [x1], x5
+ ld1 {v3.d}[0], [x1], x5
+ ld1 {v3.d}[1], [x1]
+ ld1 {v4.d}[0], [x2], x6
+ ld1 {v4.d}[1], [x2], x6
+ ld1 {v5.d}[0], [x2], x6
+ ld1 {v5.d}[1], [x2], x6
+ ld1 {v6.d}[0], [x2], x6
+ ld1 {v6.d}[1], [x2], x6
+ ld1 {v7.d}[0], [x2], x6
+ ld1 {v7.d}[1], [x2]
+ ld1 {v16.16b,v17.16b}, [x4], #32
+ ld1 {v18.16b,v19.16b}, [x4], #32
+ // v24..v27 = fenc in zigzag order, v28..v31 = fdec in zigzag order
+ tbl v24.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v16.16b
+ tbl v25.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v17.16b
+ tbl v26.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v18.16b
+ tbl v27.16b, {v0.16b,v1.16b,v2.16b,v3.16b}, v19.16b
+ tbl v28.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v16.16b
+ tbl v29.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v17.16b
+ tbl v30.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v18.16b
+ tbl v31.16b, {v4.16b,v5.16b,v6.16b,v7.16b}, v19.16b
+ usubl v4.8h, v24.8b, v28.8b
+ usubl2 v5.8h, v24.16b, v28.16b
+ usubl v6.8h, v25.8b, v29.8b
+ usubl2 v7.8h, v25.16b, v29.16b
+ usubl v16.8h, v26.8b, v30.8b
+ usubl2 v17.8h, v26.16b, v30.16b
+ usubl v18.8h, v27.8b, v31.8b
+ usubl2 v19.8h, v27.16b, v31.16b
+ // unsigned max reduction: any nonzero difference survives to h22
+ umax v20.8h, v4.8h, v5.8h
+ umax v21.8h, v6.8h, v7.8h
+ umax v22.8h, v16.8h, v17.8h
+ umax v23.8h, v18.8h, v19.8h
+ umax v20.8h, v20.8h, v21.8h
+ umax v21.8h, v22.8h, v23.8h
+ umax v20.8h, v20.8h, v21.8h
+ umaxv h22, v20.8h
+ // copy the fenc rows over fdec (reconstruction for lossless path)
+ st1 {v0.d}[0], [x7], x6
+ st1 {v0.d}[1], [x7], x6
+ st1 {v1.d}[0], [x7], x6
+ st1 {v1.d}[1], [x7], x6
+ st1 {v2.d}[0], [x7], x6
+ st1 {v2.d}[1], [x7], x6
+ st1 {v3.d}[0], [x7], x6
+ st1 {v3.d}[1], [x7]
+ st1 {v4.8h,v5.8h}, [x0], #32
+ st1 {v6.8h,v7.8h}, [x0], #32
+ st1 {v16.8h,v17.8h}, [x0], #32
+ st1 {v18.8h,v19.8h}, [x0]
+ fmov w9, s22
+ cmp w9, #0
+ cset w0, ne
+ ret
+endfunc
+.endm
+
+zigzag_sub8x8 field
+zigzag_sub8x8 frame
+
+// Field scan order for zigzag_scan_8x8_field; as with scan8x8_frame, T is
+// periodically redefined with a row bias so every index fits the 64-byte
+// window of the TBL register group that reads that table section.
+#undef T
+#define T(x,y) Z(x*8+y)
+const scan8x8_field, align=5
+ .byte T(0,0), T(0,1), T(0,2), T(1,0)
+ .byte T(1,1), T(0,3), T(0,4), T(1,2)
+ .byte T(2,0), T(1,3), T(0,5), T(0,6)
+ .byte T(0,7), T(1,4), T(2,1), T(3,0)
+#undef T
+#define T(x,y) Z((x-1)*8+y)
+ .byte T(2,2), T(1,5), T(1,6), T(1,7)
+ .byte T(2,3), T(3,1), T(4,0), T(3,2)
+#undef T
+#define T(x,y) Z((x-2)*8+y)
+ .byte T(2,4), T(2,5), T(2,6), T(2,7)
+ .byte T(3,3), T(4,1), T(5,0), T(4,2)
+#undef T
+#define T(x,y) Z((x-3)*8+y)
+ .byte T(3,4), T(3,5), T(3,6), T(3,7)
+ .byte T(4,3), T(5,1), T(6,0), T(5,2)
+#undef T
+#define T(x,y) Z((x-4)*8+y)
+ .byte T(4,4), T(4,5), T(4,6), T(4,7)
+ .byte T(5,3), T(6,1), T(6,2), T(5,4)
+#undef T
+#define T(x,y) Z((x-5)*8+y)
+ .byte T(5,5), T(5,6), T(5,7), T(6,3)
+ .byte T(7,0), T(7,1), T(6,4), T(6,5)
+endconst
+
+
+// Pixel-gather tables for zigzag_sub_8x8_*: one byte index per pixel
+// (no Z() pairing, since the sub functions permute 8-bit pixels, not
+// int16 coefficients).  Note the swapped (y,x) parameter order.
+#undef T
+#define T(y,x) x*8+y
+const sub8x8_frame, align=5
+ .byte T(0,0), T(1,0), T(0,1), T(0,2)
+ .byte T(1,1), T(2,0), T(3,0), T(2,1)
+ .byte T(1,2), T(0,3), T(0,4), T(1,3)
+ .byte T(2,2), T(3,1), T(4,0), T(5,0)
+ .byte T(4,1), T(3,2), T(2,3), T(1,4)
+ .byte T(0,5), T(0,6), T(1,5), T(2,4)
+ .byte T(3,3), T(4,2), T(5,1), T(6,0)
+ .byte T(7,0), T(6,1), T(5,2), T(4,3)
+ .byte T(3,4), T(2,5), T(1,6), T(0,7)
+ .byte T(1,7), T(2,6), T(3,5), T(4,4)
+ .byte T(5,3), T(6,2), T(7,1), T(7,2)
+ .byte T(6,3), T(5,4), T(4,5), T(3,6)
+ .byte T(2,7), T(3,7), T(4,6), T(5,5)
+ .byte T(6,4), T(7,3), T(7,4), T(6,5)
+ .byte T(5,6), T(4,7), T(5,7), T(6,6)
+ .byte T(7,5), T(7,6), T(6,7), T(7,7)
+endconst
+
+// Field-scan pixel-gather table for zigzag_sub_8x8_field (one byte index
+// per pixel; uses the T(y,x) mapping defined above sub8x8_frame).
+const sub8x8_field, align=5
+ .byte T(0,0), T(0,1), T(0,2), T(1,0)
+ .byte T(1,1), T(0,3), T(0,4), T(1,2)
+ .byte T(2,0), T(1,3), T(0,5), T(0,6)
+ .byte T(0,7), T(1,4), T(2,1), T(3,0)
+ .byte T(2,2), T(1,5), T(1,6), T(1,7)
+ .byte T(2,3), T(3,1), T(4,0), T(3,2)
+ .byte T(2,4), T(2,5), T(2,6), T(2,7)
+ .byte T(3,3), T(4,1), T(5,0), T(4,2)
+ .byte T(3,4), T(3,5), T(3,6), T(3,7)
+ .byte T(4,3), T(5,1), T(6,0), T(5,2)
+ .byte T(4,4), T(4,5), T(4,6), T(4,7)
+ .byte T(5,3), T(6,1), T(6,2), T(5,4)
+ .byte T(5,5), T(5,6), T(5,7), T(6,3)
+ .byte T(7,0), T(7,1), T(6,4), T(6,5)
+ .byte T(6,6), T(6,7), T(7,2), T(7,3)
+ .byte T(7,4), T(7,5), T(7,6), T(7,7)
+endconst
diff --git a/common/aarch64/dct.h b/common/aarch64/dct.h
index 54c48b3..dfd4ec7 100644
--- a/common/aarch64/dct.h
+++ b/common/aarch64/dct.h
@@ -48,5 +48,18 @@ void x264_add8x8_idct8_neon( uint8_t *p_dst, int16_t dct[64] );
void x264_add16x16_idct8_neon( uint8_t *p_dst, int16_t dct[4][64] );
void x264_zigzag_scan_4x4_frame_neon( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_4x4_field_neon( int16_t level[16], int16_t dct[16] );
+void x264_zigzag_scan_8x8_frame_neon( int16_t level[64], int16_t dct[64] );
+void x264_zigzag_scan_8x8_field_neon( int16_t level[64], int16_t dct[64] );
+
+int x264_zigzag_sub_4x4_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
+int x264_zigzag_sub_4x4ac_field_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
+int x264_zigzag_sub_4x4_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst );
+int x264_zigzag_sub_4x4ac_frame_neon( dctcoef level[16], const pixel *p_src, pixel *p_dst, dctcoef *dc );
+
+/* 8x8 variants write 64 coefficients, so declare level[64] (the asm stores
+ * four 16-byte vector pairs); level[16] would misdocument the buffer size. */
+int x264_zigzag_sub_8x8_field_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
+int x264_zigzag_sub_8x8_frame_neon( dctcoef level[64], const pixel *p_src, pixel *p_dst );
+
+void x264_zigzag_interleave_8x8_cavlc_neon( dctcoef *dst, dctcoef *src, uint8_t *nnz );
#endif
diff --git a/common/dct.c b/common/dct.c
index 08f4e89..910c472 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -1003,8 +1003,20 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
}
#endif
#if HAVE_ARMV6 || ARCH_AARCH64
- if( cpu&X264_CPU_NEON )
- pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+ if( cpu&X264_CPU_NEON ) {
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_neon;
+#if ARCH_AARCH64
+ pf_interlaced->scan_4x4 = x264_zigzag_scan_4x4_field_neon;
+ pf_interlaced->scan_8x8 = x264_zigzag_scan_8x8_field_neon;
+ pf_interlaced->sub_4x4 = x264_zigzag_sub_4x4_field_neon;
+ pf_interlaced->sub_4x4ac = x264_zigzag_sub_4x4ac_field_neon;
+ pf_interlaced->sub_8x8 = x264_zigzag_sub_8x8_field_neon;
+ pf_progressive->scan_8x8 = x264_zigzag_scan_8x8_frame_neon;
+ pf_progressive->sub_4x4 = x264_zigzag_sub_4x4_frame_neon;
+ pf_progressive->sub_4x4ac = x264_zigzag_sub_4x4ac_frame_neon;
+ pf_progressive->sub_8x8 = x264_zigzag_sub_8x8_frame_neon;
+#endif // ARCH_AARCH64
+ }
#endif // HAVE_ARMV6 || ARCH_AARCH64
#endif // HIGH_BIT_DEPTH
@@ -1047,4 +1059,12 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
}
#endif // HIGH_BIT_DEPTH
#endif
+#if !HIGH_BIT_DEPTH
+#if ARCH_AARCH64
+ if( cpu&X264_CPU_NEON ) {
+ pf_interlaced->interleave_8x8_cavlc =
+ pf_progressive->interleave_8x8_cavlc = x264_zigzag_interleave_8x8_cavlc_neon;
+ }
+#endif // ARCH_AARCH64
+#endif // !HIGH_BIT_DEPTH
}
--
2.1.3
More information about the x264-devel
mailing list