[x264-devel] [PATCH 5/9] aarch64: quantization and level-run NEON asm
Janne Grunau
janne-x264 at jannau.net
Sat Jul 19 20:57:51 CEST 2014
Ported from the ARM NEON asm.
---
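Note for reviewers: the NEON routines below implement the same per-coefficient
operation as the scalar QUANT_ONE macro in common/quant.c (visible in the
context of the last hunk). A rough C model of what one lane of QUANT_TWO
computes, with illustrative names of my own choosing rather than the actual
x264 prototypes, is:

    #include <stdint.h>
    #include <stdlib.h>

    /* Sketch only: abs + bias, 16x16->32 unsigned multiply, >>16, restore sign. */
    static inline int16_t quant_coef( int16_t coef, uint16_t mf, uint16_t bias )
    {
        int16_t  sign = coef >> 15;                       /* sshr #15: 0 or -1      */
        uint16_t mag  = (uint16_t)(abs( coef ) + bias);   /* abs + add (16-bit)     */
        uint32_t prod = (uint32_t)mag * mf;               /* umull: 16x16 -> 32-bit */
        int16_t  q    = (int16_t)(prod >> 16);            /* shrn #16               */
        return (q ^ sign) - sign;                         /* eor/sub: restore sign  */
    }

QUANT_END then reduces the OR of all quantized coefficients to a 0/1 return
value; quant_4x4x4 packs one such bit per 4x4 sub-block into bits 0-3.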
Makefile | 3 +-
common/aarch64/quant-a.S | 386 +++++++++++++++++++++++++++++++++++++++++++++++
common/aarch64/quant.h | 47 ++++++
common/quant.c | 13 +-
4 files changed, 447 insertions(+), 2 deletions(-)
create mode 100644 common/aarch64/quant-a.S
create mode 100644 common/aarch64/quant.h
diff --git a/Makefile b/Makefile
index d68d3d8..0ba072a 100644
--- a/Makefile
+++ b/Makefile
@@ -126,7 +126,8 @@ endif
# AArch64 NEON optims
ifeq ($(ARCH),AARCH64)
ifneq ($(AS),)
-ASMSRC += common/aarch64/pixel-a.S
+ASMSRC += common/aarch64/pixel-a.S \
+ common/aarch64/quant-a.S
SRCS +=
OBJASM = $(ASMSRC:%.S=%.o)
endif
diff --git a/common/aarch64/quant-a.S b/common/aarch64/quant-a.S
new file mode 100644
index 0000000..02b71b2
--- /dev/null
+++ b/common/aarch64/quant-a.S
@@ -0,0 +1,386 @@
+/****************************************************************************
+ * quant.S: aarch64 quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2009-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "asm.S"
+
+.macro QUANT_TWO bias0 bias1 mf0_1 mf2_3 mask
+ add v18.8h, v18.8h, \bias0
+ add v19.8h, v19.8h, \bias1
+ umull v20.4s, v18.4h, \mf0_1\().4h
+ umull2 v21.4s, v18.8h, \mf0_1\().8h
+ umull v22.4s, v19.4h, \mf2_3\().4h
+ umull2 v23.4s, v19.8h, \mf2_3\().8h
+ sshr v16.8h, v16.8h, #15
+ sshr v17.8h, v17.8h, #15
+ shrn v18.4h, v20.4s, #16
+ shrn2 v18.8h, v21.4s, #16
+ shrn v19.4h, v22.4s, #16
+ shrn2 v19.8h, v23.4s, #16
+ eor v18.16b, v18.16b, v16.16b
+ eor v19.16b, v19.16b, v17.16b
+ sub v18.8h, v18.8h, v16.8h
+ sub v19.8h, v19.8h, v17.8h
+ orr \mask, v18.16b, v19.16b
+ st1 {v18.8h,v19.8h}, [x0], #32
+.endm
+
+.macro QUANT_END d
+ fmov x2, \d
+ mov w0, #0
+ tst x2, x2
+ cinc w0, w0, ne
+ ret
+.endm
+
+// quant_2x2_dc( int16_t dct[4], int mf, int bias )
+function x264_quant_2x2_dc_neon, export=1
+ ld1 {v0.4h}, [x0]
+ dup v2.4h, w2
+ dup v1.4h, w1
+ abs v3.4h, v0.4h
+ add v3.4h, v3.4h, v2.4h
+ umull v3.4s, v3.4h, v1.4h
+ sshr v0.4h, v0.4h, #15
+ shrn v3.4h, v3.4s, #16
+ eor v3.8b, v3.8b, v0.8b
+ sub v3.4h, v3.4h, v0.4h
+ st1 {v3.4h}, [x0]
+ QUANT_END d3
+endfunc
+
+// quant_4x4_dc( int16_t dct[16], int mf, int bias )
+function x264_quant_4x4_dc_neon, export=1
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ dup v0.8h, w2
+ dup v2.8h, w1
+ QUANT_TWO v0.8h, v0.8h, v2, v2, v0.16b
+ uqxtn v0.8b, v0.8h
+ QUANT_END d0
+endfunc
+
+// quant_4x4( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4_neon, export=1
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ ld1 {v0.8h,v1.8h}, [x2]
+ ld1 {v2.8h,v3.8h}, [x1]
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v0.16b
+ uqxtn v0.8b, v0.8h
+ QUANT_END d0
+endfunc
+
+// quant_4x4x4( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] )
+function x264_quant_4x4x4_neon, export=1
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ ld1 {v0.8h,v1.8h}, [x2]
+ ld1 {v2.8h,v3.8h}, [x1]
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v6.16b
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v7.16b
+ uqxtn v4.8b, v4.8h
+ uqxtn v7.8b, v7.8h
+ uqxtn v6.8b, v6.8h
+ uqxtn v5.8b, v5.8h
+ fmov x7, d7
+ fmov x6, d6
+ fmov x5, d5
+ fmov x4, d4
+ mov w0, #0
+ tst x7, x7
+ cinc w0, w0, ne
+ lsl w0, w0, #1
+ tst x6, x6
+ cinc w0, w0, ne
+ lsl w0, w0, #1
+ tst x5, x5
+ cinc w0, w0, ne
+ lsl w0, w0, #1
+ tst x4, x4
+ cinc w0, w0, ne
+ ret
+endfunc
+
+// quant_8x8( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
+function x264_quant_8x8_neon, export=1
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ ld1 {v0.8h,v1.8h}, [x2], #32
+ ld1 {v2.8h,v3.8h}, [x1], #32
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v4.16b
+.rept 3
+ ld1 {v16.8h,v17.8h}, [x0]
+ abs v18.8h, v16.8h
+ abs v19.8h, v17.8h
+ ld1 {v0.8h,v1.8h}, [x2], #32
+ ld1 {v2.8h,v3.8h}, [x1], #32
+ QUANT_TWO v0.8h, v1.8h, v2, v3, v5.16b
+ orr v4.16b, v4.16b, v5.16b
+.endr
+ uqxtn v0.8b, v4.8h
+ QUANT_END d0
+endfunc
+
+.macro DEQUANT_START mf_size offset dc=no
+ mov w3, #0x2b
+ mul w3, w3, w2
+ lsr w3, w3, #8 // i_qbits = i_qp / 6
+ add w5, w3, w3, lsl #1
+ sub w2, w2, w5, lsl #1 // i_mf = i_qp % 6
+ lsl w2, w2, #\mf_size
+.ifc \dc,no
+ add x1, x1, w2, sxtw // dequant_mf[i_mf]
+.else
+ ldr x1, [x1, w2, sxtw] // dequant_mf[i_mf][0][0]
+.endif
+ subs w3, w3, #\offset // 6 for 8x8
+.endm
+
+// dequant_4x4( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+.macro DEQUANT size bits
+function x264_dequant_\size\()_neon, export=1
+ DEQUANT_START \bits+2, \bits
+.ifc \size, 8x8
+ mov w2, #4
+.endif
+ b.lt dequant_\size\()_rshift
+
+ dup v31.8h, w3
+dequant_\size\()_lshift_loop:
+.ifc \size, 8x8
+ subs w2, w2, #1
+.endif
+ ld1 {v16.4s}, [x1], #16
+ ld1 {v17.4s}, [x1], #16
+ sqxtn v2.4h, v16.4s
+ ld1 {v18.4s}, [x1], #16
+ sqxtn2 v2.8h, v17.4s
+ ld1 {v19.4s}, [x1], #16
+ sqxtn v3.4h, v18.4s
+ ld1 {v0.8h,v1.8h}, [x0]
+ sqxtn2 v3.8h, v19.4s
+ mul v0.8h, v0.8h, v2.8h
+ mul v1.8h, v1.8h, v3.8h
+ sshl v0.8h, v0.8h, v31.8h
+ sshl v1.8h, v1.8h, v31.8h
+ st1 {v0.8h,v1.8h}, [x0], #32
+.ifc \size, 8x8
+ b.gt dequant_\size\()_lshift_loop
+.endif
+ ret
+
+dequant_\size\()_rshift:
+ dup v31.4s, w3
+ neg w3, w3
+ mov w5, #1
+ sub w3, w3, #1
+ lsl w5, w5, w3
+
+.ifc \size, 8x8
+dequant_\size\()_rshift_loop:
+ subs w2, w2, #1
+.endif
+ ld1 {v16.4s}, [x1], #16
+ ld1 {v17.4s}, [x1], #16
+ sqxtn v2.4h, v16.4s
+ ld1 {v18.4s}, [x1], #16
+ dup v16.4s, w5
+ sqxtn2 v2.8h, v17.4s
+ ld1 {v19.4s}, [x1], #16
+ dup v17.4s, w5
+ sqxtn v3.4h, v18.4s
+ ld1 {v0.8h,v1.8h}, [x0]
+ dup v18.4s, w5
+ sqxtn2 v3.8h, v19.4s
+ dup v19.4s, w5
+
+ smlal v16.4s, v0.4h, v2.4h
+ smlal2 v17.4s, v0.8h, v2.8h
+ smlal v18.4s, v1.4h, v3.4h
+ smlal2 v19.4s, v1.8h, v3.8h
+ sshl v16.4s, v16.4s, v31.4s
+ sshl v17.4s, v17.4s, v31.4s
+ sshl v18.4s, v18.4s, v31.4s
+ sshl v19.4s, v19.4s, v31.4s
+
+ sqxtn v0.4h, v16.4s
+ sqxtn2 v0.8h, v17.4s
+ sqxtn v1.4h, v18.4s
+ sqxtn2 v1.8h, v19.4s
+ st1 {v0.8h,v1.8h}, [x0], #32
+.ifc \size, 8x8
+ b.gt dequant_\size\()_rshift_loop
+.endif
+ ret
+endfunc
+.endm
+
+DEQUANT 4x4, 4
+DEQUANT 8x8, 6
+
+// dequant_4x4_dc( int16_t dct[16], int dequant_mf[6][16], int i_qp )
+function x264_dequant_4x4_dc_neon, export=1
+ DEQUANT_START 6, 6, yes
+ b.lt dequant_4x4_dc_rshift
+
+ lsl w1, w1, w3
+ dup v2.8h, w1
+ ld1 {v0.8h,v1.8h}, [x0]
+
+ mul v0.8h, v0.8h, v2.8h
+ mul v1.8h, v1.8h, v2.8h
+ st1 {v0.8h,v1.8h}, [x0]
+ ret
+
+dequant_4x4_dc_rshift:
+ dup v4.8h, w1
+ dup v3.4s, w3
+ neg w3, w3
+ mov w5, #1
+ sub w3, w3, #1
+ lsl w5, w5, w3
+
+ dup v16.4s, w5
+ dup v17.4s, w5
+ ld1 {v0.8h,v1.8h}, [x0]
+ dup v18.4s, w5
+ dup v19.4s, w5
+
+ smlal v16.4s, v0.4h, v4.4h
+ smlal2 v17.4s, v0.8h, v4.8h
+ smlal v18.4s, v1.4h, v4.4h
+ smlal2 v19.4s, v1.8h, v4.8h
+ sshl v16.4s, v16.4s, v3.4s
+ sshl v17.4s, v17.4s, v3.4s
+ sshl v18.4s, v18.4s, v3.4s
+ sshl v19.4s, v19.4s, v3.4s
+
+ sqxtn v0.4h, v16.4s
+ sqxtn2 v0.8h, v17.4s
+ sqxtn v1.4h, v18.4s
+ sqxtn2 v1.8h, v19.4s
+ st1 {v0.8h,v1.8h}, [x0]
+ ret
+endfunc
+
+// int coeff_last( int16_t *l )
+function x264_coeff_last4_aarch64, export=1
+ ldr x2, [x0]
+ mov w4, #3
+ clz x0, x2
+ sub w0, w4, w0, lsr #4
+ ret
+endfunc
+
+function x264_coeff_last8_aarch64, export=1
+ ldr x3, [x0, #8]
+ mov w4, #7
+ clz x2, x3
+ cmp w2, #64
+ b.ne 1f
+ ldr x3, [x0]
+ sub w4, w4, #4
+ clz x2, x3
+1:
+ sub w0, w4, w2, lsr #4
+ ret
+endfunc
+
+.macro COEFF_LAST_1x size
+function x264_coeff_last\size\()_neon, export=1
+.if \size == 15
+ sub x0, x0, #2
+.endif
+ ld1 {v0.8h,v1.8h}, [x0]
+ uqxtn v0.8b, v0.8h
+ uqxtn2 v0.16b, v1.8h
+ cmtst v0.16b, v0.16b, v0.16b
+ shrn v0.8b, v0.8h, #4
+ fmov x1, d0
+ mov w3, #\size - 1
+ clz x2, x1
+ sub w0, w3, w2, lsr #2
+ ret
+endfunc
+.endm
+
+COEFF_LAST_1x 15
+COEFF_LAST_1x 16
+
+function x264_coeff_last64_neon, export=1
+ ld1 {v0.8h,v1.8h,v2.8h,v3.8h}, [x0], #64
+ movi v31.8h, #8
+ movi v30.8h, #1
+ uqxtn v0.8b, v0.8h
+ uqxtn2 v0.16b, v1.8h
+ ld1 {v4.8h,v5.8h,v6.8h,v7.8h}, [x0], #64
+ uqxtn v1.8b, v2.8h
+ uqxtn2 v1.16b, v3.8h
+ uqxtn v2.8b, v4.8h
+ uqxtn2 v2.16b, v5.8h
+ uqxtn v3.8b, v6.8h
+ uqxtn2 v3.16b, v7.8h
+
+ cmtst v0.16b, v0.16b, v0.16b
+ cmtst v1.16b, v1.16b, v1.16b
+ cmtst v2.16b, v2.16b, v2.16b
+ cmtst v3.16b, v3.16b, v3.16b
+
+ shrn v0.8b, v0.8h, #4
+ shrn2 v0.16b, v1.8h, #4
+ shrn v1.8b, v2.8h, #4
+ shrn2 v1.16b, v3.8h, #4
+
+ clz v0.4s, v0.4s
+ clz v1.4s, v1.4s
+
+ shrn v0.4h, v0.4s, #2
+ shrn2 v0.8h, v1.4s, #2
+
+ sub v0.8h, v31.8h, v0.8h
+ sshl v0.8h, v30.8h, v0.8h
+ shrn v0.8b, v0.8h, #1
+
+ fmov x2, d0
+ mov w3, #63
+ clz x2, x2
+ sub w0, w3, w2
+ ret
+endfunc
diff --git a/common/aarch64/quant.h b/common/aarch64/quant.h
new file mode 100644
index 0000000..dfcac25
--- /dev/null
+++ b/common/aarch64/quant.h
@@ -0,0 +1,47 @@
+/*****************************************************************************
+ * quant.h: aarch64 quantization and level-run
+ *****************************************************************************
+ * Copyright (C) 2005-2014 x264 project
+ *
+ * Authors: David Conrad <lessen42 at gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_AARCH64_QUANT_H
+#define X264_AARCH64_QUANT_H
+
+int x264_quant_2x2_dc_aarch64( int16_t dct[4], int mf, int bias );
+
+int x264_quant_2x2_dc_neon( int16_t dct[4], int mf, int bias );
+int x264_quant_4x4_dc_neon( int16_t dct[16], int mf, int bias );
+int x264_quant_4x4_neon( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_4x4x4_neon( int16_t dct[4][16], uint16_t mf[16], uint16_t bias[16] );
+int x264_quant_8x8_neon( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] );
+
+void x264_dequant_4x4_dc_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_4x4_neon( int16_t dct[16], int dequant_mf[6][16], int i_qp );
+void x264_dequant_8x8_neon( int16_t dct[64], int dequant_mf[6][64], int i_qp );
+
+int x264_coeff_last4_aarch64( int16_t * );
+int x264_coeff_last8_aarch64( int16_t * );
+int x264_coeff_last15_neon( int16_t * );
+int x264_coeff_last16_neon( int16_t * );
+int x264_coeff_last64_neon( int16_t * );
+
+#endif
diff --git a/common/quant.c b/common/quant.c
index 1a9e4dc..3515b2e 100644
--- a/common/quant.c
+++ b/common/quant.c
@@ -37,6 +37,9 @@
#if ARCH_ARM
# include "arm/quant.h"
#endif
+#if ARCH_AARCH64
+# include "aarch64/quant.h"
+#endif

#define QUANT_ONE( coef, mf, f ) \
{ \
@@ -729,7 +732,8 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last4 = x264_coeff_last4_arm;
pf->coeff_last8 = x264_coeff_last8_arm;
}
-
+#endif
+#if HAVE_ARMV6 || ARCH_AARCH64
if( cpu&X264_CPU_NEON )
{
pf->quant_2x2_dc = x264_quant_2x2_dc_neon;
@@ -745,6 +749,13 @@ void x264_quant_init( x264_t *h, int cpu, x264_quant_function_t *pf )
pf->coeff_last[DCT_LUMA_8x8] = x264_coeff_last64_neon;
}
#endif
+#if ARCH_AARCH64
+ if( cpu&X264_CPU_ARMV8 )
+ {
+ pf->coeff_last4 = x264_coeff_last4_aarch64;
+ pf->coeff_last8 = x264_coeff_last8_aarch64;
+ }
+#endif
#endif // HIGH_BIT_DEPTH
pf->coeff_last[DCT_LUMA_DC] = pf->coeff_last[DCT_CHROMAU_DC] = pf->coeff_last[DCT_CHROMAV_DC] =
pf->coeff_last[DCT_CHROMAU_4x4] = pf->coeff_last[DCT_CHROMAV_4x4] = pf->coeff_last[DCT_LUMA_4x4];
--
2.0.0