[x264-devel] [PATCH 1/1] arm: use available neon functions for intra_sa8d/sad/satd_x3
Janne Grunau
janne-x264 at jannau.net
Thu Mar 13 09:44:30 CET 2014
4% faster on main/medium and 15% faster on baseline/superfast on a Cortex-A9.
---
common/arm/predict.h | 10 ++++++++++
common/pixel.c | 26 ++++++++++++++++++++++++++
2 files changed, 36 insertions(+)
diff --git a/common/arm/predict.h b/common/arm/predict.h
index 0d07c2a..6cf2f5f 100644
--- a/common/arm/predict.h
+++ b/common/arm/predict.h
@@ -26,6 +26,16 @@
#ifndef X264_ARM_PREDICT_H
#define X264_ARM_PREDICT_H
+void x264_predict_8x8_v_neon( pixel *src, pixel edge[36] );
+void x264_predict_8x8_h_neon( pixel *src, pixel edge[36] );
+void x264_predict_8x8_dc_neon( pixel *src, pixel edge[36] );
+void x264_predict_8x8c_dc_neon( pixel *src );
+void x264_predict_8x8c_h_neon( pixel *src );
+void x264_predict_8x8c_v_neon( pixel *src );
+void x264_predict_16x16_v_neon( pixel *src );
+void x264_predict_16x16_h_neon( pixel *src );
+void x264_predict_16x16_dc_neon( pixel *src );
+
void x264_predict_4x4_init_arm( int cpu, x264_predict_t pf[12] );
void x264_predict_8x8_init_arm( int cpu, x264_predict8x8_t pf[12], x264_predict_8x8_filter_t *predict_filter );
void x264_predict_8x8c_init_arm( int cpu, x264_predict_t pf[7] );
diff --git a/common/pixel.c b/common/pixel.c
index e16f292..b389792 100644
--- a/common/pixel.c
+++ b/common/pixel.c
@@ -36,6 +36,7 @@
#endif
#if ARCH_ARM
# include "arm/pixel.h"
+# include "arm/predict.h"
#endif
#if ARCH_UltraSPARC
# include "sparc/pixel.h"
@@ -532,6 +533,10 @@ INTRA_MBCMP_8x8(sa8d,, _c )
INTRA_MBCMP_8x8( sad, _mmx2, _c )
INTRA_MBCMP_8x8(sa8d, _sse2, _sse2 )
#endif
+#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+INTRA_MBCMP_8x8( sad, _neon, _neon )
+INTRA_MBCMP_8x8(sa8d, _neon, _neon )
+#endif
#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma, cpu, cpu2 )\
void x264_intra_##mbcmp##_x3_##size##chroma##cpu( pixel *fenc, pixel *fdec, int res[3] )\
@@ -587,6 +592,16 @@ INTRA_MBCMP(satd, 8x16, dc, h, v, c, _avx, _mmx2 )
INTRA_MBCMP(satd, 8x16, dc, h, v, c, _xop, _mmx2 )
#endif
#endif
+#if !HIGH_BIT_DEPTH && HAVE_ARMV6
+INTRA_MBCMP( sad, 4x4, v, h, dc, , _neon, _c )
+INTRA_MBCMP(satd, 4x4, v, h, dc, , _neon, _c )
+INTRA_MBCMP( sad, 8x8, dc, h, v, c, _neon, _neon )
+INTRA_MBCMP(satd, 8x8, dc, h, v, c, _neon, _neon )
+INTRA_MBCMP( sad, 8x16, dc, h, v, c, _neon, _c )
+INTRA_MBCMP(satd, 8x16, dc, h, v, c, _neon, _c )
+INTRA_MBCMP( sad, 16x16, v, h, dc, , _neon, _neon )
+INTRA_MBCMP(satd, 16x16, v, h, dc, , _neon, _neon )
+#endif
// No C implementation of intra_satd_x9. See checkasm for its behavior,
// or see x264_mb_analyse_intra for the entirely different algorithm we
@@ -1352,6 +1367,17 @@ void x264_pixel_init( int cpu, x264_pixel_function_t *pixf )
pixf->var2[PIXEL_8x8] = x264_pixel_var2_8x8_neon;
pixf->var2[PIXEL_8x16] = x264_pixel_var2_8x16_neon;
+ pixf->intra_sad_x3_4x4 = x264_intra_sad_x3_4x4_neon;
+ pixf->intra_satd_x3_4x4 = x264_intra_satd_x3_4x4_neon;
+ pixf->intra_sad_x3_8x8 = x264_intra_sad_x3_8x8_neon;
+ pixf->intra_sa8d_x3_8x8 = x264_intra_sa8d_x3_8x8_neon;
+ pixf->intra_sad_x3_8x8c = x264_intra_sad_x3_8x8c_neon;
+ pixf->intra_satd_x3_8x8c = x264_intra_satd_x3_8x8c_neon;
+ pixf->intra_sad_x3_8x16c = x264_intra_sad_x3_8x16c_neon;
+ pixf->intra_satd_x3_8x16c = x264_intra_satd_x3_8x16c_neon;
+ pixf->intra_sad_x3_16x16 = x264_intra_sad_x3_16x16_neon;
+ pixf->intra_satd_x3_16x16 = x264_intra_satd_x3_16x16_neon;
+
pixf->ssim_4x4x2_core = x264_pixel_ssim_4x4x2_core_neon;
pixf->ssim_end4 = x264_pixel_ssim_end4_neon;
--
1.9.0
More information about the x264-devel mailing list