[x264-devel] [PATCH 1/2] x86: Share the mbtree_propagate_list macro with aarch64
Martin Storsjö
martin at martin.st
Thu Sep 3 08:30:43 CEST 2015
This avoids having to duplicate the same code for all architectures
that implement only the internal part of this function in assembler.
---
Moved the x86 inline asm versions of CLIP_ADD into the shared
header.
---
common/aarch64/mc-c.c | 84 +-------------------------------
common/mc-int.h | 130 +++++++++++++++++++++++++++++++++++++++++++++++++
common/x86/mc-c.c | 105 +--------------------------------------
3 files changed, 133 insertions(+), 186 deletions(-)
create mode 100644 common/mc-int.h
diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index b94e3d3..8b98b94 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -26,6 +26,7 @@
#include "common/common.h"
#include "mc.h"
+#include "common/mc-int.h"
void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
@@ -205,88 +206,7 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
int height, int16_t *buf );
#endif // !HIGH_BIT_DEPTH
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
-#define CLIP_ADD2(s,x)\
-do\
-{\
- CLIP_ADD((s)[0], (x)[0]);\
- CLIP_ADD((s)[1], (x)[1]);\
-} while(0)
-
-void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
- int16_t *propagate_amount,
- uint16_t *lowres_costs,
- int16_t *output,
- int bipred_weight, int mb_y,
- int len );
-
-static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
- int16_t (*mvs)[2],
- int16_t *propagate_amount,
- uint16_t *lowres_costs,
- int bipred_weight, int mb_y,
- int len, int list )
-{
- int16_t *current = h->scratch_buffer2;
-
- x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
- lowres_costs, current,
- bipred_weight, mb_y, len );
-
- unsigned stride = h->mb.i_mb_stride;
- unsigned width = h->mb.i_mb_width;
- unsigned height = h->mb.i_mb_height;
-
- for( unsigned i = 0; i < len; current += 32 )
- {
- int end = X264_MIN( i+8, len );
- for( ; i < end; i++, current += 2 )
- {
- if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
- continue;
-
- unsigned mbx = current[0];
- unsigned mby = current[1];
- unsigned idx0 = mbx + mby * stride;
- unsigned idx2 = idx0 + stride;
-
- /* Shortcut for the simple/common case of zero MV */
- if( !M32( mvs[i] ) )
- {
- CLIP_ADD( ref_costs[idx0], current[16] );
- continue;
- }
-
- if( mbx < width-1 && mby < height-1 )
- {
- CLIP_ADD2( ref_costs+idx0, current+16 );
- CLIP_ADD2( ref_costs+idx2, current+32 );
- }
- else
- {
- /* Note: this takes advantage of unsigned representation to
- * catch negative mbx/mby. */
- if( mby < height )
- {
- if( mbx < width )
- CLIP_ADD( ref_costs[idx0+0], current[16] );
- if( mbx+1 < width )
- CLIP_ADD( ref_costs[idx0+1], current[17] );
- }
- if( mby+1 < height )
- {
- if( mbx < width )
- CLIP_ADD( ref_costs[idx2+0], current[32] );
- if( mbx+1 < width )
- CLIP_ADD( ref_costs[idx2+1], current[33] );
- }
- }
- }
- }
-}
-
-#undef CLIP_ADD
-#undef CLIP_ADD2
+PROPAGATE_LIST(neon)
void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
{
diff --git a/common/mc-int.h b/common/mc-int.h
new file mode 100644
index 0000000..d1e424f
--- /dev/null
+++ b/common/mc-int.h
@@ -0,0 +1,130 @@
+/*****************************************************************************
+ * mc-int.h: motion compensation internal defines
+ *****************************************************************************
+ * Copyright (C) 2003-2015 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ * Loren Merritt <lorenm at u.washington.edu>
+ * Fiona Glaser <fiona at x264.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+
+#if HAVE_X86_INLINE_ASM /* x86 with inline asm: the clipped adds are done in XMM registers */
+#define CLIP_ADD(s,x) /* s = s + x for one element, computed with paddsw */\
+do\
+{\
+ int temp;\
+ asm("movd %0, %%xmm0 \n" /* xmm0 <- s */\
+ "movd %2, %%xmm1 \n" /* xmm1 <- x */\
+ "paddsw %%xmm1, %%xmm0 \n" /* packed signed-saturating 16-bit add */\
+ "movd %%xmm0, %1 \n" /* temp <- sum */\
+ :"+m"(s), "=&r"(temp)\
+ :"m"(x)\
+ ); /* NOTE(review): xmm0/xmm1 are written but no clobber list is given - confirm this is safe */\
+ s = temp; /* the asm only writes temp; the result reaches s through this assignment */\
+} while(0)
+
+#define CLIP_ADD2(s,x) /* same add for two adjacent 16-bit elements; s and x are pointers */\
+do\
+{\
+ asm("movd %0, %%xmm0 \n"\
+ "movd %1, %%xmm1 \n"\
+ "paddsw %%xmm1, %%xmm0 \n"\
+ "movd %%xmm0, %0 \n" /* result is stored straight back to memory at s */\
+ :"+m"(M32(s)) /* M32 is defined elsewhere; presumably a 32-bit access - confirm */\
+ :"m"(M32(x))\
+ );\
+} while(0)
+#else /* no x86 inline asm: plain C versions */
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) /* add, then cap the sum at 32767 (upper bound only) */
+#define CLIP_ADD2(s,x) /* apply CLIP_ADD to elements [0] and [1] of s and x */\
+do\
+{\
+ CLIP_ADD((s)[0], (x)[0]);\
+ CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+#endif /* HAVE_X86_INLINE_ASM */
+
+#define PROPAGATE_LIST(cpu) /* defines x264_mbtree_propagate_list_<cpu>, a C wrapper around the _internal_<cpu> routine */\
+void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
+ uint16_t *lowres_costs, int16_t *output,\
+ int bipred_weight, int mb_y, int len );\
+/* Wrapper: calls the internal routine with h->scratch_buffer2 as its output, then adds the results into ref_costs. */\
+static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
+ int16_t *propagate_amount, uint16_t *lowres_costs,\
+ int bipred_weight, int mb_y, int len, int list )\
+{\
+ int16_t *current = h->scratch_buffer2;\
+\
+ x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
+ current, bipred_weight, mb_y, len );\
+\
+ unsigned stride = h->mb.i_mb_stride;\
+ unsigned width = h->mb.i_mb_width;\
+ unsigned height = h->mb.i_mb_height;\
+/* Scratch layout as read below, per group of 8 entries (48 int16_t): 8 x {mbx,mby}, then 8 x 2 values for row mby, then 8 x 2 values for row mby+1. */\
+ for( unsigned i = 0; i < len; current += 32 ) /* after a group, step over its 32 value slots to reach the next group */\
+ {\
+ int end = X264_MIN( i+8, len ); /* entries are handled in groups of up to 8 */\
+ for( ; i < end; i++, current += 2 )\
+ {\
+ if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) ) /* bit for this list is clear: skip entry i (current still advances) */\
+ continue;\
+/* Coordinates of entry i; idx0 addresses MB (mbx,mby) in ref_costs, idx2 the MB one row below it. */\
+ unsigned mbx = current[0];\
+ unsigned mby = current[1];\
+ unsigned idx0 = mbx + mby * stride;\
+ unsigned idx2 = idx0 + stride;\
+\
+ /* Zero MV (the simple/common case): only the MB at idx0 is updated. */\
+ if( !M32( mvs[i] ) )\
+ {\
+ CLIP_ADD( ref_costs[idx0], current[16] );\
+ continue;\
+ }\
+/* Both columns and both rows lie inside the frame: update the 2x2 MB block, one row (two columns) per CLIP_ADD2. */\
+ if( mbx < width-1 && mby < height-1 )\
+ {\
+ CLIP_ADD2( ref_costs+idx0, current+16 );\
+ CLIP_ADD2( ref_costs+idx2, current+32 );\
+ }\
+ else\
+ {\
+ /* Partly outside the frame: update only the in-frame MBs. mbx/mby are read from int16_t\
+ * into unsigned, so negative values become huge and fail the < width / < height tests. */\
+ if( mby < height )\
+ {\
+ if( mbx < width )\
+ CLIP_ADD( ref_costs[idx0+0], current[16] );\
+ if( mbx+1 < width )\
+ CLIP_ADD( ref_costs[idx0+1], current[17] );\
+ }\
+ if( mby+1 < height )\
+ {\
+ if( mbx < width )\
+ CLIP_ADD( ref_costs[idx2+0], current[32] );\
+ if( mbx+1 < width )\
+ CLIP_ADD( ref_costs[idx2+1], current[33] );\
+ }\
+ }\
+ }\
+ }\
+}
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index d868706..54798d1 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -31,6 +31,7 @@
#include "common/common.h"
#include "mc.h"
+#include "common/mc-int.h"
#define DECL_SUF( func, args )\
void func##_mmx2 args;\
@@ -589,112 +590,8 @@ PLANE_INTERLEAVE(sse2)
PLANE_INTERLEAVE(avx)
#endif
-#if HAVE_X86_INLINE_ASM
-#define CLIP_ADD(s,x)\
-do\
-{\
- int temp;\
- asm("movd %0, %%xmm0 \n"\
- "movd %2, %%xmm1 \n"\
- "paddsw %%xmm1, %%xmm0 \n"\
- "movd %%xmm0, %1 \n"\
- :"+m"(s), "=&r"(temp)\
- :"m"(x)\
- );\
- s = temp;\
-} while(0)
-
-#define CLIP_ADD2(s,x)\
-do\
-{\
- asm("movd %0, %%xmm0 \n"\
- "movd %1, %%xmm1 \n"\
- "paddsw %%xmm1, %%xmm0 \n"\
- "movd %%xmm0, %0 \n"\
- :"+m"(M32(s))\
- :"m"(M32(x))\
- );\
-} while(0)
-#else
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
-#define CLIP_ADD2(s,x)\
-do\
-{\
- CLIP_ADD((s)[0], (x)[0]);\
- CLIP_ADD((s)[1], (x)[1]);\
-} while(0)
-#endif
-
-#define PROPAGATE_LIST(cpu)\
-void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
- uint16_t *lowres_costs, int16_t *output,\
- int bipred_weight, int mb_y, int len );\
-\
-static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
- int16_t *propagate_amount, uint16_t *lowres_costs,\
- int bipred_weight, int mb_y, int len, int list )\
-{\
- int16_t *current = h->scratch_buffer2;\
-\
- x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
- current, bipred_weight, mb_y, len );\
-\
- unsigned stride = h->mb.i_mb_stride;\
- unsigned width = h->mb.i_mb_width;\
- unsigned height = h->mb.i_mb_height;\
-\
- for( unsigned i = 0; i < len; current += 32 )\
- {\
- int end = X264_MIN( i+8, len );\
- for( ; i < end; i++, current += 2 )\
- {\
- if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
- continue;\
-\
- unsigned mbx = current[0];\
- unsigned mby = current[1];\
- unsigned idx0 = mbx + mby * stride;\
- unsigned idx2 = idx0 + stride;\
-\
- /* Shortcut for the simple/common case of zero MV */\
- if( !M32( mvs[i] ) )\
- {\
- CLIP_ADD( ref_costs[idx0], current[16] );\
- continue;\
- }\
-\
- if( mbx < width-1 && mby < height-1 )\
- {\
- CLIP_ADD2( ref_costs+idx0, current+16 );\
- CLIP_ADD2( ref_costs+idx2, current+32 );\
- }\
- else\
- {\
- /* Note: this takes advantage of unsigned representation to\
- * catch negative mbx/mby. */\
- if( mby < height )\
- {\
- if( mbx < width )\
- CLIP_ADD( ref_costs[idx0+0], current[16] );\
- if( mbx+1 < width )\
- CLIP_ADD( ref_costs[idx0+1], current[17] );\
- }\
- if( mby+1 < height )\
- {\
- if( mbx < width )\
- CLIP_ADD( ref_costs[idx2+0], current[32] );\
- if( mbx+1 < width )\
- CLIP_ADD( ref_costs[idx2+1], current[33] );\
- }\
- }\
- }\
- }\
-}
-
PROPAGATE_LIST(ssse3)
PROPAGATE_LIST(avx)
-#undef CLIP_ADD
-#undef CLIP_ADD2
void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
{
--
1.7.10.4
More information about the x264-devel
mailing list