[x264-devel] [PATCH 1/2] x86: Share the mbtree_propagate_list macro with aarch64

Thu Sep 3 08:30:43 CEST 2015

This avoids duplicating the same C wrapper code for every architecture
that implements only the internal part of this function in assembly.
---
Moved the x86 inline asm versions of CLIP_ADD and CLIP_ADD2 into the
shared header.
---
 common/aarch64/mc-c.c |   84 +-------------------------------
 common/mc-int.h       |  130 +++++++++++++++++++++++++++++++++++++++++++++++++
 common/x86/mc-c.c     |  105 +--------------------------------------
 3 files changed, 133 insertions(+), 186 deletions(-)
 create mode 100644 common/mc-int.h

diff --git a/common/aarch64/mc-c.c b/common/aarch64/mc-c.c
index b94e3d3..8b98b94 100644
--- a/common/aarch64/mc-c.c
+++ b/common/aarch64/mc-c.c
@@ -26,6 +26,7 @@
 
 #include "common/common.h"
 #include "mc.h"
+#include "common/mc-int.h"
 
 void x264_prefetch_ref_aarch64( uint8_t *, intptr_t, int );
 void x264_prefetch_fenc_420_aarch64( uint8_t *, intptr_t, uint8_t *, intptr_t, int );
@@ -205,88 +206,7 @@ void x264_hpel_filter_neon( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                             int height, int16_t *buf );
 #endif // !HIGH_BIT_DEPTH
 
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
-#define CLIP_ADD2(s,x)\
-do\
-{\
-    CLIP_ADD((s)[0], (x)[0]);\
-    CLIP_ADD((s)[1], (x)[1]);\
-} while(0)
-
-void x264_mbtree_propagate_list_internal_neon( int16_t (*mvs)[2],
-                                               int16_t *propagate_amount,
-                                               uint16_t *lowres_costs,
-                                               int16_t *output,
-                                               int bipred_weight, int mb_y,
-                                               int len );
-
-static void x264_mbtree_propagate_list_neon( x264_t *h, uint16_t *ref_costs,
-                                             int16_t (*mvs)[2],
-                                             int16_t *propagate_amount,
-                                             uint16_t *lowres_costs,
-                                             int bipred_weight, int mb_y,
-                                             int len, int list )
-{
-    int16_t *current = h->scratch_buffer2;
-
-    x264_mbtree_propagate_list_internal_neon( mvs, propagate_amount,
-                                              lowres_costs, current,
-                                              bipred_weight, mb_y, len );
-
-    unsigned stride = h->mb.i_mb_stride;
-    unsigned width = h->mb.i_mb_width;
-    unsigned height = h->mb.i_mb_height;
-
-    for( unsigned i = 0; i < len; current += 32 )
-    {
-        int end = X264_MIN( i+8, len );
-        for( ; i < end; i++, current += 2 )
-        {
-            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )
-                continue;
-
-            unsigned mbx = current[0];
-            unsigned mby = current[1];
-            unsigned idx0 = mbx + mby * stride;
-            unsigned idx2 = idx0 + stride;
-
-            /* Shortcut for the simple/common case of zero MV */
-            if( !M32( mvs[i] ) )
-            {
-                CLIP_ADD( ref_costs[idx0], current[16] );
-                continue;
-            }
-
-            if( mbx < width-1 && mby < height-1 )
-            {
-                CLIP_ADD2( ref_costs+idx0, current+16 );
-                CLIP_ADD2( ref_costs+idx2, current+32 );
-            }
-            else
-            {
-                /* Note: this takes advantage of unsigned representation to
-                 * catch negative mbx/mby. */
-                if( mby < height )
-                {
-                    if( mbx < width )
-                        CLIP_ADD( ref_costs[idx0+0], current[16] );
-                    if( mbx+1 < width )
-                        CLIP_ADD( ref_costs[idx0+1], current[17] );
-                }
-                if( mby+1 < height )
-                {
-                    if( mbx < width )
-                        CLIP_ADD( ref_costs[idx2+0], current[32] );
-                    if( mbx+1 < width )
-                        CLIP_ADD( ref_costs[idx2+1], current[33] );
-                }
-            }
-        }
-    }
-}
-
-#undef CLIP_ADD
-#undef CLIP_ADD2
+PROPAGATE_LIST(neon)
 
 void x264_mc_init_aarch64( int cpu, x264_mc_functions_t *pf )
 {
diff --git a/common/mc-int.h b/common/mc-int.h
new file mode 100644
index 0000000..d1e424f
--- /dev/null
+++ b/common/mc-int.h
@@ -0,0 +1,130 @@
+/*****************************************************************************
+ * mc-int.h: motion compensation internal defines
+ *****************************************************************************
+ * Copyright (C) 2003-2015 x264 project
+ *
+ * Authors: Laurent Aimar <fenrir at via.ecp.fr>
+ *          Loren Merritt <lorenm at u.washington.edu>
+ *          Fiona Glaser <fiona at x264.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+
+#if HAVE_X86_INLINE_ASM /* SSE inline-asm variants of the clipped add */
+#define CLIP_ADD(s,x) /* s += x on one 16-bit value, saturated by paddsw */\
+do\
+{\
+    int temp;\
+    asm("movd       %0, %%xmm0     \n" /* xmm0 <- s (memory operand) */\
+        "movd       %2, %%xmm1     \n" /* xmm1 <- x (memory operand) */\
+        "paddsw %%xmm1, %%xmm0     \n" /* 16-bit lanes, signed saturation */\
+        "movd   %%xmm0, %1         \n" /* temp <- low dword of xmm0 */\
+        :"+m"(s), "=&r"(temp)\
+        :"m"(x)\
+    );\
+    s = temp; /* the result is written back to s from C, not by the asm */\
+} while(0)
+
+#define CLIP_ADD2(s,x) /* as CLIP_ADD, but on M32(s)/M32(x) in one paddsw */\
+do\
+{\
+    asm("movd       %0, %%xmm0     \n" /* xmm0 <- M32(s) */\
+        "movd       %1, %%xmm1     \n" /* xmm1 <- M32(x) */\
+        "paddsw %%xmm1, %%xmm0     \n" /* 16-bit lanes, signed saturation */\
+        "movd   %%xmm0, %0         \n" /* store straight back to M32(s) */\
+        :"+m"(M32(s))\
+        :"m"(M32(x))\
+    );\
+} while(0)
+#else /* !HAVE_X86_INLINE_ASM: plain C fallback */
+#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1) /* sum limited to 32767 via X264_MIN */
+#define CLIP_ADD2(s,x) /* CLIP_ADD applied to elements [0] and [1] */\
+do\
+{\
+    CLIP_ADD((s)[0], (x)[0]);\
+    CLIP_ADD((s)[1], (x)[1]);\
+} while(0)
+#endif /* HAVE_X86_INLINE_ASM */
+
+#define PROPAGATE_LIST(cpu) /* declare the internal kernel for <cpu> and define its C wrapper */\
+void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
+                                                uint16_t *lowres_costs, int16_t *output,\
+                                                int bipred_weight, int mb_y, int len );\
+\
+static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
+                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
+                                              int bipred_weight, int mb_y, int len, int list )\
+{\
+    int16_t *current = h->scratch_buffer2; /* the kernel writes its output here */\
+\
+    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
+                                               current, bipred_weight, mb_y, len );\
+\
+    unsigned stride = h->mb.i_mb_stride;\
+    unsigned width = h->mb.i_mb_width;\
+    unsigned height = h->mb.i_mb_height;\
+\
+    for( unsigned i = 0; i < len; current += 32 ) /* with the inner += 2: 48 int16 per full group */\
+    {\
+        int end = X264_MIN( i+8, len ); /* entries are consumed in groups of up to 8 */\
+        for( ; i < end; i++, current += 2 )\
+        {\
+            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
+                continue;\
+\
+            unsigned mbx = current[0]; /* int16_t converted to unsigned */\
+            unsigned mby = current[1];\
+            unsigned idx0 = mbx + mby * stride; /* ref_costs index of (mbx, mby) */\
+            unsigned idx2 = idx0 + stride; /* same column, next row */\
+\
+            /* Zero MV (common case): only the MB at idx0 receives an amount */\
+            if( !M32( mvs[i] ) )\
+            {\
+                CLIP_ADD( ref_costs[idx0], current[16] );\
+                continue;\
+            }\
+\
+            if( mbx < width-1 && mby < height-1 ) /* all four target MBs are in range */\
+            {\
+                CLIP_ADD2( ref_costs+idx0, current+16 );\
+                CLIP_ADD2( ref_costs+idx2, current+32 );\
+            }\
+            else\
+            {\
+                /* Edge case: mbx/mby are unsigned, so negative coordinates wrap\
+                 * to huge values and fail the width/height tests below. */\
+                if( mby < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx0+0], current[16] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx0+1], current[17] );\
+                }\
+                if( mby+1 < height )\
+                {\
+                    if( mbx < width )\
+                        CLIP_ADD( ref_costs[idx2+0], current[32] );\
+                    if( mbx+1 < width )\
+                        CLIP_ADD( ref_costs[idx2+1], current[33] );\
+                }\
+            }\
+        }\
+    }\
+}
diff --git a/common/x86/mc-c.c b/common/x86/mc-c.c
index d868706..54798d1 100644
--- a/common/x86/mc-c.c
+++ b/common/x86/mc-c.c
@@ -31,6 +31,7 @@
 
 #include "common/common.h"
 #include "mc.h"
+#include "common/mc-int.h"
 
 #define DECL_SUF( func, args )\
     void func##_mmx2 args;\
@@ -589,112 +590,8 @@ PLANE_INTERLEAVE(sse2)
 PLANE_INTERLEAVE(avx)
 #endif
 
-#if HAVE_X86_INLINE_ASM
-#define CLIP_ADD(s,x)\
-do\
-{\
-    int temp;\
-    asm("movd       %0, %%xmm0     \n"\
-        "movd       %2, %%xmm1     \n"\
-        "paddsw %%xmm1, %%xmm0     \n"\
-        "movd   %%xmm0, %1         \n"\
-        :"+m"(s), "=&r"(temp)\
-        :"m"(x)\
-    );\
-    s = temp;\
-} while(0)
-
-#define CLIP_ADD2(s,x)\
-do\
-{\
-    asm("movd       %0, %%xmm0     \n"\
-        "movd       %1, %%xmm1     \n"\
-        "paddsw %%xmm1, %%xmm0     \n"\
-        "movd   %%xmm0, %0         \n"\
-        :"+m"(M32(s))\
-        :"m"(M32(x))\
-    );\
-} while(0)
-#else
-#define CLIP_ADD(s,x) (s) = X264_MIN((s)+(x),(1<<15)-1)
-#define CLIP_ADD2(s,x)\
-do\
-{\
-    CLIP_ADD((s)[0], (x)[0]);\
-    CLIP_ADD((s)[1], (x)[1]);\
-} while(0)
-#endif
-
-#define PROPAGATE_LIST(cpu)\
-void x264_mbtree_propagate_list_internal_##cpu( int16_t (*mvs)[2], int16_t *propagate_amount,\
-                                                uint16_t *lowres_costs, int16_t *output,\
-                                                int bipred_weight, int mb_y, int len );\
-\
-static void x264_mbtree_propagate_list_##cpu( x264_t *h, uint16_t *ref_costs, int16_t (*mvs)[2],\
-                                              int16_t *propagate_amount, uint16_t *lowres_costs,\
-                                              int bipred_weight, int mb_y, int len, int list )\
-{\
-    int16_t *current = h->scratch_buffer2;\
-\
-    x264_mbtree_propagate_list_internal_##cpu( mvs, propagate_amount, lowres_costs,\
-                                               current, bipred_weight, mb_y, len );\
-\
-    unsigned stride = h->mb.i_mb_stride;\
-    unsigned width = h->mb.i_mb_width;\
-    unsigned height = h->mb.i_mb_height;\
-\
-    for( unsigned i = 0; i < len; current += 32 )\
-    {\
-        int end = X264_MIN( i+8, len );\
-        for( ; i < end; i++, current += 2 )\
-        {\
-            if( !(lowres_costs[i] & (1 << (list+LOWRES_COST_SHIFT))) )\
-                continue;\
-\
-            unsigned mbx = current[0];\
-            unsigned mby = current[1];\
-            unsigned idx0 = mbx + mby * stride;\
-            unsigned idx2 = idx0 + stride;\
-\
-            /* Shortcut for the simple/common case of zero MV */\
-            if( !M32( mvs[i] ) )\
-            {\
-                CLIP_ADD( ref_costs[idx0], current[16] );\
-                continue;\
-            }\
-\
-            if( mbx < width-1 && mby < height-1 )\
-            {\
-                CLIP_ADD2( ref_costs+idx0, current+16 );\
-                CLIP_ADD2( ref_costs+idx2, current+32 );\
-            }\
-            else\
-            {\
-                /* Note: this takes advantage of unsigned representation to\
-                 * catch negative mbx/mby. */\
-                if( mby < height )\
-                {\
-                    if( mbx < width )\
-                        CLIP_ADD( ref_costs[idx0+0], current[16] );\
-                    if( mbx+1 < width )\
-                        CLIP_ADD( ref_costs[idx0+1], current[17] );\
-                }\
-                if( mby+1 < height )\
-                {\
-                    if( mbx < width )\
-                        CLIP_ADD( ref_costs[idx2+0], current[32] );\
-                    if( mbx+1 < width )\
-                        CLIP_ADD( ref_costs[idx2+1], current[33] );\
-                }\
-            }\
-        }\
-    }\
-}
-
 PROPAGATE_LIST(ssse3)
 PROPAGATE_LIST(avx)
-#undef CLIP_ADD
-#undef CLIP_ADD2
 
 void x264_mc_init_mmx( int cpu, x264_mc_functions_t *pf )
 {
-- 
1.7.10.4