[x264-devel] [PATCH] MIPS MSA Macros

Kaustubh Raste kaustubh.raste at imgtec.com
Fri Apr 17 14:48:33 CEST 2015


Macros defined in this file are required by the subsequent source patches of this series.

Signed-off-by: Kaustubh Raste <kaustubh.raste at imgtec.com>
---
 common/mips/macros.h | 1230 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 1230 insertions(+)
 create mode 100644 common/mips/macros.h

diff --git a/common/mips/macros.h b/common/mips/macros.h
new file mode 100644
index 0000000..9312d44
--- /dev/null
+++ b/common/mips/macros.h
@@ -0,0 +1,1230 @@
+/*****************************************************************************
+ * macros.h: mips msa macros
+ *****************************************************************************
+ * Copyright (C) 2009-2015 x264 project
+ *
+ * Authors: Parag Salasakar <parag.salasakar at imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_MACROS_H
+#define X264_MIPS_MACROS_H
+
+#include <stdint.h>
+#include <msa.h>
+
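+/* vector load/store wrappers: dereference p_src / p_dest as MSA vector
+   types and let the compiler emit the matching ld.df / st.df */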
+#define LOAD_UB( p_src ) *( ( v16u8 * ) ( p_src ) )
+#define LOAD_SB( p_src ) *( ( v16i8 * ) ( p_src ) )
+#define LOAD_UH( p_src ) *( ( v8u16 * ) ( p_src ) )
+#define LOAD_SH( p_src ) *( ( v8i16 * ) ( p_src ) )
+#define LOAD_SW( p_src ) *( ( v4i32 * ) ( p_src ) )
+
+#define STORE_UB( vec, p_dest ) *( ( v16u8 * ) ( p_dest ) ) = ( vec )
+#define STORE_SB( vec, p_dest ) *( ( v16i8 * ) ( p_dest ) ) = ( vec )
+#define STORE_UH( vec, p_dest ) *( ( v8u16 * ) ( p_dest ) ) = ( vec )
+#define STORE_SH( vec, p_dest ) *( ( v8i16 * ) ( p_dest ) ) = ( vec )
+#define STORE_SD( vec, p_dest ) *( ( v2i64 * ) ( p_dest ) ) = ( vec )
+
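+/* scalar unaligned load/store helpers: MIPS R6 removes the lwl/lwr based
+   ulw/uld/usw/ush sequences and instead handles misaligned addresses with
+   the plain lw/ld/sw/sd instructions, hence the two variants below */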
+#if ( __mips_isa_rev >= 6 )
+    #define LOAD_WORD( p_src )                       \
+    ( {                                              \
+        uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
+        uint32_t u_val_m;                            \
+                                                     \
+        __asm__ __volatile__ (                       \
+            "lw  %[u_val_m],  %[p_src_m]  \n\t"      \
+                                                     \
+            : [u_val_m] "=r" ( u_val_m )             \
+            : [p_src_m] "m" ( *p_src_m )             \
+         );                                          \
+                                                     \
+        u_val_m;                                     \
+    } )
+
+    #if ( __mips == 64 )
+        #define LOAD_DWORD( p_src )                      \
+        ( {                                              \
+            uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
+            uint64_t u_val_m = 0;                        \
+                                                         \
+            __asm__ __volatile__ (                       \
+                "ld  %[u_val_m],  %[p_src_m]  \n\t"      \
+                                                         \
+                : [u_val_m] "=r" ( u_val_m )             \
+                : [p_src_m] "m" ( *p_src_m )             \
+             );                                          \
+                                                         \
+            u_val_m;                                     \
+        } )
+    #else  // !( __mips == 64 )
+        #define LOAD_DWORD( p_src )                                            \
+        ( {                                                                    \
+            uint8_t *p_src1_m = ( uint8_t * ) ( p_src );                       \
+            uint8_t *p_src2_m = ( ( uint8_t * ) ( p_src ) ) + 4;               \
+            uint32_t u_val0_m, u_val1_m;                                       \
+            uint64_t u_genval_m = 0;                                           \
+                                                                               \
+            __asm__ __volatile__ (                                             \
+                "lw  %[u_val0_m],  %[p_src1_m]  \n\t"                          \
+                                                                               \
+                : [u_val0_m] "=r" ( u_val0_m )                                 \
+                : [p_src1_m] "m" ( *p_src1_m )                                 \
+             );                                                                \
+                                                                               \
+            __asm__ __volatile__ (                                             \
+                "lw  %[u_val1_m],  %[p_src2_m]  \n\t"                          \
+                                                                               \
+                : [u_val1_m] "=r" ( u_val1_m )                                 \
+                : [p_src2_m] "m" ( *p_src2_m )                                 \
+             );                                                                \
+                                                                               \
+            u_genval_m = ( uint64_t ) ( u_val1_m );                            \
+            u_genval_m = ( uint64_t ) ( ( u_genval_m << 32 ) &                 \
+                                        0xFFFFFFFF00000000 );                  \
+            u_genval_m = ( uint64_t ) ( u_genval_m | ( uint64_t ) u_val0_m );  \
+                                                                               \
+            u_genval_m;                                                        \
+        } )
+    #endif  // ( __mips == 64 )
+
+    #define STORE_WORD( p_dst_ma, val )                    \
+    {                                                      \
+        uint8_t *p_dst_temp = ( uint8_t * ) ( p_dst_ma );  \
+        uint32_t u_val_m = ( val );                        \
+                                                           \
+        __asm__ __volatile__ (                             \
+            "sw  %[u_val_m],  %[p_dst_temp]  \n\t"         \
+                                                           \
+            : [p_dst_temp] "=m" ( *p_dst_temp )            \
+            : [u_val_m] "r" ( u_val_m )                    \
+         );                                                \
+    }
+
+    #if ( __mips == 64 )
+        #define STORE_DWORD( p_dst_ma, val )                   \
+        {                                                      \
+            uint8_t *p_dst_temp = ( uint8_t * ) ( p_dst_ma );  \
+            uint64_t u_val_m = ( val );                        \
+                                                               \
+            __asm__ __volatile__ (                             \
+                "sd  %[u_val_m],  %[p_dst_temp]  \n\t"         \
+                                                               \
+                : [p_dst_temp] "=m" ( *p_dst_temp )            \
+                : [u_val_m] "r" ( u_val_m )                    \
+             );                                                \
+        }
+    #else  // !( __mips == 64 ): sd is MIPS64-only, split into two words
+        #define STORE_DWORD( p_dst_ma, val )                       \
+        {                                                          \
+            STORE_WORD( ( p_dst_ma ), ( uint32_t ) ( val ) );      \
+            STORE_WORD( ( ( uint8_t * ) ( p_dst_ma ) ) + 4,        \
+                        ( uint32_t ) ( ( val ) >> 32 ) );          \
+        }
+    #endif  // ( __mips == 64 )
+
+    #define STORE_HWORD( p_dst_ma, val )                   \
+    {                                                      \
+        uint8_t *p_dst_temp = ( uint8_t * ) ( p_dst_ma );  \
+        uint16_t u_val_m = ( val );                        \
+                                                           \
+        __asm__ __volatile__ (                             \
+            "sh  %[u_val_m],  %[p_dst_temp]  \n\t"         \
+                                                           \
+            : [p_dst_temp] "=m" ( *p_dst_temp )            \
+            : [u_val_m] "r" ( u_val_m )                    \
+         );                                                \
+    }
+#else  // !( __mips_isa_rev >= 6 )
+    #define LOAD_WORD( p_src )                       \
+    ( {                                              \
+        uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
+        uint32_t u_val_m;                            \
+                                                     \
+        __asm__ __volatile__ (                       \
+            "ulw  %[u_val_m],  %[p_src_m]  \n\t"     \
+                                                     \
+            : [u_val_m] "=r" ( u_val_m )             \
+            : [p_src_m] "m" ( *p_src_m )             \
+         );                                          \
+                                                     \
+        u_val_m;                                     \
+    } )
+
+    #if ( __mips == 64 )
+        #define LOAD_DWORD( p_src )                      \
+        ( {                                              \
+            uint8_t *p_src_m = ( uint8_t * ) ( p_src );  \
+            uint64_t u_val_m = 0;                        \
+                                                         \
+            __asm__ __volatile__ (                       \
+                "uld  %[u_val_m],  %[p_src_m]  \n\t"     \
+                                                         \
+                : [u_val_m] "=r" ( u_val_m )             \
+                : [p_src_m] "m" ( *p_src_m )             \
+             );                                          \
+                                                         \
+            u_val_m;                                     \
+        } )
+    #else  // !( __mips == 64 )
+        #define LOAD_DWORD( p_src )                                            \
+        ( {                                                                    \
+            uint8_t *p_src1_m = ( uint8_t * ) ( p_src );                       \
+            uint8_t *p_src2_m = ( ( uint8_t * ) ( p_src ) ) + 4;               \
+            uint32_t u_val0_m, u_val1_m;                                       \
+            uint64_t u_genval_m = 0;                                           \
+                                                                               \
+            __asm__ __volatile__ (                                             \
+                "ulw  %[u_val0_m],  %[p_src1_m]  \n\t"                         \
+                                                                               \
+                : [u_val0_m] "=r" ( u_val0_m )                                 \
+                : [p_src1_m] "m" ( *p_src1_m )                                 \
+             );                                                                \
+                                                                               \
+            __asm__ __volatile__ (                                             \
+                "ulw  %[u_val1_m],  %[p_src2_m]  \n\t"                         \
+                                                                               \
+                : [u_val1_m] "=r" ( u_val1_m )                                 \
+                : [p_src2_m] "m" ( *p_src2_m )                                 \
+             );                                                                \
+                                                                               \
+            u_genval_m = ( uint64_t ) ( u_val1_m );                            \
+            u_genval_m = ( uint64_t ) ( ( u_genval_m << 32 ) &                 \
+                                        0xFFFFFFFF00000000 );                  \
+            u_genval_m = ( uint64_t ) ( u_genval_m | ( uint64_t ) u_val0_m );  \
+                                                                               \
+            u_genval_m;                                                        \
+        } )
+    #endif  // ( __mips == 64 )
+
+    #define STORE_WORD( p_dst_ma, val )                   \
+    {                                                     \
+        uint8_t *p_dst_tmp = ( uint8_t * ) ( p_dst_ma );  \
+        uint32_t u_val_m = ( val );                       \
+                                                          \
+        __asm__ __volatile__ (                            \
+            "usw  %[u_val_m],  %[p_dst_tmp]  \n\t"        \
+                                                          \
+            : [p_dst_tmp] "=m" ( *p_dst_tmp )             \
+            : [u_val_m] "r" ( u_val_m )                   \
+         );                                               \
+    }
+
+    #define STORE_DWORD( p_dst_ma, val )                                     \
+    {                                                                        \
+        uint8_t *p_dst1_m = ( uint8_t * ) ( p_dst_ma );                      \
+        uint8_t *p_dst2_m = ( ( uint8_t * ) ( p_dst_ma ) ) + 4;              \
+        uint32_t u_val0_m, u_val1_m;                                         \
+                                                                             \
+        u_val0_m = ( uint32_t ) ( ( val ) & 0x00000000FFFFFFFF );            \
+        u_val1_m = ( uint32_t ) ( ( ( val ) >> 32 ) & 0x00000000FFFFFFFF );  \
+                                                                             \
+        __asm__ __volatile__ (                                               \
+            "usw  %[u_val0_m],  %[p_dst1_m]  \n\t"                           \
+            "usw  %[u_val1_m],  %[p_dst2_m]  \n\t"                           \
+                                                                             \
+            : [p_dst1_m] "=m" ( *p_dst1_m ), [p_dst2_m] "=m" ( *p_dst2_m )   \
+            : [u_val0_m] "r" ( u_val0_m ), [u_val1_m] "r" ( u_val1_m )       \
+         );                                                                  \
+    }
+
+    #define STORE_HWORD( p_dst_ma, val )                \
+    {                                                   \
+        uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma );  \
+        uint16_t u_val_m = ( val );                     \
+                                                        \
+        __asm__ __volatile__ (                          \
+            "ush  %[u_val_m],  %[p_dst_m]  \n\t"        \
+                                                        \
+            : [p_dst_m] "=m" ( *p_dst_m )               \
+            : [u_val_m] "r" ( u_val_m )                 \
+         );                                             \
+    }
+#endif  // ( __mips_isa_rev >= 6 )
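+
+/* illustrative use of the scalar helpers above (hypothetical buffer,
+ * not part of this patch):
+ *
+ *     uint32_t u_w  = LOAD_WORD( p_buf + 3 );   // misaligned 4-byte load
+ *     uint64_t u_dw = LOAD_DWORD( p_buf + 5 );  // misaligned 8-byte load
+ *     STORE_DWORD( p_buf + 5, u_dw );           // misaligned 8-byte store
+ */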
+
+#define LOAD_4WORDS_WITH_STRIDE( p_src, src_stride,        \
+                                 src0, src1, src2, src3 )  \
+{                                                          \
+    src0 = LOAD_WORD( p_src + 0 * src_stride );            \
+    src1 = LOAD_WORD( p_src + 1 * src_stride );            \
+    src2 = LOAD_WORD( p_src + 2 * src_stride );            \
+    src3 = LOAD_WORD( p_src + 3 * src_stride );            \
+}
+
+#define LOAD_2VECS_UB( p_src, i_stride,      \
+                       val0, val1 )          \
+{                                            \
+    val0 = LOAD_UB( p_src + 0 * i_stride );  \
+    val1 = LOAD_UB( p_src + 1 * i_stride );  \
+}
+
+#define LOAD_3VECS_UB( p_src, i_stride,      \
+                       val0, val1, val2 )    \
+{                                            \
+    val0 = LOAD_UB( p_src + 0 * i_stride );  \
+    val1 = LOAD_UB( p_src + 1 * i_stride );  \
+    val2 = LOAD_UB( p_src + 2 * i_stride );  \
+}
+
+#define LOAD_4VECS_UB( p_src, i_stride,          \
+                       val0, val1, val2, val3 )  \
+{                                                \
+    val0 = LOAD_UB( p_src + 0 * i_stride );      \
+    val1 = LOAD_UB( p_src + 1 * i_stride );      \
+    val2 = LOAD_UB( p_src + 2 * i_stride );      \
+    val3 = LOAD_UB( p_src + 3 * i_stride );      \
+}
+
+#define LOAD_4VECS_SB( p_src, i_stride,          \
+                       val0, val1, val2, val3 )  \
+{                                                \
+    val0 = LOAD_SB( p_src + 0 * i_stride );      \
+    val1 = LOAD_SB( p_src + 1 * i_stride );      \
+    val2 = LOAD_SB( p_src + 2 * i_stride );      \
+    val3 = LOAD_SB( p_src + 3 * i_stride );      \
+}
+
+#define LOAD_5VECS_UB( p_src, i_stride,                       \
+                       out0, out1, out2, out3, out4 )         \
+{                                                             \
+    LOAD_4VECS_UB( ( p_src ), ( i_stride ),                   \
+                   ( out0 ), ( out1 ), ( out2 ), ( out3 ) );  \
+    out4 = LOAD_UB( p_src + 4 * i_stride );                   \
+}
+
+#define LOAD_5VECS_SB( p_src, i_stride,                       \
+                       out0, out1, out2, out3, out4 )         \
+{                                                             \
+    LOAD_4VECS_SB( ( p_src ), ( i_stride ),                   \
+                   ( out0 ), ( out1 ), ( out2 ), ( out3 ) );  \
+    out4 = LOAD_SB( p_src + 4 * i_stride );                   \
+}
+
+#define LOAD_8VECS_UB( p_src, i_stride,                       \
+                       out0, out1, out2, out3,                \
+                       out4, out5, out6, out7 )               \
+{                                                             \
+    LOAD_4VECS_UB( ( p_src ), ( i_stride ),                   \
+                   ( out0 ), ( out1 ), ( out2 ), ( out3 ) );  \
+    LOAD_4VECS_UB( ( p_src + 4 * i_stride ), ( i_stride ),    \
+                   ( out4 ), ( out5 ), ( out6 ), ( out7 ) );  \
+}
+
+#define LOAD_8VECS_SB( p_src, i_stride,                       \
+                       out0, out1, out2, out3,                \
+                       out4, out5, out6, out7 )               \
+{                                                             \
+    LOAD_4VECS_SB( ( p_src ), ( i_stride ),                   \
+                   ( out0 ), ( out1 ), ( out2 ), ( out3 ) );  \
+    LOAD_4VECS_SB( ( p_src + 4 * i_stride ), ( i_stride ),    \
+                   ( out4 ), ( out5 ), ( out6 ), ( out7 ) );  \
+}
+
+#define LOAD_2VECS_SH( p_src, i_stride,              \
+                       val0, val1 )                  \
+{                                                    \
+    val0 = LOAD_SH( ( p_src ) + 0 * ( i_stride ) );  \
+    val1 = LOAD_SH( ( p_src ) + 1 * ( i_stride ) );  \
+}
+
+#define LOAD_4VECS_SH( p_src, i_stride,                                   \
+                       val0, val1, val2, val3 )                           \
+{                                                                         \
+    LOAD_2VECS_SH( ( p_src ), ( i_stride ), val0, val1 );                 \
+    LOAD_2VECS_SH( ( p_src + 2 * i_stride ), ( i_stride ), val2, val3 );  \
+}
+
+#define LOAD_8VECS_SH( p_src, i_stride,                     \
+                       val0, val1, val2, val3,              \
+                       val4, val5, val6, val7 )             \
+{                                                           \
+    LOAD_4VECS_SH( ( p_src ), ( i_stride ),                 \
+                   val0, val1, val2, val3 );                \
+    LOAD_4VECS_SH( ( p_src + 4 * i_stride ), ( i_stride ),  \
+                   val4, val5, val6, val7 );                \
+}
+
+#define STORE_4VECS_UB( dst_out, pitch,                    \
+                        in0, in1, in2, in3 )               \
+{                                                          \
+    STORE_UB( ( in0 ), ( dst_out ) );                      \
+    STORE_UB( ( in1 ), ( ( dst_out ) + ( pitch ) ) );      \
+    STORE_UB( ( in2 ), ( ( dst_out ) + 2 * ( pitch ) ) );  \
+    STORE_UB( ( in3 ), ( ( dst_out ) + 3 * ( pitch ) ) );  \
+}
+
+#define STORE_4VECS_SB( dst_out, pitch,                    \
+                        in0, in1, in2, in3 )               \
+{                                                          \
+    STORE_SB( ( in0 ), ( dst_out ) );                      \
+    STORE_SB( ( in1 ), ( ( dst_out ) + ( pitch ) ) );      \
+    STORE_SB( ( in2 ), ( ( dst_out ) + 2 * ( pitch ) ) );  \
+    STORE_SB( ( in3 ), ( ( dst_out ) + 3 * ( pitch ) ) );  \
+}
+
+#define STORE_8VECS_UB( dst_out, pitch_in,                     \
+                        in0, in1, in2, in3,                    \
+                        in4, in5, in6, in7 )                   \
+{                                                              \
+    STORE_4VECS_UB( dst_out, pitch_in,                         \
+                    in0, in1, in2, in3 );                      \
+    STORE_4VECS_UB( ( dst_out + 4 * ( pitch_in ) ), pitch_in,  \
+                    in4, in5, in6, in7 );                      \
+}
+
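+/* clip macros: CLIP_MIN_TO_MAX_H clamps each signed halfword to
+   [ min, max ]; CLIP_UNSIGNED_CHAR_H clamps to the pixel range [ 0, 255 ] */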
+#define CLIP_MIN_TO_MAX_H( in, min, max )                          \
+( {                                                                \
+    v8i16 out_m;                                                   \
+                                                                   \
+    out_m = __msa_max_s_h( ( v8i16 ) ( min ), ( v8i16 ) ( in ) );  \
+    out_m = __msa_min_s_h( ( v8i16 ) ( max ), ( v8i16 ) out_m );   \
+    out_m;                                                         \
+} )
+
+#define CLIP_UNSIGNED_CHAR_H( in )                              \
+( {                                                             \
+    v8i16 max_m = __msa_ldi_h( 255 );                           \
+    v8i16 out_m;                                                \
+                                                                \
+    out_m = __msa_maxi_s_h( ( v8i16 ) ( in ), 0 );              \
+    out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m );  \
+    out_m;                                                      \
+} )
+
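+/* horizontal reduction: widen-and-add adjacent words into doublewords,
+   fold the upper doubleword onto the lower one, then copy out the low
+   32 bits of the total */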
+#define CALC_ADDITIVE_SUM( result )                                           \
+( {                                                                           \
+    v2i64 result_m, result_dup_m;                                             \
+    int32_t sum_m;                                                            \
+                                                                              \
+    result_m = __msa_hadd_s_d( ( v4i32 ) ( result ), ( v4i32 ) ( result ) );  \
+    result_dup_m = __msa_splati_d( result_m, 1 );                             \
+    result_m = result_m + result_dup_m;                                       \
+    sum_m = __msa_copy_s_w( ( v4i32 ) result_m, 0 );                          \
+    sum_m;                                                                    \
+} )
+
+#define CALC_ADDITIVE_SUM_H( sad )                                   \
+( {                                                                  \
+    v4u32 sad_m;                                                     \
+    uint32_t sad_out_m;                                              \
+                                                                     \
+    sad_m = __msa_hadd_u_w( ( v8u16 ) ( sad ), ( v8u16 ) ( sad ) );  \
+    sad_out_m = ( uint32_t ) CALC_ADDITIVE_SUM( sad_m );             \
+    sad_out_m;                                                       \
+} )
+
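+/* interleaving src with ref pairs each source byte with its reference
+   byte, so the horizontal subtract yields per-pixel differences, which
+   the dot product then squares and accumulates */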
+#define CALC_MSE_B( src, ref, var )                                            \
+{                                                                              \
+    v16u8 src_l0_m, src_l1_m;                                                  \
+    v8i16 res_l0_m, res_l1_m;                                                  \
+                                                                               \
+    src_l0_m = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src, ( v16i8 ) ref );         \
+    src_l1_m = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) src, ( v16i8 ) ref );         \
+                                                                               \
+    res_l0_m = __msa_hsub_u_h( src_l0_m, src_l0_m );                           \
+    res_l1_m = __msa_hsub_u_h( src_l1_m, src_l1_m );                           \
+                                                                               \
+    ( var ) = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l0_m, res_l0_m );  \
+    ( var ) = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l1_m, res_l1_m );  \
+}
+
+#define CALC_MSE_AVG_B( src, ref, var, sub )                               \
+{                                                                          \
+    v16u8 src_l0_m, src_l1_m;                                              \
+    v8i16 res_l0_m, res_l1_m;                                              \
+                                                                           \
+    src_l0_m = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src, ( v16i8 ) ref );     \
+    src_l1_m = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) src, ( v16i8 ) ref );     \
+                                                                           \
+    res_l0_m = __msa_hsub_u_h( src_l0_m, src_l0_m );                       \
+    res_l1_m = __msa_hsub_u_h( src_l1_m, src_l1_m );                       \
+                                                                           \
+    var = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l0_m, res_l0_m );  \
+    var = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l1_m, res_l1_m );  \
+                                                                           \
+    ( sub ) += res_l0_m + res_l1_m;                                        \
+}
+
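+/* evaluates N * variance = sse - sum^2 / N; e.g. for a 16x16 block
+   ( N = 256 ) the caller passes shift = 8 */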
+#define VARIANCE_WxH( sse, diff, shift )                                      \
+( {                                                                           \
+    uint32_t var_m;                                                           \
+                                                                              \
+    var_m = ( sse ) - ( ( ( uint32_t ) ( diff ) * ( diff ) ) >> ( shift ) );  \
+                                                                              \
+    var_m;                                                                    \
+} )
+
+#define VEC_INSERT_4W_UB( src, src0, src1, src2, src3 )                \
+{                                                                      \
+    src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 0, ( src0 ) );  \
+    src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 1, ( src1 ) );  \
+    src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 2, ( src2 ) );  \
+    src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 3, ( src3 ) );  \
+}
+
+#define VEC_INSERT_4W_SB( src, src0, src1, src2, src3 )                \
+{                                                                      \
+    src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 0, ( src0 ) );  \
+    src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 1, ( src1 ) );  \
+    src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 2, ( src2 ) );  \
+    src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 3, ( src3 ) );  \
+}
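+
+/* typical 4xN block load combining the two macro families above
+ * (illustrative, hypothetical names):
+ *
+ *     uint32_t u_w0, u_w1, u_w2, u_w3;
+ *     v16u8 src = { 0 };
+ *
+ *     LOAD_4WORDS_WITH_STRIDE( p_src, i_stride, u_w0, u_w1, u_w2, u_w3 );
+ *     VEC_INSERT_4W_UB( src, u_w0, u_w1, u_w2, u_w3 );
+ */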
+
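+/* transpose 8x8 byte block: interleave row pairs at byte, then word
+   granularity; odd output rows are the upper halves of the even rows,
+   extracted with sldi_b */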
+#define TRANSPOSE8x8_B_UB( in0, in1, in2, in3,                            \
+                           in4, in5, in6, in7,                            \
+                           out0, out1, out2, out3,                        \
+                           out4, out5, out6, out7 )                       \
+{                                                                         \
+    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                 \
+    v16i8 zero_m = { 0 };                                                 \
+                                                                          \
+    tmp0_m = __msa_ilvr_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in0 ) );        \
+    tmp1_m = __msa_ilvr_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in1 ) );        \
+    tmp2_m = __msa_ilvr_b( ( v16i8 ) ( in6 ), ( v16i8 ) ( in4 ) );        \
+    tmp3_m = __msa_ilvr_b( ( v16i8 ) ( in7 ), ( v16i8 ) ( in5 ) );        \
+                                                                          \
+    tmp4_m = __msa_ilvr_b( ( v16i8 ) tmp1_m, ( v16i8 ) tmp0_m );          \
+    tmp5_m = __msa_ilvl_b( ( v16i8 ) tmp1_m, ( v16i8 ) tmp0_m );          \
+    tmp6_m = __msa_ilvr_b( ( v16i8 ) tmp3_m, ( v16i8 ) tmp2_m );          \
+    tmp7_m = __msa_ilvl_b( ( v16i8 ) tmp3_m, ( v16i8 ) tmp2_m );          \
+                                                                          \
+    out0 = ( v16u8 ) __msa_ilvr_w( ( v4i32 ) tmp6_m, ( v4i32 ) tmp4_m );  \
+    out2 = ( v16u8 ) __msa_ilvl_w( ( v4i32 ) tmp6_m, ( v4i32 ) tmp4_m );  \
+    out4 = ( v16u8 ) __msa_ilvr_w( ( v4i32 ) tmp7_m, ( v4i32 ) tmp5_m );  \
+    out6 = ( v16u8 ) __msa_ilvl_w( ( v4i32 ) tmp7_m, ( v4i32 ) tmp5_m );  \
+                                                                          \
+    out1 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out0, 8 );           \
+    out3 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out2, 8 );           \
+    out5 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out4, 8 );           \
+    out7 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out6, 8 );           \
+}
+
+/* transpose 16x8 matrix into 8x16 */
+#define TRANSPOSE16x8_B_UB( in0, in1, in2, in3,                              \
+                            in4, in5, in6, in7,                              \
+                            in8, in9, in10, in11,                            \
+                            in12, in13, in14, in15,                          \
+                            out0, out1, out2, out3,                          \
+                            out4, out5, out6, out7 )                         \
+{                                                                            \
+    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
+    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                    \
+                                                                             \
+    out7 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in8, ( v2i64 ) ( in0 ) );      \
+    out6 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in9, ( v2i64 ) ( in1 ) );      \
+    out5 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in10, ( v2i64 ) ( in2 ) );     \
+    out4 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in11, ( v2i64 ) ( in3 ) );     \
+    out3 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in12, ( v2i64 ) ( in4 ) );     \
+    out2 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in13, ( v2i64 ) ( in5 ) );     \
+    out1 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in14, ( v2i64 ) ( in6 ) );     \
+    out0 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in15, ( v2i64 ) ( in7 ) );     \
+                                                                             \
+    tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 );      \
+    tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 );      \
+    tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 );      \
+    tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 );      \
+    out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 );        \
+    tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 );      \
+    out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 );        \
+    tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 );      \
+                                                                             \
+    tmp2_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );  \
+    tmp3_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) out7, ( v8i16 ) out5 );      \
+    out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+    out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+                                                                             \
+    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );  \
+    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 );      \
+    out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+    out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+                                                                             \
+    tmp2_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );  \
+    tmp3_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );  \
+    out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+    out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+                                                                             \
+    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );  \
+    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );  \
+    out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+    out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );    \
+}
+
+/* halfword transpose macros */
+#define TRANSPOSE4x4_H( in0, in1, in2, in3,                           \
+                        out0, out1, out2, out3 )                      \
+{                                                                     \
+    v8i16 s0_m, s1_m;                                                 \
+                                                                      \
+    s0_m = __msa_ilvr_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) );      \
+    s1_m = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) );      \
+                                                                      \
+    out0 = ( v8i16 ) __msa_ilvr_w( ( v4i32 ) s1_m, ( v4i32 ) s0_m );  \
+    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );  \
+    out2 = ( v8i16 ) __msa_ilvl_w( ( v4i32 ) s1_m, ( v4i32 ) s0_m );  \
+    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 );  \
+}
+
+#define TRANSPOSE4X8_H( in0, in1, in2, in3,                               \
+                        in4, in5, in6, in7,                               \
+                        out0, out1, out2, out3,                           \
+                        out4, out5, out6, out7 )                          \
+{                                                                         \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
+    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                 \
+    v8i16 zero_m = { 0 };                                                 \
+                                                                          \
+    tmp0_n = __msa_ilvr_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) );        \
+    tmp1_n = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) );        \
+    tmp2_n = __msa_ilvr_h( ( v8i16 ) ( in5 ), ( v8i16 ) ( in4 ) );        \
+    tmp3_n = __msa_ilvr_h( ( v8i16 ) ( in7 ), ( v8i16 ) ( in6 ) );        \
+                                                                          \
+    ILV_W_LRLR_SH( ( tmp0_n ), ( tmp1_n ), ( tmp2_n ), ( tmp3_n ),        \
+                   tmp2_m, tmp0_m, tmp3_m, tmp1_m );                      \
+                                                                          \
+    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );  \
+    out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );  \
+    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );  \
+    out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );  \
+                                                                          \
+    out4 = zero_m;                                                        \
+    out5 = zero_m;                                                        \
+    out6 = zero_m;                                                        \
+    out7 = zero_m;                                                        \
+}
+
+#define TRANSPOSE8X4_H( in0, in1, in2, in3,             \
+                        out0, out1, out2, out3 )        \
+{                                                       \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;               \
+                                                        \
+    ILV_H_LRLR_SH( ( in0 ), ( in1 ), ( in2 ), ( in3 ),  \
+                   tmp2_m, tmp0_m, tmp3_m, tmp1_m );    \
+                                                        \
+    ILV_W_LRLR_SH( tmp0_m, tmp1_m, tmp2_m, tmp3_m,      \
+                   out1, out0, out3, out2 );            \
+}
+
+#define TRANSPOSE8x8_H_SH( in0, in1, in2, in3,                             \
+                           in4, in5, in6, in7,                             \
+                           out0, out1, out2, out3,                         \
+                           out4, out5, out6, out7 )                        \
+{                                                                          \
+    v8i16 s0_m, s1_m;                                                      \
+    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
+    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
+                                                                           \
+    s0_m = __msa_ilvr_h( ( v8i16 ) ( in6 ), ( v8i16 ) ( in4 ) );           \
+    s1_m = __msa_ilvr_h( ( v8i16 ) ( in7 ), ( v8i16 ) ( in5 ) );           \
+    tmp0_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+    tmp1_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+                                                                           \
+    s0_m = __msa_ilvl_h( ( v8i16 ) ( in6 ), ( v8i16 ) ( in4 ) );           \
+    s1_m = __msa_ilvl_h( ( v8i16 ) ( in7 ), ( v8i16 ) ( in5 ) );           \
+    tmp2_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+    tmp3_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+                                                                           \
+    s0_m = __msa_ilvr_h( ( v8i16 ) ( in2 ), ( v8i16 ) ( in0 ) );           \
+    s1_m = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in1 ) );           \
+    tmp4_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+    tmp5_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+                                                                           \
+    s0_m = __msa_ilvl_h( ( v8i16 ) ( in2 ), ( v8i16 ) ( in0 ) );           \
+    s1_m = __msa_ilvl_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in1 ) );           \
+    tmp6_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+    tmp7_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m );               \
+                                                                           \
+    out0 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );  \
+    out1 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );  \
+    out2 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );  \
+    out3 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );  \
+    out4 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );  \
+    out5 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );  \
+    out6 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );  \
+    out7 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );  \
+}
+
+/* word transpose macro */
+#define TRANSPOSE4x4_W( in0, in1, in2, in3,                           \
+                        out0, out1, out2, out3 )                      \
+{                                                                     \
+    v4i32 s0_m, s1_m, s2_m, s3_m;                                     \
+                                                                      \
+    s0_m = __msa_ilvr_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) );      \
+    s1_m = __msa_ilvl_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) );      \
+    s2_m = __msa_ilvr_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) );      \
+    s3_m = __msa_ilvl_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) );      \
+                                                                      \
+    out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );  \
+    out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );  \
+    out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );  \
+    out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );  \
+}
+
+/* interleave macros */
+/* no in-place support */
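+/* the LRLR suffix: each input pair yields its left interleave followed
+   by its right interleave, ( out0, out1 ) from ( in0, in1 ) and
+   ( out2, out3 ) from ( in2, in3 ) */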
+#define ILV_B_LRLR_UB( in0, in1, in2, in3,                                  \
+                       out0, out1, out2, out3 )                             \
+{                                                                           \
+    out0 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) );  \
+    out1 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) );  \
+    out2 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) );  \
+    out3 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) );  \
+}
+
+#define ILV_B_LRLR_UH( in0, in1, in2, in3,                                  \
+                       out0, out1, out2, out3 )                             \
+{                                                                           \
+    out0 = ( v8u16 ) __msa_ilvl_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) );  \
+    out1 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) );  \
+    out2 = ( v8u16 ) __msa_ilvl_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) );  \
+    out3 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) );  \
+}
+
+#define ILV_H_LRLR_SH( in0, in1, in2, in3,                        \
+                       out0, out1, out2, out3 )                   \
+{                                                                 \
+    out0 = __msa_ilvl_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) );  \
+    out1 = __msa_ilvr_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) );  \
+    out2 = __msa_ilvl_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) );  \
+    out3 = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) );  \
+}
+
+#define ILV_W_LRLR_SH( in0, in1, in2, in3,                                  \
+                       out0, out1, out2, out3 )                             \
+{                                                                           \
+    out0 = ( v8i16 ) __msa_ilvl_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) );  \
+    out1 = ( v8i16 ) __msa_ilvr_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) );  \
+    out2 = ( v8i16 ) __msa_ilvl_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) );  \
+    out3 = ( v8i16 ) __msa_ilvr_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) );  \
+}
+
+#define ILVR_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l,                    \
+                         out0, out1 )                                   \
+{                                                                       \
+    out0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r );  \
+    out1 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r );  \
+}
+
+#define ILVR_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l,                  \
+                         out0, out1 )                                 \
+{                                                                     \
+    out0 = __msa_ilvr_b( ( v16i8 ) ( in0_l ), ( v16i8 ) ( in0_r ) );  \
+    out1 = __msa_ilvr_b( ( v16i8 ) ( in1_l ), ( v16i8 ) ( in1_r ) );  \
+}
+
+#define ILVR_B_4VECS_UB( in0_r, in1_r, in2_r, in3_r,  \
+                         in0_l, in1_l, in2_l, in3_l,  \
+                         out0, out1, out2, out3 )     \
+{                                                     \
+    ILVR_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l,      \
+                     out0, out1 );                    \
+    ILVR_B_2VECS_UB( in2_r, in3_r, in2_l, in3_l,      \
+                     out2, out3 );                    \
+}
+
+#define ILVR_B_4VECS_SB( in0_r, in1_r, in2_r, in3_r,  \
+                         in0_l, in1_l, in2_l, in3_l,  \
+                         out0, out1, out2, out3 )     \
+{                                                     \
+    ILVR_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l,      \
+                     out0, out1 );                    \
+    ILVR_B_2VECS_SB( in2_r, in3_r, in2_l, in3_l,      \
+                     out2, out3 );                    \
+}
+
+#define ILVR_B_2VECS_UH( in0_r, in1_r, in0_l, in1_l,                    \
+                         out0, out1 )                                   \
+{                                                                       \
+    out0 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r );  \
+    out1 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r );  \
+}
+
+#define ILVR_B_2VECS_SH( in0_r, in1_r, in0_l, in1_l,                    \
+                         out0, out1 )                                   \
+{                                                                       \
+    out0 = ( v8i16 ) __msa_ilvr_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r );  \
+    out1 = ( v8i16 ) __msa_ilvr_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r );  \
+}
+
+#define ILVR_B_4VECS_UH( in0_r, in1_r, in2_r, in3_r,  \
+                         in0_l, in1_l, in2_l, in3_l,  \
+                         out0, out1, out2, out3 )     \
+{                                                     \
+    ILVR_B_2VECS_UH( in0_r, in1_r, in0_l, in1_l,      \
+                     out0, out1 );                    \
+    ILVR_B_2VECS_UH( in2_r, in3_r, in2_l, in3_l,      \
+                     out2, out3 );                    \
+}
+
+#define ILVR_B_8VECS_SH( in0_r, in1_r, in2_r, in3_r,  \
+                         in4_r, in5_r, in6_r, in7_r,  \
+                         in0_l, in1_l, in2_l, in3_l,  \
+                         in4_l, in5_l, in6_l, in7_l,  \
+                         out0, out1, out2, out3,      \
+                         out4, out5, out6, out7 )     \
+{                                                     \
+    ILVR_B_2VECS_SH( in0_r, in1_r, in0_l, in1_l,      \
+                     out0, out1 );                    \
+    ILVR_B_2VECS_SH( in2_r, in3_r, in2_l, in3_l,      \
+                     out2, out3 );                    \
+    ILVR_B_2VECS_SH( in4_r, in5_r, in4_l, in5_l,      \
+                     out4, out5 );                    \
+    ILVR_B_2VECS_SH( in6_r, in7_r, in6_l, in7_l,      \
+                     out6, out7 );                    \
+}
+
+#define ILVL_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l,                    \
+                         out0, out1 )                                   \
+{                                                                       \
+    out0 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r );  \
+    out1 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r );  \
+}
+
+#define ILVL_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l,                  \
+                         out0, out1 )                                 \
+{                                                                     \
+    out0 = __msa_ilvl_b( ( v16i8 ) ( in0_l ), ( v16i8 ) ( in0_r ) );  \
+    out1 = __msa_ilvl_b( ( v16i8 ) ( in1_l ), ( v16i8 ) ( in1_r ) );  \
+}
+
+#define ILVL_B_4VECS_UB( in0_r, in1_r, in2_r, in3_r,  \
+                         in0_l, in1_l, in2_l, in3_l,  \
+                         out0, out1, out2, out3 )     \
+{                                                     \
+    ILVL_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l,      \
+                     out0, out1 );                    \
+    ILVL_B_2VECS_UB( in2_r, in3_r, in2_l, in3_l,      \
+                     out2, out3 );                    \
+}
+
+#define ILVL_B_4VECS_SB( in0_r, in1_r, in2_r, in3_r,  \
+                         in0_l, in1_l, in2_l, in3_l,  \
+                         out0, out1, out2, out3 )     \
+{                                                     \
+    ILVL_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l,      \
+                     out0, out1 );                    \
+    ILVL_B_2VECS_SB( in2_r, in3_r, in2_l, in3_l,      \
+                     out2, out3 );                    \
+}
+
+/* dot product macros */
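+/* each halfword of outN is aN plus the sum of two adjacent unsigned byte
+   products: out[ i ] = a[ i ] + m[ 2i ] * c[ 2i ] + m[ 2i + 1 ] * c[ 2i + 1 ] */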
+#define DPADD_U_H_4VECS_UH( a0, m0, c0, a1, m1, c1,                      \
+                            a2, m2, c2, a3, m3, c3,                      \
+                            out0, out1, out2, out3 )                     \
+{                                                                        \
+    out0 = __msa_dpadd_u_h( ( v8u16 ) a0, ( v16u8 ) m0, ( v16u8 ) c0 );  \
+    out1 = __msa_dpadd_u_h( ( v8u16 ) a1, ( v16u8 ) m1, ( v16u8 ) c1 );  \
+    out2 = __msa_dpadd_u_h( ( v8u16 ) a2, ( v16u8 ) m2, ( v16u8 ) c2 );  \
+    out3 = __msa_dpadd_u_h( ( v8u16 ) a3, ( v16u8 ) m3, ( v16u8 ) c3 );  \
+}
+
+/* builtin wrappers applied to 4 vectors at a time */
+#define MAXI_S_H_4VECS_UH( vec0, vec1, vec2, vec3,                         \
+                           max_value )                                     \
+{                                                                          \
+    vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec0 ), ( max_value ) );  \
+    vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec1 ), ( max_value ) );  \
+    vec2 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec2 ), ( max_value ) );  \
+    vec3 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec3 ), ( max_value ) );  \
+}
+
+#define SAT_U_H_4VECS_UH( vec0, vec1, vec2, vec3,               \
+                          sat_value )                           \
+{                                                               \
+    vec0 = __msa_sat_u_h( ( v8u16 ) ( vec0 ), ( sat_value ) );  \
+    vec1 = __msa_sat_u_h( ( v8u16 ) ( vec1 ), ( sat_value ) );  \
+    vec2 = __msa_sat_u_h( ( v8u16 ) ( vec2 ), ( sat_value ) );  \
+    vec3 = __msa_sat_u_h( ( v8u16 ) ( vec3 ), ( sat_value ) );  \
+}
+
+#define PCKEV_D_4VECS_UB( in0_l, in0_r, in1_l, in1_r,                    \
+                          in2_l, in2_r, in3_l, in3_r,                    \
+                          out0, out1, out2, out3 )                       \
+{                                                                        \
+    out0 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in0_l, ( v2i64 ) in0_r );  \
+    out1 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in1_l, ( v2i64 ) in1_r );  \
+    out2 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in2_l, ( v2i64 ) in2_r );  \
+    out3 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in3_l, ( v2i64 ) in3_r );  \
+}
+
+#define PCKEV_B_4VECS_UB( in0_l, in1_l, in2_l, in3_l,                    \
+                          in0_r, in1_r, in2_r, in3_r,                    \
+                          out0, out1, out2, out3 )                       \
+{                                                                        \
+    out0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r );  \
+    out1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r );  \
+    out2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in2_l, ( v16i8 ) in2_r );  \
+    out3 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in3_l, ( v16i8 ) in3_r );  \
+}
+
+#define PCKOD_B_4VECS_UB( in0_l, in1_l, in2_l, in3_l,                    \
+                          in0_r, in1_r, in2_r, in3_r,                    \
+                          out0, out1, out2, out3 )                       \
+{                                                                        \
+    out0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r );  \
+    out1 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r );  \
+    out2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in2_l, ( v16i8 ) in2_r );  \
+    out3 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in3_l, ( v16i8 ) in3_r );  \
+}
+
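+/* xori with 128 toggles each byte's sign bit, converting between unsigned
+   pixels [ 0, 255 ] and the signed [ -128, 127 ] domain used by the
+   filter arithmetic */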
+#define XORI_B_2VECS_SB( val0, val1,                                   \
+                         out0, out1, xor_val )                         \
+{                                                                      \
+    out0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( val0 ), ( xor_val ) );  \
+    out1 = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( val1 ), ( xor_val ) );  \
+}
+
+#define XORI_B_3VECS_SB( val0, val1, val2,                             \
+                         out0, out1, out2,                             \
+                         xor_val )                                     \
+{                                                                      \
+    XORI_B_2VECS_SB( val0, val1,                                       \
+                     out0, out1, xor_val );                            \
+    out2 = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( val2 ), ( xor_val ) );  \
+}
+
+#define XORI_B_4VECS_SB( val0, val1, val2, val3,  \
+                         out0, out1, out2, out3,  \
+                         xor_val )                \
+{                                                 \
+    XORI_B_2VECS_SB( val0, val1,                  \
+                     out0, out1, xor_val );       \
+    XORI_B_2VECS_SB( val2, val3,                  \
+                     out2, out3, xor_val );       \
+}
+
+#define XORI_B_5VECS_SB( val0, val1, val2, val3, val4,  \
+                         out0, out1, out2, out3, out4,  \
+                         xor_val )                      \
+{                                                       \
+    XORI_B_3VECS_SB( val0, val1, val2,                  \
+                     out0, out1, out2, xor_val );       \
+    XORI_B_2VECS_SB( val3, val4,                        \
+                     out3, out4, xor_val );             \
+}
+
+#define ADDS_S_H_4VECS_UH( in0, in1, in2, in3,                                \
+                           in4, in5, in6, in7,                                \
+                           out0, out1, out2, out3 )                           \
+{                                                                             \
+    out0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in0 ), ( v8i16 ) ( in1 ) );  \
+    out1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in2 ), ( v8i16 ) ( in3 ) );  \
+    out2 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in4 ), ( v8i16 ) ( in5 ) );  \
+    out3 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in6 ), ( v8i16 ) ( in7 ) );  \
+}
+
+#define SRA_4VECS( in0, in1, in2, in3,      \
+                   out0, out1, out2, out3,  \
+                   shift_right_vec )        \
+{                                           \
+    out0 = ( in0 ) >> ( shift_right_vec );  \
+    out1 = ( in1 ) >> ( shift_right_vec );  \
+    out2 = ( in2 ) >> ( shift_right_vec );  \
+    out3 = ( in3 ) >> ( shift_right_vec );  \
+}
+
+#define SRL_H_4VECS_UH( in0, in1, in2, in3,                                    \
+                        out0, out1, out2, out3,                                \
+                        shift_right_vec )                                      \
+{                                                                              \
+    out0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift_right_vec );  \
+    out1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift_right_vec );  \
+    out2 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift_right_vec );  \
+    out3 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift_right_vec );  \
+}
+
+#define SRARI_W_4VECS_SW( val0, val1, val2, val3,                     \
+                          out0, out1, out2, out3,                     \
+                          shift_right_val )                           \
+{                                                                     \
+    out0 = __msa_srari_w( ( v4i32 ) ( val0 ), ( shift_right_val ) );  \
+    out1 = __msa_srari_w( ( v4i32 ) ( val1 ), ( shift_right_val ) );  \
+    out2 = __msa_srari_w( ( v4i32 ) ( val2 ), ( shift_right_val ) );  \
+    out3 = __msa_srari_w( ( v4i32 ) ( val3 ), ( shift_right_val ) );  \
+}
+
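+/* rounding right shift plus saturation: srari adds the rounding bias
+   before shifting, sat clamps the result to ( sat_val + 1 ) bits */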
+#define SRARI_SATURATE_UNSIGNED_H( input, right_shift_val, sat_val )      \
+( {                                                                       \
+    v8u16 out_m;                                                          \
+                                                                          \
+    out_m = ( v8u16 ) __msa_srari_h( ( v8i16 ) input, right_shift_val );  \
+    out_m = __msa_sat_u_h( out_m, ( sat_val ) );                          \
+    out_m;                                                                \
+} )
+
+#define SRARI_SATURATE_SIGNED_H( input, right_shift_val, sat_val )      \
+( {                                                                     \
+    v8i16 out_m;                                                        \
+                                                                        \
+    out_m = __msa_srari_h( ( v8i16 ) ( input ), ( right_shift_val ) );  \
+    out_m = __msa_sat_s_h( out_m, ( sat_val ) );                        \
+    out_m;                                                              \
+} )
+
+#define SRARI_SATURATE_SIGNED_W( input, right_shift_val, sat_val )      \
+( {                                                                     \
+    v4i32 out_m;                                                        \
+                                                                        \
+    out_m = __msa_srari_w( ( v4i32 ) ( input ), ( right_shift_val ) );  \
+    out_m = __msa_sat_s_w( out_m, ( sat_val ) );                        \
+    out_m;                                                              \
+} )
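+
+/* In the SRARI_SATURATE_* macros, __msa_srari_* performs a rounding
+ * arithmetic right shift, ( x + ( 1 << ( n - 1 ) ) ) >> n, and the
+ * following __msa_sat_u_* or __msa_sat_s_* clamps each element to the
+ * unsigned or signed ( sat_val + 1 )-bit range.  Illustrative sketch:
+ * with right_shift_val = 5 and sat_val = 7, SRARI_SATURATE_UNSIGNED_H
+ * computes ( x + 16 ) >> 5 clipped to [0, 255], the usual normalization
+ * of a 6-tap interpolation sum back to pixel range.
+ */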
+
+#define PCKEV_B_4_XORI128_STORE_8_BYTES_4( in1, in2,                 \
+                                           in3, in4,                 \
+                                           p_dst_ma, i_stride )      \
+{                                                                    \
+    uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                 \
+    v16i8 tmp0_m, tmp1_m;                                            \
+    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma );                   \
+                                                                     \
+    tmp0_m = __msa_pckev_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in1 ) );  \
+    tmp1_m = __msa_pckev_b( ( v16i8 ) ( in4 ), ( v16i8 ) ( in3 ) );  \
+                                                                     \
+    tmp0_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0_m, 128 );        \
+    tmp1_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp1_m, 128 );        \
+                                                                     \
+    u_out0_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 0 );                \
+    u_out1_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 1 );                \
+    u_out2_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 0 );                \
+    u_out3_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 1 );                \
+                                                                     \
+    STORE_DWORD( p_dst_m, u_out0_m );                                \
+    p_dst_m += i_stride;                                             \
+    STORE_DWORD( p_dst_m, u_out1_m );                                \
+    p_dst_m += i_stride;                                             \
+    STORE_DWORD( p_dst_m, u_out2_m );                                \
+    p_dst_m += i_stride;                                             \
+    STORE_DWORD( p_dst_m, u_out3_m );                                \
+}
+
+/* Only for signed input vectors */
+#define PCKEV_B_XORI128_STORE_VEC( in1, in2, p_dest )               \
+{                                                                   \
+    v16i8 tmp_m;                                                    \
+                                                                    \
+    tmp_m = __msa_pckev_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in2 ) );  \
+    tmp_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp_m, 128 );         \
+    STORE_SB( tmp_m, ( p_dest ) );                                  \
+}
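+
+/* pckev_b keeps the even-indexed bytes, i.e. the low byte of every
+ * halfword, and the xori with 128 maps the signed-pixel domain ( used so
+ * that byte-wise dot products stay in range ) back to unsigned 0..255
+ * pixels before the store.
+ */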
+
+#define PCKEV_B_STORE_4_BYTES_4( in1, in2, in3, in4,                 \
+                                 p_dst_ma, i_stride )                \
+{                                                                    \
+    uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                 \
+    v16i8 tmp0_m, tmp1_m;                                            \
+    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma );                   \
+                                                                     \
+    tmp0_m = __msa_pckev_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in1 ) );  \
+    tmp1_m = __msa_pckev_b( ( v16i8 ) ( in4 ), ( v16i8 ) ( in3 ) );  \
+                                                                     \
+    u_out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 );                \
+    u_out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 );                \
+    u_out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 );                \
+    u_out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 );                \
+                                                                     \
+    STORE_WORD( p_dst_m, u_out0_m );                                 \
+    p_dst_m += i_stride;                                             \
+    STORE_WORD( p_dst_m, u_out1_m );                                 \
+    p_dst_m += i_stride;                                             \
+    STORE_WORD( p_dst_m, u_out2_m );                                 \
+    p_dst_m += i_stride;                                             \
+    STORE_WORD( p_dst_m, u_out3_m );                                 \
+}
+
+#define PCKEV_B_STORE_8_BYTES_4( in1, in2, in3, in4,                 \
+                                 p_dst_ma, i_stride )                \
+{                                                                    \
+    uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                 \
+    v16i8 tmp0_m, tmp1_m;                                            \
+    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma );                   \
+                                                                     \
+    tmp0_m = __msa_pckev_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in1 ) );  \
+    tmp1_m = __msa_pckev_b( ( v16i8 ) ( in4 ), ( v16i8 ) ( in3 ) );  \
+                                                                     \
+    u_out0_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 0 );                \
+    u_out1_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 1 );                \
+    u_out2_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 0 );                \
+    u_out3_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 1 );                \
+                                                                     \
+    STORE_DWORD( p_dst_m, u_out0_m );                                \
+    p_dst_m += i_stride;                                             \
+    STORE_DWORD( p_dst_m, u_out1_m );                                \
+    p_dst_m += i_stride;                                             \
+    STORE_DWORD( p_dst_m, u_out2_m );                                \
+    p_dst_m += i_stride;                                             \
+    STORE_DWORD( p_dst_m, u_out3_m );                                \
+}
+
+#define UNPCK_SIGNED_H_TO_W( in, out1, out2 )                  \
+{                                                              \
+    v8i16 tmp_m;                                               \
+                                                               \
+    tmp_m = __msa_clti_s_h( ( v8i16 ) ( in ), 0 );             \
+    out1 = ( v4i32 ) __msa_ilvr_h( tmp_m, ( v8i16 ) ( in ) );  \
+    out2 = ( v4i32 ) __msa_ilvl_h( tmp_m, ( v8i16 ) ( in ) );  \
+}
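+
+/* __msa_clti_s_h( in, 0 ) yields an all-ones mask for negative elements
+ * and zero otherwise; interleaving that mask above each halfword ( ilvr_h
+ * for the lower four elements, ilvl_h for the upper four ) produces
+ * sign-extended 32-bit values.
+ */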
+
+/* Generic: works for any vector type and for scalar ( GP ) operands */
+#define BUTTERFLY_4( in0, in1, in2, in3,       \
+                     out0, out1, out2, out3 )  \
+{                                              \
+    out0 = ( in0 ) + ( in3 );                  \
+    out1 = ( in1 ) + ( in2 );                  \
+                                               \
+    out2 = ( in1 ) - ( in2 );                  \
+    out3 = ( in0 ) - ( in3 );                  \
+}
+
+/* Generic: works for any vector type and for scalar ( GP ) operands */
+#define BUTTERFLY_8( in0, in1, in2, in3,       \
+                     in4, in5, in6, in7,       \
+                     out0, out1, out2, out3,   \
+                     out4, out5, out6, out7 )  \
+{                                              \
+    out0 = ( in0 ) + ( in7 );                  \
+    out1 = ( in1 ) + ( in6 );                  \
+    out2 = ( in2 ) + ( in5 );                  \
+    out3 = ( in3 ) + ( in4 );                  \
+                                               \
+    out4 = ( in3 ) - ( in4 );                  \
+    out5 = ( in2 ) - ( in5 );                  \
+    out6 = ( in1 ) - ( in6 );                  \
+    out7 = ( in0 ) - ( in7 );                  \
+}
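+
+/* The BUTTERFLY_* macros implement the add/sub butterfly stage shared by
+ * the DCT and Hadamard transforms: mirrored input pairs are summed into
+ * the first half of the outputs and subtracted into the second half.
+ */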
+
+#define ADD_RESIDUE_PRED_CLIP_AND_STORE_4( p_dest, dst_stride,                \
+                                           in0, in1, in2, in3 )               \
+{                                                                             \
+    uint32_t u_src0_m, u_src1_m, u_src2_m, u_src3_m;                          \
+    uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                          \
+    v8i16 inp0_m, inp1_m;                                                     \
+    v8i16 res0_m, res1_m;                                                     \
+    v16i8 dest0_m = { 0 };                                                    \
+    v16i8 dest1_m = { 0 };                                                    \
+    v16i8 zero_m = { 0 };                                                     \
+                                                                              \
+    inp0_m = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) ( in1 ), ( v2i64 ) ( in0 ) );  \
+    inp1_m = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) ( in3 ), ( v2i64 ) ( in2 ) );  \
+                                                                              \
+    LOAD_4WORDS_WITH_STRIDE( p_dest, dst_stride,                              \
+                             u_src0_m, u_src1_m, u_src2_m, u_src3_m );        \
+    dest0_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest0_m, 0, u_src0_m );     \
+    dest0_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest0_m, 1, u_src1_m );     \
+    dest1_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest1_m, 0, u_src2_m );     \
+    dest1_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest1_m, 1, u_src3_m );     \
+                                                                              \
+    res0_m = ( v8i16 ) __msa_ilvr_b( zero_m, dest0_m );                       \
+    res1_m = ( v8i16 ) __msa_ilvr_b( zero_m, dest1_m );                       \
+                                                                              \
+    res0_m += inp0_m;                                                         \
+    res1_m += inp1_m;                                                         \
+                                                                              \
+    res0_m = CLIP_UNSIGNED_CHAR_H( res0_m );                                  \
+    res1_m = CLIP_UNSIGNED_CHAR_H( res1_m );                                  \
+                                                                              \
+    dest0_m = __msa_pckev_b( ( v16i8 ) res0_m, ( v16i8 ) res0_m );            \
+    dest1_m = __msa_pckev_b( ( v16i8 ) res1_m, ( v16i8 ) res1_m );            \
+                                                                              \
+    u_out0_m = __msa_copy_u_w( ( v4i32 ) dest0_m, 0 );                        \
+    u_out1_m = __msa_copy_u_w( ( v4i32 ) dest0_m, 1 );                        \
+    u_out2_m = __msa_copy_u_w( ( v4i32 ) dest1_m, 0 );                        \
+    u_out3_m = __msa_copy_u_w( ( v4i32 ) dest1_m, 1 );                        \
+                                                                              \
+    STORE_WORD( p_dest, u_out0_m );                                           \
+    p_dest += dst_stride;                                                     \
+    STORE_WORD( p_dest, u_out1_m );                                           \
+    p_dest += dst_stride;                                                     \
+    STORE_WORD( p_dest, u_out2_m );                                           \
+    p_dest += dst_stride;                                                     \
+    STORE_WORD( p_dest, u_out3_m );                                           \
+}
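+
+/* Reconstructs a 4x4 block: the four 4-element residual rows ( halfwords
+ * in in0..in3 ) are paired into two vectors, the predicted pixels are
+ * loaded four bytes per row and widened to halfwords, and the clipped
+ * sums are packed back to bytes and stored one word per row.
+ */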
+
+#define LOAD_4x4_1D_BLOCK_SH( src, in0, in1, in2, in3 )            \
+{                                                                  \
+    in0 = LOAD_SH( src );                                          \
+    in2 = LOAD_SH( src + 8 );                                      \
+                                                                   \
+    in1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) in0, ( v2i64 ) in0 );  \
+    in3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) in2, ( v2i64 ) in2 );  \
+}
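+
+/* Loads a 4x4 block of contiguous halfword coefficients: in0 and in2
+ * receive rows 0-1 and 2-3 straight from memory, while ilvl_d duplicates
+ * the upper doubleword so that rows 1 and 3 land in the low halves of
+ * in1 and in3.
+ */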
+
+#define FILT_6TAP_DPADD_S_H( vec0, vec1, vec2,                                    \
+                             filt0, filt1, filt2 )                                \
+( {                                                                               \
+    v8i16 tmp0_m, tmp1_m;                                                         \
+                                                                                  \
+    tmp0_m = __msa_dotp_s_h( ( v16i8 ) ( vec0 ), ( v16i8 ) ( filt0 ) );           \
+    tmp0_m = __msa_dpadd_s_h( tmp0_m, ( v16i8 ) ( vec1 ), ( v16i8 ) ( filt1 ) );  \
+    tmp1_m = __msa_dotp_s_h( ( v16i8 ) ( vec2 ), ( v16i8 ) ( filt2 ) );           \
+    tmp0_m = __msa_adds_s_h( tmp0_m, tmp1_m );                                    \
+                                                                                  \
+    tmp0_m;                                                                       \
+} )
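+
+/* Accumulates three byte-pair dot products into a single v8i16 result;
+ * typically vec0/vec1/vec2 hold interleaved pixel pairs and
+ * filt0/filt1/filt2 the H.264 6-tap coefficients ( 1, -5, 20 )
+ * replicated as byte pairs.
+ */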
+
+#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( vec0, vec1, vec2,                      \
+                                         vec3, vec4, vec5,                      \
+                                         const_minus5, const20 )                \
+( {                                                                             \
+    v4i32 tmp1_m, tmp2_m;                                                       \
+    v8i16 tmp3_m, tmp4_m, tmp5_m, tmp6_m;                                       \
+                                                                                \
+    tmp1_m = ( v4i32 ) __msa_ilvr_h( ( v8i16 ) ( vec5 ), ( v8i16 ) ( vec0 ) );  \
+    tmp2_m = ( v4i32 ) __msa_ilvl_h( ( v8i16 ) ( vec5 ), ( v8i16 ) ( vec0 ) );  \
+                                                                                \
+    tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m );              \
+    tmp2_m = __msa_hadd_s_w( ( v8i16 ) tmp2_m, ( v8i16 ) tmp2_m );              \
+                                                                                \
+    tmp3_m = __msa_ilvr_h( ( v8i16 ) ( vec1 ), ( v8i16 ) ( vec4 ) );            \
+    tmp4_m = __msa_ilvl_h( ( v8i16 ) ( vec1 ), ( v8i16 ) ( vec4 ) );            \
+                                                                                \
+    tmp1_m = __msa_dpadd_s_w( tmp1_m, tmp3_m, ( v8i16 ) ( const_minus5 ) );     \
+    tmp2_m = __msa_dpadd_s_w( tmp2_m, tmp4_m, ( v8i16 ) ( const_minus5 ) );     \
+                                                                                \
+    tmp5_m = __msa_ilvr_h( ( v8i16 ) ( vec2 ), ( v8i16 ) ( vec3 ) );            \
+    tmp6_m = __msa_ilvl_h( ( v8i16 ) ( vec2 ), ( v8i16 ) ( vec3 ) );            \
+                                                                                \
+    tmp1_m = __msa_dpadd_s_w( tmp1_m, tmp5_m, ( v8i16 ) ( const20 ) );          \
+    tmp2_m = __msa_dpadd_s_w( tmp2_m, tmp6_m, ( v8i16 ) ( const20 ) );          \
+                                                                                \
+    tmp1_m = SRARI_SATURATE_SIGNED_W( tmp1_m, 10, 7 );                          \
+    tmp2_m = SRARI_SATURATE_SIGNED_W( tmp2_m, 10, 7 );                          \
+                                                                                \
+    tmp3_m = __msa_pckev_h( ( v8i16 ) tmp2_m, ( v8i16 ) tmp1_m );               \
+                                                                                \
+    tmp3_m;                                                                     \
+} )
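+
+/* Second-pass 6-tap filtering on halfword inputs: computes
+ * ( vec0 + vec5 ) - 5 * ( vec1 + vec4 ) + 20 * ( vec2 + vec3 ) in 32-bit
+ * precision ( with const_minus5 and const20 holding the replicated
+ * coefficients ), then rounds by 10 bits and saturates, matching the
+ * combined ( x + 512 ) >> 10 normalization of H.264 half-pel
+ * interpolation.
+ */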
+
+#define AVC_XOR_SHF_B_AND_APPLY_6TAP_2COEFF_HORIZ_FILT_SH( p_src,            \
+                                                           mask0, mask1,     \
+                                                           mask2,            \
+                                                           const_minus5,     \
+                                                           const20 )         \
+( {                                                                          \
+    v8i16 vec0_m, horiz_out_m;                                               \
+    v16i8 vec1_m, vec2_m, tmp_m;                                             \
+                                                                             \
+    tmp_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( p_src ), 128 );              \
+                                                                             \
+    vec0_m = ( v8i16 ) __msa_vshf_b( ( v16i8 ) ( mask0 ), tmp_m, tmp_m );    \
+    vec0_m = __msa_hadd_s_h( ( v16i8 ) vec0_m, ( v16i8 ) vec0_m );           \
+                                                                             \
+    vec1_m = __msa_vshf_b( ( v16i8 ) ( mask1 ), tmp_m, tmp_m );              \
+    vec0_m = __msa_dpadd_s_h( vec0_m, ( v16i8 ) ( const_minus5 ), vec1_m );  \
+                                                                             \
+    vec2_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), tmp_m, tmp_m );              \
+    horiz_out_m = __msa_dpadd_s_h( vec0_m, ( v16i8 ) ( const20 ), vec2_m );  \
+                                                                             \
+    horiz_out_m;                                                             \
+} )
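+
+/* One-pass horizontal 6-tap filter on a single source vector: the pixels
+ * are biased into the signed domain ( xori 128 ), the three vshf masks
+ * gather the pixel pairs of each output position ( typically ( p0, p5 ),
+ * ( p1, p4 ) and ( p2, p3 ) ), and hadd sums the outer pair ( weight 1 )
+ * while dpadd applies the -5 and 20 weights held in const_minus5 and
+ * const20.
+ */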
+
+#endif  /* X264_MIPS_MACROS_H */
-- 
2.3.2


