[x264-devel] [PATCH] MIPS MSA Macros
Kaustubh Raste
kaustubh.raste at imgtec.com
Fri Apr 17 14:48:33 CEST 2015
All of the macros defined here are required by subsequent source patches.
Signed-off-by: Kaustubh Raste <kaustubh.raste at imgtec.com>
---
common/mips/macros.h | 1230 ++++++++++++++++++++++++++++++++++++++++++++++++++
1 file changed, 1230 insertions(+)
create mode 100644 common/mips/macros.h
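For review context, a minimal, hypothetical sketch of how a subsequent patch
might use the load/store helpers; the function name and block size below are
illustrative only and are not part of this patch:

    #include "common/mips/macros.h"

    /* Average two 16x16 pixel blocks into p_dst, one 16-byte row at a time. */
    static void example_pixel_avg_16x16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                             uint8_t *p_src1, uint8_t *p_src2,
                                             intptr_t i_src_stride )
    {
        for( int i_row = 0; i_row < 16; i_row++ )
        {
            v16u8 src0 = LOAD_UB( p_src1 );
            v16u8 src1 = LOAD_UB( p_src2 );
            v16u8 avg = __msa_aver_u_b( src0, src1 ); /* rounded unsigned average */

            STORE_UB( avg, p_dst );
            p_src1 += i_src_stride;
            p_src2 += i_src_stride;
            p_dst += i_dst_stride;
        }
    }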
diff --git a/common/mips/macros.h b/common/mips/macros.h
new file mode 100644
index 0000000..9312d44
--- /dev/null
+++ b/common/mips/macros.h
@@ -0,0 +1,1230 @@
+/*****************************************************************************
+ * macros.h: mips msa macros
+ *****************************************************************************
+ * Copyright (C) 2009-2015 x264 project
+ *
+ * Authors: Parag Salasakar <parag.salasakar at imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_MACROS_H
+#define X264_MIPS_MACROS_H
+
+#include <stdint.h>
+#include <msa.h>
+
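+/* vector load/store: treat the address as a pointer to the given MSA vector type */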
+#define LOAD_UB( p_src ) *( ( v16u8 * ) ( p_src ) )
+#define LOAD_SB( p_src ) *( ( v16i8 * ) ( p_src ) )
+#define LOAD_UH( p_src ) *( ( v8u16 * ) ( p_src ) )
+#define LOAD_SH( p_src ) *( ( v8i16 * ) ( p_src ) )
+#define LOAD_SW( p_src ) *( ( v4i32 * ) ( p_src ) )
+
+#define STORE_UB( vec, p_dest ) *( ( v16u8 * ) ( p_dest ) ) = ( vec )
+#define STORE_SB( vec, p_dest ) *( ( v16i8 * ) ( p_dest ) ) = ( vec )
+#define STORE_UH( vec, p_dest ) *( ( v8u16 * ) ( p_dest ) ) = ( vec )
+#define STORE_SH( vec, p_dest ) *( ( v8i16 * ) ( p_dest ) ) = ( vec )
+#define STORE_SD( vec, p_dest ) *( ( v2i64 * ) ( p_dest ) ) = ( vec )
+
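+/* scalar load/store of 16/32/64-bit values at possibly unaligned addresses.
+ * On MIPS release 6 the ordinary lw/ld/sw/sd/sh instructions handle these
+ * accesses; earlier ISA revisions use the ulw/uld/usw/ush assembler macros. */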
+#if ( __mips_isa_rev >= 6 )
+ #define LOAD_WORD( p_src ) \
+ ( { \
+ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
+ uint32_t u_val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[u_val_m], %[p_src_m] \n\t" \
+ \
+ : [u_val_m] "=r" ( u_val_m ) \
+ : [p_src_m] "m" ( *p_src_m ) \
+ ); \
+ \
+ u_val_m; \
+ } )
+
+ #if ( __mips == 64 )
+ #define LOAD_DWORD( p_src ) \
+ ( { \
+ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
+ uint64_t u_val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ld %[u_val_m], %[p_src_m] \n\t" \
+ \
+ : [u_val_m] "=r" ( u_val_m ) \
+ : [p_src_m] "m" ( *p_src_m ) \
+ ); \
+ \
+ u_val_m; \
+ } )
+ #else // !( __mips == 64 )
+ #define LOAD_DWORD( p_src ) \
+ ( { \
+ uint8_t *p_src1_m = ( uint8_t * ) ( p_src ); \
+ uint8_t *p_src2_m = ( ( uint8_t * ) ( p_src ) ) + 4; \
+ uint32_t u_val0_m, u_val1_m; \
+ uint64_t u_genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[u_val0_m], %[p_src1_m] \n\t" \
+ \
+ : [u_val0_m] "=r" ( u_val0_m ) \
+ : [p_src1_m] "m" ( *p_src1_m ) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "lw %[u_val1_m], %[p_src2_m] \n\t" \
+ \
+ : [u_val1_m] "=r" ( u_val1_m ) \
+ : [p_src2_m] "m" ( *p_src2_m ) \
+ ); \
+ \
+ u_genval_m = ( uint64_t ) ( u_val1_m ); \
+ u_genval_m = ( uint64_t ) ( ( u_genval_m << 32 ) & \
+ 0xFFFFFFFF00000000 ); \
+ u_genval_m = ( uint64_t ) ( u_genval_m | ( uint64_t ) u_val0_m ); \
+ \
+ u_genval_m; \
+ } )
+ #endif // ( __mips == 64 )
+
+ #define STORE_WORD( p_dst_ma, val ) \
+ { \
+ uint8_t *p_dst_temp = ( uint8_t * ) ( p_dst_ma ); \
+ uint32_t u_val_m = ( val ); \
+ \
+ __asm__ __volatile__ ( \
+ "sw %[u_val_m], %[p_dst_temp] \n\t" \
+ \
+ : [p_dst_temp] "=m" ( *p_dst_temp ) \
+ : [u_val_m] "r" ( u_val_m ) \
+ ); \
+ }
+
+ #define STORE_DWORD( p_dst_ma, val ) \
+ { \
+ uint8_t *p_dst_temp = ( uint8_t * ) ( p_dst_ma ); \
+ uint64_t u_val_m = ( val ); \
+ \
+ __asm__ __volatile__ ( \
+ "sd %[u_val_m], %[p_dst_temp] \n\t" \
+ \
+ : [p_dst_temp] "=m" ( *p_dst_temp ) \
+ : [u_val_m] "r" ( u_val_m ) \
+ ); \
+ }
+
+ #define STORE_HWORD( p_dst_ma, val ) \
+ { \
+ uint8_t *p_dst_temp = ( uint8_t * ) ( p_dst_ma ); \
+ uint16_t u_val_m = ( val ); \
+ \
+ __asm__ __volatile__ ( \
+ "sh %[u_val_m], %[p_dst_temp] \n\t" \
+ \
+ : [p_dst_temp] "=m" ( *p_dst_temp ) \
+ : [u_val_m] "r" ( u_val_m ) \
+ ); \
+ }
+#else // !( __mips_isa_rev >= 6 )
+ #define LOAD_WORD( p_src ) \
+ ( { \
+ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
+ uint32_t u_val_m; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[u_val_m], %[p_src_m] \n\t" \
+ \
+ : [u_val_m] "=r" ( u_val_m ) \
+ : [p_src_m] "m" ( *p_src_m ) \
+ ); \
+ \
+ u_val_m; \
+ } )
+
+ #if ( __mips == 64 )
+ #define LOAD_DWORD( p_src ) \
+ ( { \
+ uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
+ uint64_t u_val_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "uld %[u_val_m], %[p_src_m] \n\t" \
+ \
+ : [u_val_m] "=r" ( u_val_m ) \
+ : [p_src_m] "m" ( *p_src_m ) \
+ ); \
+ \
+ u_val_m; \
+ } )
+ #else // !( __mips == 64 )
+ #define LOAD_DWORD( p_src ) \
+ ( { \
+ uint8_t *p_src1_m = ( uint8_t * ) ( p_src ); \
+ uint8_t *p_src2_m = ( ( uint8_t * ) ( p_src ) ) + 4; \
+ uint32_t u_val0_m, u_val1_m; \
+ uint64_t u_genval_m = 0; \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[u_val0_m], %[p_src1_m] \n\t" \
+ \
+ : [u_val0_m] "=r" ( u_val0_m ) \
+ : [p_src1_m] "m" ( *p_src1_m ) \
+ ); \
+ \
+ __asm__ __volatile__ ( \
+ "ulw %[u_val1_m], %[p_src2_m] \n\t" \
+ \
+ : [u_val1_m] "=r" ( u_val1_m ) \
+ : [p_src2_m] "m" ( *p_src2_m ) \
+ ); \
+ \
+ u_genval_m = ( uint64_t ) ( u_val1_m ); \
+ u_genval_m = ( uint64_t ) ( ( u_genval_m << 32 ) & \
+ 0xFFFFFFFF00000000 ); \
+ u_genval_m = ( uint64_t ) ( u_genval_m | ( uint64_t ) u_val0_m ); \
+ \
+ u_genval_m; \
+ } )
+ #endif // ( __mips == 64 )
+
+ #define STORE_WORD( p_dst_ma, val ) \
+ { \
+ uint8_t *p_dst_tmp = ( uint8_t * ) ( p_dst_ma ); \
+ uint32_t u_val_m = ( val ); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[u_val_m], %[p_dst_tmp] \n\t" \
+ \
+ : [p_dst_tmp] "=m" ( *p_dst_tmp ) \
+ : [u_val_m] "r" ( u_val_m ) \
+ ); \
+ }
+
+ #define STORE_DWORD( p_dst_ma, val ) \
+ { \
+ uint8_t *p_dst1_m = ( uint8_t * ) ( p_dst_ma ); \
+ uint8_t *p_dst2_m = ( ( uint8_t * ) ( p_dst_ma ) ) + 4; \
+ uint32_t u_val0_m, u_val1_m; \
+ \
+ u_val0_m = ( uint32_t ) ( ( val ) & 0x00000000FFFFFFFF ); \
+ u_val1_m = ( uint32_t ) ( ( ( val ) >> 32 ) & 0x00000000FFFFFFFF ); \
+ \
+ __asm__ __volatile__ ( \
+ "usw %[u_val0_m], %[p_dst1_m] \n\t" \
+ "usw %[u_val1_m], %[p_dst2_m] \n\t" \
+ \
+ : [p_dst1_m] "=m" ( *p_dst1_m ), [p_dst2_m] "=m" ( *p_dst2_m ) \
+ : [u_val0_m] "r" ( u_val0_m ), [u_val1_m] "r" ( u_val1_m ) \
+ ); \
+ }
+
+ #define STORE_HWORD( p_dst_ma, val ) \
+ { \
+ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma ); \
+ uint16_t u_val_m = ( val ); \
+ \
+ __asm__ __volatile__ ( \
+ "ush %[u_val_m], %[p_dst_m] \n\t" \
+ \
+ : [p_dst_m] "=m" ( *p_dst_m ) \
+ : [u_val_m] "r" ( u_val_m ) \
+ ); \
+ }
+#endif // ( __mips_isa_rev >= 6 )
+
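+/* load or store several rows located at consecutive stride offsets */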
+#define LOAD_4WORDS_WITH_STRIDE( p_src, src_stride, \
+ src0, src1, src2, src3 ) \
+{ \
+ src0 = LOAD_WORD( p_src + 0 * src_stride ); \
+ src1 = LOAD_WORD( p_src + 1 * src_stride ); \
+ src2 = LOAD_WORD( p_src + 2 * src_stride ); \
+ src3 = LOAD_WORD( p_src + 3 * src_stride ); \
+}
+
+#define LOAD_2VECS_UB( p_src, i_stride, \
+ val0, val1 ) \
+{ \
+ val0 = LOAD_UB( p_src + 0 * i_stride ); \
+ val1 = LOAD_UB( p_src + 1 * i_stride ); \
+}
+
+#define LOAD_3VECS_UB( p_src, i_stride, \
+ val0, val1, val2 ) \
+{ \
+ val0 = LOAD_UB( p_src + 0 * i_stride ); \
+ val1 = LOAD_UB( p_src + 1 * i_stride ); \
+ val2 = LOAD_UB( p_src + 2 * i_stride ); \
+}
+
+#define LOAD_4VECS_UB( p_src, i_stride, \
+ val0, val1, val2, val3 ) \
+{ \
+ val0 = LOAD_UB( p_src + 0 * i_stride ); \
+ val1 = LOAD_UB( p_src + 1 * i_stride ); \
+ val2 = LOAD_UB( p_src + 2 * i_stride ); \
+ val3 = LOAD_UB( p_src + 3 * i_stride ); \
+}
+
+#define LOAD_4VECS_SB( p_src, i_stride, \
+ val0, val1, val2, val3 ) \
+{ \
+ val0 = LOAD_SB( p_src + 0 * i_stride ); \
+ val1 = LOAD_SB( p_src + 1 * i_stride ); \
+ val2 = LOAD_SB( p_src + 2 * i_stride ); \
+ val3 = LOAD_SB( p_src + 3 * i_stride ); \
+}
+
+#define LOAD_5VECS_UB( p_src, i_stride, \
+ out0, out1, out2, out3, out4 ) \
+{ \
+ LOAD_4VECS_UB( ( p_src ), ( i_stride ), \
+ ( out0 ), ( out1 ), ( out2 ), ( out3 ) ); \
+ out4 = LOAD_UB( p_src + 4 * i_stride ); \
+}
+
+#define LOAD_5VECS_SB( p_src, i_stride, \
+ out0, out1, out2, out3, out4 ) \
+{ \
+ LOAD_4VECS_SB( ( p_src ), ( i_stride ), \
+ ( out0 ), ( out1 ), ( out2 ), ( out3 ) ); \
+ out4 = LOAD_SB( p_src + 4 * i_stride ); \
+}
+
+#define LOAD_8VECS_UB( p_src, i_stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ LOAD_4VECS_UB( ( p_src ), ( i_stride ), \
+ ( out0 ), ( out1 ), ( out2 ), ( out3 ) ); \
+ LOAD_4VECS_UB( ( p_src + 4 * i_stride ), ( i_stride ), \
+ ( out4 ), ( out5 ), ( out6 ), ( out7 ) ); \
+}
+
+#define LOAD_8VECS_SB( psrc, stride, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ LOAD_4VECS_SB( psrc, stride, \
+ out0, out1, out2, out3 ); \
+ LOAD_4VECS_SB( ( psrc + 4 * stride ), stride, \
+ out4, out5, out6, out7 ); \
+}
+
+#define LOAD_2VECS_SH( p_src, i_stride, \
+ val0, val1 ) \
+{ \
+ val0 = LOAD_SH( ( p_src ) + 0 * ( i_stride ) ); \
+ val1 = LOAD_SH( ( p_src ) + 1 * ( i_stride ) ); \
+}
+
+#define LOAD_4VECS_SH( p_src, i_stride, \
+ val0, val1, val2, val3 ) \
+{ \
+ LOAD_2VECS_SH( ( p_src ), ( i_stride ), val0, val1 ); \
+ LOAD_2VECS_SH( ( p_src + 2 * i_stride ), ( i_stride ), val2, val3 ); \
+}
+
+#define LOAD_8VECS_SH( p_src, i_stride, \
+ val0, val1, val2, val3, \
+ val4, val5, val6, val7 ) \
+{ \
+ LOAD_4VECS_SH( ( p_src ), ( i_stride ), \
+ val0, val1, val2, val3 ); \
+ LOAD_4VECS_SH( ( p_src + 4 * i_stride ), ( i_stride ), \
+ val4, val5, val6, val7 ); \
+}
+
+#define STORE_4VECS_UB( dst_out, pitch, \
+ in0, in1, in2, in3 ) \
+{ \
+ STORE_UB( ( in0 ), ( dst_out ) ); \
+ STORE_UB( ( in1 ), ( ( dst_out ) + ( pitch ) ) ); \
+ STORE_UB( ( in2 ), ( ( dst_out ) + 2 * ( pitch ) ) ); \
+ STORE_UB( ( in3 ), ( ( dst_out ) + 3 * ( pitch ) ) ); \
+}
+
+#define STORE_4VECS_SB( dst_out, pitch, \
+ in0, in1, in2, in3 ) \
+{ \
+ STORE_SB( ( in0 ), ( dst_out ) ); \
+ STORE_SB( ( in1 ), ( ( dst_out ) + ( pitch ) ) ); \
+ STORE_SB( ( in2 ), ( ( dst_out ) + 2 * ( pitch ) ) ); \
+ STORE_SB( ( in3 ), ( ( dst_out ) + 3 * ( pitch ) ) ); \
+}
+
+#define STORE_8VECS_UB( dst_out, pitch_in, \
+ in0, in1, in2, in3, \
+ in4, in5, in6, in7 ) \
+{ \
+ STORE_4VECS_UB( dst_out, pitch_in, \
+ in0, in1, in2, in3 ); \
+ STORE_4VECS_UB( ( dst_out + 4 * ( pitch_in ) ), pitch_in, \
+ in4, in5, in6, in7 ); \
+}
+
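+/* clamp signed halfword elements to [min, max] or to the unsigned byte range [0, 255] */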
+#define CLIP_MIN_TO_MAX_H( in, min, max ) \
+( { \
+ v8i16 out_m; \
+ \
+ out_m = __msa_max_s_h( ( v8i16 ) ( min ), ( v8i16 ) ( in ) ); \
+ out_m = __msa_min_s_h( ( v8i16 ) ( max ), ( v8i16 ) out_m ); \
+ out_m; \
+} )
+
+#define CLIP_UNSIGNED_CHAR_H( in ) \
+( { \
+ v8i16 max_m = __msa_ldi_h( 255 ); \
+ v8i16 out_m; \
+ \
+ out_m = __msa_maxi_s_h( ( v8i16 ) ( in ), 0 ); \
+ out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \
+ out_m; \
+} )
+
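+/* horizontal reductions: sum all elements of a vector into a single scalar */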
+#define CALC_ADDITIVE_SUM( result ) \
+( { \
+ v2i64 result_m, result_dup_m; \
+ int32_t sum_m; \
+ \
+ result_m = __msa_hadd_s_d( ( v4i32 ) ( result ), ( v4i32 ) ( result ) ); \
+ result_dup_m = __msa_splati_d( result_m, 1 ); \
+ result_m = result_m + result_dup_m; \
+ sum_m = __msa_copy_s_w( ( v4i32 ) result_m, 0 ); \
+ sum_m; \
+} )
+
+#define CALC_ADDITIVE_SUM_H( sad ) \
+( { \
+ v4u32 sad_m; \
+ uint32_t sad_out_m; \
+ \
+ sad_m = __msa_hadd_u_w( ( v8u16 ) ( sad ), ( v8u16 ) ( sad ) ); \
+ sad_out_m = ( uint32_t ) CALC_ADDITIVE_SUM( sad_m ); \
+ sad_out_m; \
+} )
+
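+/* accumulate the sum of squared differences between two unsigned byte vectors;
+ * the AVG variant also accumulates the plain differences */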
+#define CALC_MSE_B( src, ref, var ) \
+{ \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ src_l0_m = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src, ( v16i8 ) ref ); \
+ src_l1_m = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) src, ( v16i8 ) ref ); \
+ \
+ res_l0_m = __msa_hsub_u_h( src_l0_m, src_l0_m ); \
+ res_l1_m = __msa_hsub_u_h( src_l1_m, src_l1_m ); \
+ \
+ ( var ) = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l0_m, res_l0_m ); \
+ ( var ) = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l1_m, res_l1_m ); \
+}
+
+#define CALC_MSE_AVG_B( src, ref, var, sub ) \
+{ \
+ v16u8 src_l0_m, src_l1_m; \
+ v8i16 res_l0_m, res_l1_m; \
+ \
+ src_l0_m = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src, ( v16i8 ) ref ); \
+ src_l1_m = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) src, ( v16i8 ) ref ); \
+ \
+ res_l0_m = __msa_hsub_u_h( src_l0_m, src_l0_m ); \
+ res_l1_m = __msa_hsub_u_h( src_l1_m, src_l1_m ); \
+ \
+ var = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l0_m, res_l0_m ); \
+ var = ( v4i32 ) __msa_dpadd_s_w( ( v4i32 ) var, res_l1_m, res_l1_m ); \
+ \
+ ( sub ) += res_l0_m + res_l1_m; \
+}
+
+#define VARIANCE_WxH( sse, diff, shift ) \
+( { \
+ uint32_t var_m; \
+ \
+ var_m = ( sse ) - ( ( ( uint32_t ) ( diff ) * ( diff ) ) >> ( shift ) ); \
+ \
+ var_m; \
+} )
+
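+/* fill a 128-bit vector from four 32-bit general-purpose values */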
+#define VEC_INSERT_4W_UB( src, src0, src1, src2, src3 ) \
+{ \
+ src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 0, ( src0 ) ); \
+ src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 1, ( src1 ) ); \
+ src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 2, ( src2 ) ); \
+ src = ( v16u8 ) __msa_insert_w( ( v4i32 ) ( src ), 3, ( src3 ) ); \
+}
+
+#define VEC_INSERT_4W_SB( src, src0, src1, src2, src3 ) \
+{ \
+ src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 0, ( src0 ) ); \
+ src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 1, ( src1 ) ); \
+ src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 2, ( src2 ) ); \
+ src = ( v16i8 ) __msa_insert_w( ( v4i32 ) ( src ), 3, ( src3 ) ); \
+}
+
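+/* byte transpose macros */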
+#define TRANSPOSE8x8_B_UB( in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ v16i8 zero_m = { 0 }; \
+ \
+ tmp0_m = __msa_ilvr_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in0 ) ); \
+ tmp1_m = __msa_ilvr_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in1 ) ); \
+ tmp2_m = __msa_ilvr_b( ( v16i8 ) ( in6 ), ( v16i8 ) ( in4 ) ); \
+ tmp3_m = __msa_ilvr_b( ( v16i8 ) ( in7 ), ( v16i8 ) ( in5 ) ); \
+ \
+ tmp4_m = __msa_ilvr_b( ( v16i8 ) tmp1_m, ( v16i8 ) tmp0_m ); \
+ tmp5_m = __msa_ilvl_b( ( v16i8 ) tmp1_m, ( v16i8 ) tmp0_m ); \
+ tmp6_m = __msa_ilvr_b( ( v16i8 ) tmp3_m, ( v16i8 ) tmp2_m ); \
+ tmp7_m = __msa_ilvl_b( ( v16i8 ) tmp3_m, ( v16i8 ) tmp2_m ); \
+ \
+ out0 = ( v16u8 ) __msa_ilvr_w( ( v4i32 ) tmp6_m, ( v4i32 ) tmp4_m ); \
+ out2 = ( v16u8 ) __msa_ilvl_w( ( v4i32 ) tmp6_m, ( v4i32 ) tmp4_m ); \
+ out4 = ( v16u8 ) __msa_ilvr_w( ( v4i32 ) tmp7_m, ( v4i32 ) tmp5_m ); \
+ out6 = ( v16u8 ) __msa_ilvl_w( ( v4i32 ) tmp7_m, ( v4i32 ) tmp5_m ); \
+ \
+ out1 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out0, 8 ); \
+ out3 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out2, 8 ); \
+ out5 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out4, 8 ); \
+ out7 = ( v16u8 ) __msa_sldi_b( zero_m, ( v16i8 ) out6, 8 ); \
+}
+
+/* transpose 16x8 matrix into 8x16 */
+#define TRANSPOSE16x8_B_UB( in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ in8, in9, in10, in11, \
+ in12, in13, in14, in15, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ out7 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in8, ( v2i64 ) ( in0 ) ); \
+ out6 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in9, ( v2i64 ) ( in1 ) ); \
+ out5 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in10, ( v2i64 ) ( in2 ) ); \
+ out4 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in11, ( v2i64 ) ( in3 ) ); \
+ out3 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in12, ( v2i64 ) ( in4 ) ); \
+ out2 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in13, ( v2i64 ) ( in5 ) ); \
+ out1 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in14, ( v2i64 ) ( in6 ) ); \
+ out0 = ( v16u8 ) __msa_ilvev_d( ( v2i64 ) in15, ( v2i64 ) ( in7 ) ); \
+ \
+ tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \
+ tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \
+ tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \
+ tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \
+ out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \
+ tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \
+ out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \
+ tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \
+ \
+ tmp2_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \
+ tmp3_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) out7, ( v8i16 ) out5 ); \
+ out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ \
+ tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \
+ tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 ); \
+ out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ \
+ tmp2_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \
+ tmp3_m = ( v16u8 ) __msa_ilvev_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \
+ out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ \
+    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \
+    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \
+ out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+ out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
+}
+
+/* halfword transpose macro */
+#define TRANSPOSE4x4_H( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ v8i16 s0_m, s1_m; \
+ \
+ s0_m = __msa_ilvr_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) ); \
+ s1_m = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) ); \
+ \
+ out0 = ( v8i16 ) __msa_ilvr_w( ( v4i32 ) s1_m, ( v4i32 ) s0_m ); \
+ out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \
+ out2 = ( v8i16 ) __msa_ilvl_w( ( v4i32 ) s1_m, ( v4i32 ) s0_m ); \
+ out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 ); \
+}
+
+#define TRANSPOSE4X8_H( in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
+ v8i16 zero_m = { 0 }; \
+ \
+ tmp0_n = __msa_ilvr_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) ); \
+ tmp1_n = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) ); \
+ tmp2_n = __msa_ilvr_h( ( v8i16 ) ( in5 ), ( v8i16 ) ( in4 ) ); \
+ tmp3_n = __msa_ilvr_h( ( v8i16 ) ( in7 ), ( v8i16 ) ( in6 ) ); \
+ \
+ ILV_W_LRLR_SH( ( tmp0_n ), ( tmp1_n ), ( tmp2_n ), ( tmp3_n ), \
+ tmp2_m, tmp0_m, tmp3_m, tmp1_m ); \
+ \
+ out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \
+ out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \
+ out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \
+ out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \
+ \
+ out4 = zero_m; \
+ out5 = zero_m; \
+ out6 = zero_m; \
+ out7 = zero_m; \
+}
+
+#define TRANSPOSE8X4_H( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ \
+ ILV_H_LRLR_SH( ( in0 ), ( in1 ), ( in2 ), ( in3 ), \
+ tmp2_m, tmp0_m, tmp3_m, tmp1_m ); \
+ \
+ ILV_W_LRLR_SH( tmp0_m, tmp1_m, tmp2_m, tmp3_m, \
+ out1, out0, out3, out2 ); \
+}
+
+#define TRANSPOSE8x8_H_SH( in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ v8i16 s0_m, s1_m; \
+ v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
+ v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
+ \
+ s0_m = __msa_ilvr_h( ( v8i16 ) ( in6 ), ( v8i16 ) ( in4 ) ); \
+ s1_m = __msa_ilvr_h( ( v8i16 ) ( in7 ), ( v8i16 ) ( in5 ) ); \
+ tmp0_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ tmp1_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ \
+ s0_m = __msa_ilvl_h( ( v8i16 ) ( in6 ), ( v8i16 ) ( in4 ) ); \
+ s1_m = __msa_ilvl_h( ( v8i16 ) ( in7 ), ( v8i16 ) ( in5 ) ); \
+ tmp2_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ tmp3_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ \
+ s0_m = __msa_ilvr_h( ( v8i16 ) ( in2 ), ( v8i16 ) ( in0 ) ); \
+ s1_m = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in1 ) ); \
+ tmp4_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ tmp5_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ \
+ s0_m = __msa_ilvl_h( ( v8i16 ) ( in2 ), ( v8i16 ) ( in0 ) ); \
+ s1_m = __msa_ilvl_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in1 ) ); \
+ tmp6_m = __msa_ilvr_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ tmp7_m = __msa_ilvl_h( ( v8i16 ) s1_m, ( v8i16 ) s0_m ); \
+ \
+ out0 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m ); \
+ out1 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m ); \
+ out2 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m ); \
+ out3 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m ); \
+ out4 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m ); \
+ out5 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m ); \
+ out6 = ( v8i16 ) __msa_pckev_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m ); \
+ out7 = ( v8i16 ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m ); \
+}
+
+/* word transpose macro */
+#define TRANSPOSE4x4_W( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ v4i32 s0_m, s1_m, s2_m, s3_m; \
+ \
+ s0_m = __msa_ilvr_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) ); \
+ s1_m = __msa_ilvl_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) ); \
+ s2_m = __msa_ilvr_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) ); \
+ s3_m = __msa_ilvl_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) ); \
+ \
+ out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \
+ out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \
+ out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \
+ out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \
+}
+
+/* interleave macros */
+/* no in-place support */
+#define ILV_B_LRLR_UB( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) ); \
+ out1 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) ); \
+ out2 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) ); \
+ out3 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) ); \
+}
+
+#define ILV_B_LRLR_UH( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v8u16 ) __msa_ilvl_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) ); \
+ out1 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in0 ) ); \
+ out2 = ( v8u16 ) __msa_ilvl_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) ); \
+ out3 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) ( in3 ), ( v16i8 ) ( in2 ) ); \
+}
+
+#define ILV_H_LRLR_SH( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = __msa_ilvl_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) ); \
+ out1 = __msa_ilvr_h( ( v8i16 ) ( in1 ), ( v8i16 ) ( in0 ) ); \
+ out2 = __msa_ilvl_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) ); \
+ out3 = __msa_ilvr_h( ( v8i16 ) ( in3 ), ( v8i16 ) ( in2 ) ); \
+}
+
+#define ILV_W_LRLR_SH( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v8i16 ) __msa_ilvl_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) ); \
+ out1 = ( v8i16 ) __msa_ilvr_w( ( v4i32 ) ( in1 ), ( v4i32 ) ( in0 ) ); \
+ out2 = ( v8i16 ) __msa_ilvl_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) ); \
+ out3 = ( v8i16 ) __msa_ilvr_w( ( v4i32 ) ( in3 ), ( v4i32 ) ( in2 ) ); \
+}
+
+#define ILVR_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ) \
+{ \
+ out0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r ); \
+ out1 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r ); \
+}
+
+#define ILVR_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ) \
+{ \
+ out0 = __msa_ilvr_b( ( v16i8 ) ( in0_l ), ( v16i8 ) ( in0_r ) ); \
+ out1 = __msa_ilvr_b( ( v16i8 ) ( in1_l ), ( v16i8 ) ( in1_r ) ); \
+}
+
+#define ILVR_B_4VECS_UB( in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3 ) \
+{ \
+ ILVR_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ); \
+ ILVR_B_2VECS_UB( in2_r, in3_r, in2_l, in3_l, \
+ out2, out3 ); \
+}
+
+#define ILVR_B_4VECS_SB( in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3 ) \
+{ \
+ ILVR_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ); \
+ ILVR_B_2VECS_SB( in2_r, in3_r, in2_l, in3_l, \
+ out2, out3 ); \
+}
+
+#define ILVR_B_2VECS_UH( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ) \
+{ \
+ out0 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r ); \
+ out1 = ( v8u16 ) __msa_ilvr_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r ); \
+}
+
+#define ILVR_B_2VECS_SH( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ) \
+{ \
+ out0 = ( v8i16 ) __msa_ilvr_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r ); \
+ out1 = ( v8i16 ) __msa_ilvr_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r ); \
+}
+
+#define ILVR_B_4VECS_UH( in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3 ) \
+{ \
+ ILVR_B_2VECS_UH( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ); \
+ ILVR_B_2VECS_UH( in2_r, in3_r, in2_l, in3_l, \
+ out2, out3 ); \
+}
+
+#define ILVR_B_8VECS_SH( in0_r, in1_r, in2_r, in3_r, \
+ in4_r, in5_r, in6_r, in7_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ in4_l, in5_l, in6_l, in7_l, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ ILVR_B_2VECS_SH( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ); \
+ ILVR_B_2VECS_SH( in2_r, in3_r, in2_l, in3_l, \
+ out2, out3 ); \
+ ILVR_B_2VECS_SH( in4_r, in5_r, in4_l, in5_l, \
+ out4, out5 ); \
+ ILVR_B_2VECS_SH( in6_r, in7_r, in6_l, in7_l, \
+ out6, out7 ); \
+}
+
+#define ILVL_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ) \
+{ \
+ out0 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r ); \
+ out1 = ( v16u8 ) __msa_ilvl_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r ); \
+}
+
+#define ILVL_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ) \
+{ \
+ out0 = __msa_ilvl_b( ( v16i8 ) ( in0_l ), ( v16i8 ) ( in0_r ) ); \
+ out1 = __msa_ilvl_b( ( v16i8 ) ( in1_l ), ( v16i8 ) ( in1_r ) ); \
+}
+
+#define ILVL_B_4VECS_UB( in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3 ) \
+{ \
+ ILVL_B_2VECS_UB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ); \
+ ILVL_B_2VECS_UB( in2_r, in3_r, in2_l, in3_l, \
+ out2, out3 ); \
+}
+
+#define ILVL_B_4VECS_SB( in0_r, in1_r, in2_r, in3_r, \
+ in0_l, in1_l, in2_l, in3_l, \
+ out0, out1, out2, out3 ) \
+{ \
+ ILVL_B_2VECS_SB( in0_r, in1_r, in0_l, in1_l, \
+ out0, out1 ); \
+ ILVL_B_2VECS_SB( in2_r, in3_r, in2_l, in3_l, \
+ out2, out3 ); \
+}
+
+/* dot product macros */
+#define DPADD_U_H_4VECS_UH( a0, m0, c0, a1, m1, c1, \
+ a2, m2, c2, a3, m3, c3, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = __msa_dpadd_u_h( ( v8u16 ) a0, ( v16u8 ) m0, ( v16u8 ) c0 ); \
+ out1 = __msa_dpadd_u_h( ( v8u16 ) a1, ( v16u8 ) m1, ( v16u8 ) c1 ); \
+ out2 = __msa_dpadd_u_h( ( v8u16 ) a2, ( v16u8 ) m2, ( v16u8 ) c2 ); \
+ out3 = __msa_dpadd_u_h( ( v8u16 ) a3, ( v16u8 ) m3, ( v16u8 ) c3 ); \
+}
+
+/* macros with builtins for 4 vectors */
+#define MAXI_S_H_4VECS_UH( vec0, vec1, vec2, vec3, \
+ max_value ) \
+{ \
+ vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec0 ), ( max_value ) ); \
+ vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec1 ), ( max_value ) ); \
+ vec2 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec2 ), ( max_value ) ); \
+ vec3 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) ( vec3 ), ( max_value ) ); \
+}
+
+#define SAT_U_H_4VECS_UH( vec0, vec1, vec2, vec3, \
+ sat_value ) \
+{ \
+ vec0 = __msa_sat_u_h( ( v8u16 ) ( vec0 ), ( sat_value ) ); \
+ vec1 = __msa_sat_u_h( ( v8u16 ) ( vec1 ), ( sat_value ) ); \
+ vec2 = __msa_sat_u_h( ( v8u16 ) ( vec2 ), ( sat_value ) ); \
+ vec3 = __msa_sat_u_h( ( v8u16 ) ( vec3 ), ( sat_value ) ); \
+}
+
+#define PCKEV_D_4VECS_UB( in0_l, in0_r, in1_l, in1_r, \
+ in2_l, in2_r, in3_l, in3_r, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in0_l, ( v2i64 ) in0_r ); \
+ out1 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in1_l, ( v2i64 ) in1_r ); \
+ out2 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in2_l, ( v2i64 ) in2_r ); \
+ out3 = ( v16u8 ) __msa_pckev_d( ( v2i64 ) in3_l, ( v2i64 ) in3_r ); \
+}
+
+#define PCKEV_B_4VECS_UB( in0_l, in1_l, in2_l, in3_l, \
+ in0_r, in1_r, in2_r, in3_r, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r ); \
+ out1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r ); \
+ out2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in2_l, ( v16i8 ) in2_r ); \
+ out3 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in3_l, ( v16i8 ) in3_r ); \
+}
+
+#define PCKOD_B_4VECS_UB( in0_l, in1_l, in2_l, in3_l, \
+ in0_r, in1_r, in2_r, in3_r, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0_l, ( v16i8 ) in0_r ); \
+ out1 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in1_l, ( v16i8 ) in1_r ); \
+ out2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in2_l, ( v16i8 ) in2_r ); \
+ out3 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in3_l, ( v16i8 ) in3_r ); \
+}
+
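+/* XOR each byte of the given vectors with an immediate value */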
+#define XORI_B_2VECS_SB( val0, val1, \
+ out0, out1, xor_val ) \
+{ \
+ out0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( val0 ), ( xor_val ) ); \
+ out1 = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( val1 ), ( xor_val ) ); \
+}
+
+#define XORI_B_3VECS_SB( val0, val1, val2, \
+ out0, out1, out2, \
+ xor_val ) \
+{ \
+ XORI_B_2VECS_SB( val0, val1, \
+ out0, out1, xor_val ); \
+ out2 = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( val2 ), ( xor_val ) ); \
+}
+
+#define XORI_B_4VECS_SB( val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ xor_val ) \
+{ \
+ XORI_B_2VECS_SB( val0, val1, \
+ out0, out1, xor_val ); \
+ XORI_B_2VECS_SB( val2, val3, \
+ out2, out3, xor_val ); \
+}
+
+#define XORI_B_5VECS_SB( val0, val1, val2, val3, val4, \
+ out0, out1, out2, out3, out4, \
+ xor_val ) \
+{ \
+ XORI_B_3VECS_SB( val0, val1, val2, \
+ out0, out1, out2, xor_val ); \
+ XORI_B_2VECS_SB( val3, val4, \
+ out3, out4, xor_val ); \
+}
+
+#define ADDS_S_H_4VECS_UH( in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in0 ), ( v8i16 ) ( in1 ) ); \
+ out1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in2 ), ( v8i16 ) ( in3 ) ); \
+ out2 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in4 ), ( v8i16 ) ( in5 ) ); \
+ out3 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) ( in6 ), ( v8i16 ) ( in7 ) ); \
+}
+
+#define SRA_4VECS( in0, in1, in2, in3, \
+ out0, out1, out2, out3, \
+ shift_right_vec ) \
+{ \
+ out0 = ( in0 ) >> ( shift_right_vec ); \
+ out1 = ( in1 ) >> ( shift_right_vec ); \
+ out2 = ( in2 ) >> ( shift_right_vec ); \
+ out3 = ( in3 ) >> ( shift_right_vec ); \
+}
+
+#define SRL_H_4VECS_UH( in0, in1, in2, in3, \
+ out0, out1, out2, out3, \
+ shift_right_vec ) \
+{ \
+ out0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift_right_vec ); \
+ out1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift_right_vec ); \
+ out2 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift_right_vec ); \
+ out3 = ( v8u16 ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift_right_vec ); \
+}
+
+#define SRARI_W_4VECS_SW( val0, val1, val2, val3, \
+ out0, out1, out2, out3, \
+ shift_right_val ) \
+{ \
+ out0 = __msa_srari_w( ( v4i32 ) ( val0 ), ( shift_right_val ) ); \
+ out1 = __msa_srari_w( ( v4i32 ) ( val1 ), ( shift_right_val ) ); \
+ out2 = __msa_srari_w( ( v4i32 ) ( val2 ), ( shift_right_val ) ); \
+ out3 = __msa_srari_w( ( v4i32 ) ( val3 ), ( shift_right_val ) ); \
+}
+
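+/* rounding right shift followed by saturation */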
+#define SRARI_SATURATE_UNSIGNED_H( input, right_shift_val, sat_val ) \
+( { \
+ v8u16 out_m; \
+ \
+ out_m = ( v8u16 ) __msa_srari_h( ( v8i16 ) input, right_shift_val ); \
+ out_m = __msa_sat_u_h( out_m, ( sat_val ) ); \
+ out_m; \
+} )
+
+#define SRARI_SATURATE_SIGNED_H( input, right_shift_val, sat_val ) \
+( { \
+ v8i16 out_m; \
+ \
+ out_m = __msa_srari_h( ( v8i16 ) ( input ), ( right_shift_val ) ); \
+ out_m = __msa_sat_s_h( out_m, ( sat_val ) ); \
+ out_m; \
+} )
+
+#define SRARI_SATURATE_SIGNED_W( input, right_shift_val, sat_val ) \
+( { \
+ v4i32 out_m; \
+ \
+ out_m = __msa_srari_w( ( v4i32 ) ( input ), ( right_shift_val ) ); \
+ out_m = __msa_sat_s_w( out_m, ( sat_val ) ); \
+ out_m; \
+} )
+
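+/* pack even bytes of the inputs (narrowing halfword results to bytes) and
+ * store them; the XORI128 variants flip the sign bit (xor with 128) first */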
+#define PCKEV_B_4_XORI128_STORE_8_BYTES_4( in1, in2, \
+ in3, in4, \
+ p_dst_ma, i_stride ) \
+{ \
+ uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma ); \
+ \
+ tmp0_m = __msa_pckev_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in1 ) ); \
+ tmp1_m = __msa_pckev_b( ( v16i8 ) ( in4 ), ( v16i8 ) ( in3 ) ); \
+ \
+ tmp0_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0_m, 128 ); \
+ tmp1_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp1_m, 128 ); \
+ \
+ u_out0_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 0 ); \
+ u_out1_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 1 ); \
+ u_out2_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 0 ); \
+ u_out3_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 1 ); \
+ \
+ STORE_DWORD( p_dst_m, u_out0_m ); \
+ p_dst_m += i_stride; \
+ STORE_DWORD( p_dst_m, u_out1_m ); \
+ p_dst_m += i_stride; \
+ STORE_DWORD( p_dst_m, u_out2_m ); \
+ p_dst_m += i_stride; \
+ STORE_DWORD( p_dst_m, u_out3_m ); \
+}
+
+/* Only for signed vecs */
+#define PCKEV_B_XORI128_STORE_VEC( in1, in2, p_dest ) \
+{ \
+ v16i8 tmp_m; \
+ \
+ tmp_m = __msa_pckev_b( ( v16i8 ) ( in1 ), ( v16i8 ) ( in2 ) ); \
+ tmp_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp_m, 128 ); \
+ STORE_SB( tmp_m, ( p_dest ) ); \
+}
+
+#define PCKEV_B_STORE_4_BYTES_4( in1, in2, in3, in4, \
+ p_dst_ma, i_stride ) \
+{ \
+ uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma ); \
+ \
+ tmp0_m = __msa_pckev_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in1 ) ); \
+ tmp1_m = __msa_pckev_b( ( v16i8 ) ( in4 ), ( v16i8 ) ( in3 ) ); \
+ \
+ u_out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 ); \
+ u_out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 ); \
+ u_out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 ); \
+ u_out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 ); \
+ \
+ STORE_WORD( p_dst_m, u_out0_m ); \
+ p_dst_m += i_stride; \
+ STORE_WORD( p_dst_m, u_out1_m ); \
+ p_dst_m += i_stride; \
+ STORE_WORD( p_dst_m, u_out2_m ); \
+ p_dst_m += i_stride; \
+ STORE_WORD( p_dst_m, u_out3_m ); \
+}
+
+#define PCKEV_B_STORE_8_BYTES_4( in1, in2, in3, in4, \
+ p_dst_ma, i_stride ) \
+{ \
+ uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
+ v16i8 tmp0_m, tmp1_m; \
+ uint8_t *p_dst_m = ( uint8_t * ) ( p_dst_ma ); \
+ \
+ tmp0_m = __msa_pckev_b( ( v16i8 ) ( in2 ), ( v16i8 ) ( in1 ) ); \
+ tmp1_m = __msa_pckev_b( ( v16i8 ) ( in4 ), ( v16i8 ) ( in3 ) ); \
+ \
+ u_out0_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 0 ); \
+ u_out1_m = __msa_copy_u_d( ( v2i64 ) tmp0_m, 1 ); \
+ u_out2_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 0 ); \
+ u_out3_m = __msa_copy_u_d( ( v2i64 ) tmp1_m, 1 ); \
+ \
+ STORE_DWORD( p_dst_m, u_out0_m ); \
+ p_dst_m += i_stride; \
+ STORE_DWORD( p_dst_m, u_out1_m ); \
+ p_dst_m += i_stride; \
+ STORE_DWORD( p_dst_m, u_out2_m ); \
+ p_dst_m += i_stride; \
+ STORE_DWORD( p_dst_m, u_out3_m ); \
+}
+
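+/* sign-extend the halfword elements of a vector into two word vectors */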
+#define UNPCK_SIGNED_H_TO_W( in, out1, out2 ) \
+{ \
+ v8i16 tmp_m; \
+ \
+ tmp_m = __msa_clti_s_h( ( v8i16 ) ( in ), 0 ); \
+ out1 = ( v4i32 ) __msa_ilvr_h( tmp_m, ( v8i16 ) ( in ) ); \
+ out2 = ( v4i32 ) __msa_ilvl_h( tmp_m, ( v8i16 ) ( in ) ); \
+}
+
+/* Generic for Vector types and GP operations */
+#define BUTTERFLY_4( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ out0 = ( in0 ) + ( in3 ); \
+ out1 = ( in1 ) + ( in2 ); \
+ \
+ out2 = ( in1 ) - ( in2 ); \
+ out3 = ( in0 ) - ( in3 ); \
+}
+
+/* Generic for Vector types and GP operations */
+#define BUTTERFLY_8( in0, in1, in2, in3, \
+ in4, in5, in6, in7, \
+ out0, out1, out2, out3, \
+ out4, out5, out6, out7 ) \
+{ \
+ out0 = ( in0 ) + ( in7 ); \
+ out1 = ( in1 ) + ( in6 ); \
+ out2 = ( in2 ) + ( in5 ); \
+ out3 = ( in3 ) + ( in4 ); \
+ \
+ out4 = ( in3 ) - ( in4 ); \
+ out5 = ( in2 ) - ( in5 ); \
+ out6 = ( in1 ) - ( in6 ); \
+ out7 = ( in0 ) - ( in7 ); \
+}
+
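+/* add a 4x4 block of halfword residuals to the predicted pixels,
+ * clip to [0, 255] and store */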
+#define ADD_RESIDUE_PRED_CLIP_AND_STORE_4( p_dest, dst_stride, \
+ in0, in1, in2, in3 ) \
+{ \
+ uint32_t u_src0_m, u_src1_m, u_src2_m, u_src3_m; \
+ uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
+ v8i16 inp0_m, inp1_m; \
+ v8i16 res0_m, res1_m; \
+ v16i8 dest0_m = { 0 }; \
+ v16i8 dest1_m = { 0 }; \
+ v16i8 zero_m = { 0 }; \
+ \
+ inp0_m = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) ( in1 ), ( v2i64 ) ( in0 ) ); \
+ inp1_m = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) ( in3 ), ( v2i64 ) ( in2 ) ); \
+ \
+ LOAD_4WORDS_WITH_STRIDE( p_dest, dst_stride, \
+ u_src0_m, u_src1_m, u_src2_m, u_src3_m ); \
+ dest0_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest0_m, 0, u_src0_m ); \
+ dest0_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest0_m, 1, u_src1_m ); \
+ dest1_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest1_m, 0, u_src2_m ); \
+ dest1_m = ( v16i8 ) __msa_insert_w( ( v4i32 ) dest1_m, 1, u_src3_m ); \
+ \
+ res0_m = ( v8i16 ) __msa_ilvr_b( zero_m, dest0_m ); \
+ res1_m = ( v8i16 ) __msa_ilvr_b( zero_m, dest1_m ); \
+ \
+ res0_m += inp0_m; \
+ res1_m += inp1_m; \
+ \
+ res0_m = CLIP_UNSIGNED_CHAR_H( res0_m ); \
+ res1_m = CLIP_UNSIGNED_CHAR_H( res1_m ); \
+ \
+ dest0_m = __msa_pckev_b( ( v16i8 ) res0_m, ( v16i8 ) res0_m ); \
+ dest1_m = __msa_pckev_b( ( v16i8 ) res1_m, ( v16i8 ) res1_m ); \
+ \
+ u_out0_m = __msa_copy_u_w( ( v4i32 ) dest0_m, 0 ); \
+ u_out1_m = __msa_copy_u_w( ( v4i32 ) dest0_m, 1 ); \
+ u_out2_m = __msa_copy_u_w( ( v4i32 ) dest1_m, 0 ); \
+ u_out3_m = __msa_copy_u_w( ( v4i32 ) dest1_m, 1 ); \
+ \
+ STORE_WORD( p_dest, u_out0_m ); \
+ p_dest += dst_stride; \
+ STORE_WORD( p_dest, u_out1_m ); \
+ p_dest += dst_stride; \
+ STORE_WORD( p_dest, u_out2_m ); \
+ p_dest += dst_stride; \
+ STORE_WORD( p_dest, u_out3_m ); \
+}
+
+#define LOAD_4x4_1D_BLOCK_SH( src, in0, in1, in2, in3 ) \
+{ \
+ in0 = LOAD_SH( src ); \
+ in2 = LOAD_SH( src + 8 ); \
+ \
+ in1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) in0, ( v2i64 ) in0 ); \
+ in3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) in2, ( v2i64 ) in2 ); \
+}
+
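+/* 6-tap filter helpers: accumulate the three coefficient pairs via dot products */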
+#define FILT_6TAP_DPADD_S_H( vec0, vec1, vec2, \
+ filt0, filt1, filt2 ) \
+( { \
+ v8i16 tmp0_m, tmp1_m; \
+ \
+ tmp0_m = __msa_dotp_s_h( ( v16i8 ) ( vec0 ), ( v16i8 ) ( filt0 ) ); \
+ tmp0_m = __msa_dpadd_s_h( tmp0_m, ( v16i8 ) vec1, ( v16i8 ) filt1 ); \
+ tmp1_m = __msa_dotp_s_h( ( v16i8 ) ( vec2 ), ( v16i8 ) ( filt2 ) ); \
+ tmp0_m = __msa_adds_s_h( tmp0_m, tmp1_m ); \
+ \
+ tmp0_m; \
+} )
+
+#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( vec0, vec1, vec2, \
+ vec3, vec4, vec5, \
+ const_minus5, const20 ) \
+( { \
+ v4i32 tmp1_m, tmp2_m; \
+ v8i16 tmp3_m, tmp4_m, tmp5_m, tmp6_m; \
+ \
+ tmp1_m = ( v4i32 ) __msa_ilvr_h( ( v8i16 ) vec5, ( v8i16 ) vec0 ); \
+ tmp2_m = ( v4i32 ) __msa_ilvl_h( ( v8i16 ) vec5, ( v8i16 ) vec0 ); \
+ \
+ tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m ); \
+ tmp2_m = __msa_hadd_s_w( ( v8i16 ) tmp2_m, ( v8i16 ) tmp2_m ); \
+ \
+ tmp3_m = __msa_ilvr_h( ( v8i16 ) ( vec1 ), ( v8i16 ) ( vec4 ) ); \
+ tmp4_m = __msa_ilvl_h( ( v8i16 ) ( vec1 ), ( v8i16 ) ( vec4 ) ); \
+ \
+ tmp1_m = __msa_dpadd_s_w( tmp1_m, tmp3_m, ( v8i16 ) const_minus5 ); \
+ tmp2_m = __msa_dpadd_s_w( tmp2_m, tmp4_m, ( v8i16 ) const_minus5 ); \
+ \
+ tmp5_m = __msa_ilvr_h( ( v8i16 ) ( vec2 ), ( v8i16 ) ( vec3 ) ); \
+ tmp6_m = __msa_ilvl_h( ( v8i16 ) ( vec2 ), ( v8i16 ) ( vec3 ) ); \
+ \
+ tmp1_m = __msa_dpadd_s_w( tmp1_m, tmp5_m, ( v8i16 ) const20 ); \
+ tmp2_m = __msa_dpadd_s_w( tmp2_m, tmp6_m, ( v8i16 ) const20 ); \
+ \
+ tmp1_m = SRARI_SATURATE_SIGNED_W( tmp1_m, 10, 7 ); \
+ tmp2_m = SRARI_SATURATE_SIGNED_W( tmp2_m, 10, 7 ); \
+ \
+ tmp3_m = __msa_pckev_h( ( v8i16 ) tmp2_m, ( v8i16 ) tmp1_m ); \
+ \
+ tmp3_m; \
+} )
+
+#define AVC_XOR_SHF_B_AND_APPLY_6TAP_2COEFF_HORIZ_FILT_SH( p_src, \
+ mask0, mask1, \
+ mask2, \
+ const_minus5, \
+ const20 ) \
+( { \
+ v8i16 vec0_m, horiz_out_m; \
+ v16i8 vec1_m, vec2_m, tmp_m; \
+ \
+ tmp_m = ( v16i8 ) __msa_xori_b( ( v16u8 ) ( p_src ), 128 ); \
+ \
+ vec0_m = ( v8i16 ) __msa_vshf_b( ( v16i8 ) ( mask0 ), tmp_m, tmp_m ); \
+ vec0_m = __msa_hadd_s_h( ( v16i8 ) vec0_m, ( v16i8 ) vec0_m ); \
+ \
+ vec1_m = __msa_vshf_b( ( v16i8 ) ( mask1 ), tmp_m, tmp_m ); \
+ vec0_m = __msa_dpadd_s_h( vec0_m, ( v16i8 ) ( const_minus5 ), vec1_m ); \
+ \
+ vec2_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), tmp_m, tmp_m ); \
+ horiz_out_m = __msa_dpadd_s_h( vec0_m, ( v16i8 ) ( const20 ), vec2_m ); \
+ \
+ horiz_out_m; \
+} )
+
+#endif /* X264_MIPS_MACROS_H */
--
2.3.2