[x264-devel] [PATCH] MIPS dct module msa optimization
Rishikesh More
Rishikesh.More at imgtec.com
Wed Apr 22 10:42:04 CEST 2015
Hi,
Are there any review comments on the patch below?
Regards,
Rishikesh
-----Original Message-----
From: Rishikesh More
Sent: Friday, April 17, 2015 7:43 PM
To: x264-devel at videolan.org
Cc: Rishikesh More
Subject: [PATCH] MIPS dct module msa optimization
DCT function optimizations achieve roughly a 3x speedup
Added X264_CPU_MSA define for MIPS MSA
Added CPU detection function for the MIPS platform
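For reviewers, a minimal sketch of how these hooks get selected at runtime (illustrative only, not part of the patch; dct4x4, p_fenc and p_fdec are hypothetical buffers):

    /* Illustrative sketch: runtime selection of the MSA paths inside libx264,
     * assuming configure defined HAVE_MSA for a MIPS build. */
    uint32_t cpu = x264_cpu_detect();          /* reports X264_CPU_MSA on MSA-capable MIPS */
    x264_dct_function_t dctf;
    x264_dct_init( cpu, &dctf );               /* installs the x264_*_msa functions when cpu & X264_CPU_MSA */
    dctf.sub4x4_dct( dct4x4, p_fenc, p_fdec ); /* now dispatches to x264_sub4x4_dct_msa */

The usual way to verify and benchmark the new functions against the C reference is x264's checkasm tool.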
Signed-off-by: Rishikesh More <rishikesh.more at imgtec.com>
---
Makefile | 7 +
common/cpu.c | 13 +
common/dct.c | 33 +++
common/mips/dct-c.c | 762 ++++++++++++++++++++++++++++++++++++++++++++++++++++
common/mips/dct.h | 47 ++++
x264.h | 3 +
6 files changed, 865 insertions(+)
create mode 100644 common/mips/dct-c.c
create mode 100644 common/mips/dct.h
diff --git a/Makefile b/Makefile
index 9804e5f..8332595 100644
--- a/Makefile
+++ b/Makefile
@@ -143,6 +143,13 @@ OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
+# MSA optims
+ifeq ($(SYS_ARCH),MIPS)
+ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
+SRCS += common/mips/dct-c.c
+endif
+endif
+
ifneq ($(HAVE_GETOPT_LONG),1)
SRCCLI += extras/getopt.c
endif
diff --git a/common/cpu.c b/common/cpu.c
index e0d1377..2409d8a 100644
--- a/common/cpu.c
+++ b/common/cpu.c
@@ -419,6 +419,19 @@ uint32_t x264_cpu_detect( void )
return X264_CPU_ARMV8 | X264_CPU_NEON;
}
+#elif ARCH_MIPS
+
+uint32_t x264_cpu_detect( void )
+{
+ int flags = 0;
+
+#ifdef HAVE_MSA
+ flags |= X264_CPU_MSA;
+#endif
+
+ return flags;
+}
+
#else
uint32_t x264_cpu_detect( void )
diff --git a/common/dct.c b/common/dct.c
index 2816ce0..db370ef 100644
--- a/common/dct.c
+++ b/common/dct.c
@@ -38,6 +38,9 @@
#if ARCH_AARCH64
# include "aarch64/dct.h"
#endif
+#if ARCH_MIPS
+# include "mips/dct.h"
+#endif
/* the inverse of the scaling factors introduced by 8x8 fdct */
/* uint32 is for the asm implementation of trellis. the actual values fit in uint16. */
@@ -752,6 +755,27 @@ void x264_dct_init( int cpu, x264_dct_function_t *dctf )
#endif
}
#endif
+
+#if HAVE_MSA
+ if( cpu&X264_CPU_MSA )
+ {
+ dctf->sub4x4_dct = x264_sub4x4_dct_msa;
+ dctf->sub8x8_dct = x264_sub8x8_dct_msa;
+ dctf->sub16x16_dct = x264_sub16x16_dct_msa;
+ dctf->sub8x8_dct_dc = x264_sub8x8_dct_dc_msa;
+ dctf->sub8x16_dct_dc = x264_sub8x16_dct_dc_msa;
+ dctf->dct4x4dc = x264_dct4x4dc_msa;
+ dctf->idct4x4dc = x264_idct4x4dc_msa;
+ dctf->add4x4_idct = x264_add4x4_idct_msa;
+ dctf->add8x8_idct = x264_add8x8_idct_msa;
+ dctf->add8x8_idct_dc = x264_add8x8_idct_dc_msa;
+ dctf->add16x16_idct = x264_add16x16_idct_msa;
+ dctf->add16x16_idct_dc = x264_add16x16_idct_dc_msa;
+ dctf->add8x8_idct8 = x264_add8x8_idct8_msa;
+ dctf->add16x16_idct8 = x264_add16x16_idct8_msa;
+ }
+#endif
+
#endif // HIGH_BIT_DEPTH
}
@@ -1072,4 +1096,13 @@ void x264_zigzag_init( int cpu, x264_zigzag_function_t *pf_progressive, x264_zig
}
#endif // ARCH_AARCH64
#endif // !HIGH_BIT_DEPTH
+#if !HIGH_BIT_DEPTH
+#if HAVE_MSA
+ if( cpu&X264_CPU_MSA )
+ {
+ pf_progressive->scan_4x4 = x264_zigzag_scan_4x4_frame_msa;
+ }
+#endif
+#endif
+
}
diff --git a/common/mips/dct-c.c b/common/mips/dct-c.c
new file mode 100644
index 0000000..4b43014
--- /dev/null
+++ b/common/mips/dct-c.c
@@ -0,0 +1,762 @@
+/*****************************************************************************
+ * dct-c.c: mips msa transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2003-2015 x264 project
+ *
+ * Authors: Parag Salasakar <parag.salasakar at imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "macros.h"
+
+#if !HIGH_BIT_DEPTH
+#define AVC_INVERSE_TRANSFORM_W( in0, in1, in2, in3, \
+ out0, out1, out2, out3 ) \
+{ \
+ v4i32 _tmp0, _tmp1, _tmp2, _tmp3; \
+ \
+ _tmp0 = ( in0 ) + ( in2 ); \
+ _tmp1 = ( in0 ) - ( in2 ); \
+ _tmp2 = ( in1 ) >> 1; \
+ _tmp2 = _tmp2 - ( in3 ); \
+ _tmp3 = ( in3 ) >> 1; \
+ _tmp3 = ( in1 ) + _tmp3; \
+ \
+ BUTTERFLY_4( _tmp0, _tmp1, _tmp2, _tmp3, \
+ out0, out1, out2, out3 ); \
+}
+
+static void avc_dct4x4dc_msa( int16_t *p_src, int16_t *p_dst,
+ int32_t i_src_stride )
+{
+ v8i16 src0, src1, src2, src3;
+ v8i16 sign0, sign1, sign2, sign3;
+ v4i32 src0_r, src1_r, src2_r, src3_r;
+ v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
+ v4i32 ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r;
+ v8i16 ver_res0, ver_res1, ver_res2, ver_res3;
+ v4i32 tmp0, tmp1, tmp2, tmp3;
+ v2i64 dst0, dst1;
+
+ LOAD_4VECS_SH( p_src, i_src_stride, src0, src1, src2, src3 );
+
+ sign0 = __msa_clti_s_h( src0, 0 );
+ sign1 = __msa_clti_s_h( src1, 0 );
+ sign2 = __msa_clti_s_h( src2, 0 );
+ sign3 = __msa_clti_s_h( src3, 0 );
+
+ src0_r = ( v4i32 ) __msa_ilvr_h( sign0, src0 );
+ src1_r = ( v4i32 ) __msa_ilvr_h( sign1, src1 );
+ src2_r = ( v4i32 ) __msa_ilvr_h( sign2, src2 );
+ src3_r = ( v4i32 ) __msa_ilvr_h( sign3, src3 );
+
+ BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, tmp0, tmp3, tmp2, tmp1 );
+
+ BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
+ hor_res0, hor_res3, hor_res2, hor_res1 );
+
+ TRANSPOSE4x4_W( hor_res0, hor_res1, hor_res2, hor_res3,
+ hor_res0, hor_res1, hor_res2, hor_res3 );
+
+ BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
+ tmp0, tmp3, tmp2, tmp1 );
+
+ BUTTERFLY_4( tmp0, tmp1, tmp2, tmp3,
+ ver_res0_r, ver_res3_r, ver_res2_r, ver_res1_r );
+
+ SRARI_W_4VECS_SW( ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r,
+ ver_res0_r, ver_res1_r, ver_res2_r, ver_res3_r, 1 );
+
+ ver_res0 = __msa_pckev_h( ( v8i16 ) ver_res0_r, ( v8i16 ) ver_res0_r );
+ ver_res1 = __msa_pckev_h( ( v8i16 ) ver_res1_r, ( v8i16 ) ver_res1_r );
+ ver_res2 = __msa_pckev_h( ( v8i16 ) ver_res2_r, ( v8i16 ) ver_res2_r );
+ ver_res3 = __msa_pckev_h( ( v8i16 ) ver_res3_r, ( v8i16 ) ver_res3_r );
+
+ dst0 = __msa_pckod_d( ( v2i64 ) ver_res1, ( v2i64 ) ver_res0 );
+ dst1 = __msa_pckod_d( ( v2i64 ) ver_res3, ( v2i64 ) ver_res2 );
+
+ STORE_SD( dst0, p_dst );
+ STORE_SD( dst1, p_dst + 8 );
+}
+
+static void avc_sub4x4_dct_msa( uint8_t * __restrict p_src,
+ int32_t i_src_stride,
+ uint8_t * __restrict p_ref,
+ int32_t i_dst_stride,
+ int16_t *p_dst )
+{
+ uint32_t u_src0, u_src1, u_src2, u_src3;
+ uint32_t u_ref0, u_ref1, u_ref2, u_ref3;
+ v16i8 src = { 0 };
+ v16i8 ref = { 0 };
+ v8i16 diff0, diff1, diff2, diff3;
+ v8i16 temp0, temp1, temp2, temp3;
+ v2i64 dst0, dst1;
+
+ LOAD_4WORDS_WITH_STRIDE( p_src, i_src_stride,
+ u_src0, u_src1, u_src2, u_src3 );
+
+ LOAD_4WORDS_WITH_STRIDE( p_ref, i_dst_stride,
+ u_ref0, u_ref1, u_ref2, u_ref3 );
+
+ VEC_INSERT_4W_SB( src, u_src0, u_src1, u_src2, u_src3 );
+ VEC_INSERT_4W_SB( ref, u_ref0, u_ref1, u_ref2, u_ref3 );
+
+ diff0 = ( v8i16 ) __msa_ilvr_b( src, ref );
+ diff2 = ( v8i16 ) __msa_ilvl_b( src, ref );
+
+ diff0 = __msa_hsub_u_h( ( v16u8 ) diff0, ( v16u8 ) diff0 );
+ diff2 = __msa_hsub_u_h( ( v16u8 ) diff2, ( v16u8 ) diff2 );
+
+ diff1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff0, ( v2i64 ) diff0 );
+ diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) diff2, ( v2i64 ) diff2 );
+
+ BUTTERFLY_4( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
+
+ diff0 = temp0 + temp1;
+ diff1 = ( temp3 << 1 ) + temp2;
+ diff2 = temp0 - temp1;
+ diff3 = temp3 - ( temp2 << 1 );
+
+ TRANSPOSE4x4_H( diff0, diff1, diff2, diff3, temp0, temp1, temp2, temp3 );
+
+ BUTTERFLY_4( temp0, temp1, temp2, temp3, diff0, diff1, diff2, diff3 );
+
+ temp0 = diff0 + diff1;
+ temp1 = ( diff3 << 1 ) + diff2;
+ temp2 = diff0 - diff1;
+ temp3 = diff3 - ( diff2 << 1 );
+
+ dst0 = __msa_ilvr_d( ( v2i64 ) temp1, ( v2i64 ) temp0 );
+ dst1 = __msa_ilvr_d( ( v2i64 ) temp3, ( v2i64 ) temp2 );
+
+ STORE_SD( dst0, p_dst );
+ STORE_SD( dst1, p_dst + 8 );
+}
+
+static void avc_zigzag_scan_4x4_frame_msa( int16_t pi_dct[16],
+ int16_t pi_level[16] )
+{
+ v8i16 src0, src1;
+ v8i16 mask0 = { 0, 4, 1, 2, 5, 8, 12, 9 };
+ v8i16 mask1 = { 6, 3, 7, 10, 13, 14, 11, 15 };
+
+ src0 = LOAD_SH( pi_dct );
+ src1 = LOAD_SH( pi_dct + 8 );
+
+ mask0 = __msa_vshf_h( mask0, src1, src0 );
+ mask1 = __msa_vshf_h( mask1, src1, src0 );
+
+ STORE_SH( mask0, pi_level );
+ STORE_SH( mask1, pi_level + 8 );
+}
+
+static void avc_idct4x4_addblk_msa( uint8_t *p_dst, int16_t *p_src,
+ int32_t i_dst_stride )
+{
+ v8i16 src0, src1, src2, src3;
+ v8i16 sign0, sign1, sign2, sign3;
+ v4i32 src0_r, src1_r, src2_r, src3_r;
+ v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
+ v4i32 ver_res0, ver_res1, ver_res2, ver_res3;
+ v8i16 out0, out1, out2, out3;
+
+ LOAD_4x4_1D_BLOCK_SH( p_src, src0, src1, src2, src3 );
+
+ sign0 = __msa_clti_s_h( src0, 0 );
+ sign1 = __msa_clti_s_h( src1, 0 );
+ sign2 = __msa_clti_s_h( src2, 0 );
+ sign3 = __msa_clti_s_h( src3, 0 );
+
+ src0_r = ( v4i32 ) __msa_ilvr_h( sign0, src0 );
+ src1_r = ( v4i32 ) __msa_ilvr_h( sign1, src1 );
+ src2_r = ( v4i32 ) __msa_ilvr_h( sign2, src2 );
+ src3_r = ( v4i32 ) __msa_ilvr_h( sign3, src3 );
+
+ AVC_INVERSE_TRANSFORM_W( src0_r, src1_r, src2_r, src3_r,
+ hor_res0, hor_res1, hor_res2, hor_res3 );
+
+ TRANSPOSE4x4_W( hor_res0, hor_res1, hor_res2, hor_res3,
+ hor_res0, hor_res1, hor_res2, hor_res3 );
+
+ AVC_INVERSE_TRANSFORM_W( hor_res0, hor_res1, hor_res2, hor_res3,
+ ver_res0, ver_res1, ver_res2, ver_res3 );
+
+ SRARI_W_4VECS_SW( ver_res0, ver_res1, ver_res2, ver_res3,
+ ver_res0, ver_res1, ver_res2, ver_res3, 6 );
+
+ out0 = __msa_pckev_h( ( v8i16 ) ver_res0, ( v8i16 ) ver_res0 );
+ out1 = __msa_pckev_h( ( v8i16 ) ver_res1, ( v8i16 ) ver_res1 );
+ out2 = __msa_pckev_h( ( v8i16 ) ver_res2, ( v8i16 ) ver_res2 );
+ out3 = __msa_pckev_h( ( v8i16 ) ver_res3, ( v8i16 ) ver_res3 );
+
+ ADD_RESIDUE_PRED_CLIP_AND_STORE_4( p_dst, i_dst_stride,
+ out0, out1, out2, out3 );
+}
+
+static void avc_idct4x4_addblk_dc_msa( uint8_t *p_dst, int16_t *p_src,
+ int32_t i_dst_stride )
+{
+ int16_t i_dc;
+ uint32_t u_src0, u_src1, u_src2, u_src3;
+ uint32_t u_res0, u_res1, u_res2, u_res3;
+ v16u8 pred = { 0 };
+ v16i8 zeros = { 0 };
+ v8i16 input_dc;
+ v8i16 pred_r, pred_l;
+ v4i32 dst;
+
+ i_dc = ( p_src[0] + 32 ) >> 6;
+ input_dc = __msa_fill_h( i_dc );
+ p_src[0] = 0;
+
+ LOAD_4WORDS_WITH_STRIDE( p_dst, i_dst_stride,
+ u_src0, u_src1, u_src2, u_src3 );
+ VEC_INSERT_4W_UB( pred, u_src0, u_src1, u_src2, u_src3 );
+
+ pred_r = ( v8i16 ) __msa_ilvr_b( zeros, ( v16i8 ) pred );
+ pred_l = ( v8i16 ) __msa_ilvl_b( zeros, ( v16i8 ) pred );
+
+ pred_r += input_dc;
+ pred_l += input_dc;
+
+ pred_r = CLIP_UNSIGNED_CHAR_H( pred_r );
+ pred_l = CLIP_UNSIGNED_CHAR_H( pred_l );
+
+ dst = ( v4i32 ) __msa_pckev_b( ( v16i8 ) pred_l, ( v16i8 ) pred_r );
+
+ u_res0 = __msa_copy_u_w( dst, 0 );
+ u_res1 = __msa_copy_u_w( dst, 1 );
+ u_res2 = __msa_copy_u_w( dst, 2 );
+ u_res3 = __msa_copy_u_w( dst, 3 );
+
+ STORE_WORD( p_dst, u_res0 );
+ p_dst += i_dst_stride;
+ STORE_WORD( p_dst, u_res1 );
+ p_dst += i_dst_stride;
+ STORE_WORD( p_dst, u_res2 );
+ p_dst += i_dst_stride;
+ STORE_WORD( p_dst, u_res3 );
+}
+
+static void avc_idct8_addblk_msa( uint8_t *p_dst, int16_t *p_src,
+ int32_t i_dst_stride )
+{
+ uint64_t u_out0, u_out1, u_out2, u_out3;
+ v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 vec0, vec1, vec2, vec3;
+ v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v8i16 sign0, sign1, sign2, sign3, sign4, sign5, sign6, sign7;
+ v4i32 tmp0_r, tmp1_r, tmp2_r, tmp3_r, tmp4_r, tmp5_r, tmp6_r, tmp7_r;
+ v4i32 tmp0_l, tmp1_l, tmp2_l, tmp3_l, tmp4_l, tmp5_l, tmp6_l, tmp7_l;
+ v4i32 vec0_r, vec1_r, vec2_r, vec3_r, vec0_l, vec1_l, vec2_l, vec3_l;
+ v4i32 res0_r, res1_r, res2_r, res3_r, res4_r, res5_r, res6_r, res7_r;
+ v4i32 res0_l, res1_l, res2_l, res3_l, res4_l, res5_l, res6_l, res7_l;
+ v16i8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+ v16i8 zeros = { 0 };
+ v2i64 out0, out1, out2, out3;
+
+ p_src[0] += 32;
+
+ LOAD_8VECS_SH( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );
+
+ vec0 = src0 + src4;
+ vec1 = src0 - src4;
+ vec2 = src2 >> 1;
+ vec2 = vec2 - src6;
+ vec3 = src6 >> 1;
+ vec3 = src2 + vec3;
+
+ BUTTERFLY_4( vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3 );
+
+ vec0 = src7 >> 1;
+ vec0 = src5 - vec0 - src3 - src7;
+ vec1 = src3 >> 1;
+ vec1 = src1 - vec1 + src7 - src3;
+ vec2 = src5 >> 1;
+ vec2 = vec2 - src1 + src7 + src5;
+ vec3 = src1 >> 1;
+ vec3 = vec3 + src3 + src5 + src1;
+
+ tmp4 = vec3 >> 2;
+ tmp4 += vec0;
+ tmp5 = vec2 >> 2;
+ tmp5 += vec1;
+ tmp6 = vec1 >> 2;
+ tmp6 -= vec2;
+ tmp7 = vec0 >> 2;
+ tmp7 = vec3 - tmp7;
+
+ BUTTERFLY_8( tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7,
+ res0, res1, res2, res3, res4, res5, res6, res7 );
+
+ TRANSPOSE8x8_H_SH( res0, res1, res2, res3, res4, res5, res6, res7,
+ res0, res1, res2, res3, res4, res5, res6, res7 );
+
+ sign0 = __msa_clti_s_h( res0, 0 );
+ sign1 = __msa_clti_s_h( res1, 0 );
+ sign2 = __msa_clti_s_h( res2, 0 );
+ sign3 = __msa_clti_s_h( res3, 0 );
+ sign4 = __msa_clti_s_h( res4, 0 );
+ sign5 = __msa_clti_s_h( res5, 0 );
+ sign6 = __msa_clti_s_h( res6, 0 );
+ sign7 = __msa_clti_s_h( res7, 0 );
+
+ tmp0_r = ( v4i32 ) __msa_ilvr_h( sign0, res0 );
+ tmp0_l = ( v4i32 ) __msa_ilvl_h( sign0, res0 );
+ tmp1_r = ( v4i32 ) __msa_ilvr_h( sign1, res1 );
+ tmp1_l = ( v4i32 ) __msa_ilvl_h( sign1, res1 );
+ tmp2_r = ( v4i32 ) __msa_ilvr_h( sign2, res2 );
+ tmp2_l = ( v4i32 ) __msa_ilvl_h( sign2, res2 );
+ tmp3_r = ( v4i32 ) __msa_ilvr_h( sign3, res3 );
+ tmp3_l = ( v4i32 ) __msa_ilvl_h( sign3, res3 );
+ tmp4_r = ( v4i32 ) __msa_ilvr_h( sign4, res4 );
+ tmp4_l = ( v4i32 ) __msa_ilvl_h( sign4, res4 );
+ tmp5_r = ( v4i32 ) __msa_ilvr_h( sign5, res5 );
+ tmp5_l = ( v4i32 ) __msa_ilvl_h( sign5, res5 );
+ tmp6_r = ( v4i32 ) __msa_ilvr_h( sign6, res6 );
+ tmp6_l = ( v4i32 ) __msa_ilvl_h( sign6, res6 );
+ tmp7_r = ( v4i32 ) __msa_ilvr_h( sign7, res7 );
+ tmp7_l = ( v4i32 ) __msa_ilvl_h( sign7, res7 );
+
+ BUTTERFLY_4( tmp0_r, tmp0_l, tmp4_l, tmp4_r,
+ vec0_r, vec0_l, vec1_l, vec1_r );
+
+ vec2_r = tmp2_r >> 1;
+ vec2_l = tmp2_l >> 1;
+ vec2_r -= tmp6_r;
+ vec2_l -= tmp6_l;
+ vec3_r = tmp6_r >> 1;
+ vec3_l = tmp6_l >> 1;
+ vec3_r += tmp2_r;
+ vec3_l += tmp2_l;
+
+ BUTTERFLY_4( vec0_r, vec1_r, vec2_r, vec3_r,
+ tmp0_r, tmp2_r, tmp4_r, tmp6_r );
+
+ BUTTERFLY_4( vec0_l, vec1_l, vec2_l, vec3_l,
+ tmp0_l, tmp2_l, tmp4_l, tmp6_l );
+
+ vec0_r = tmp7_r >> 1;
+ vec0_l = tmp7_l >> 1;
+ vec0_r = tmp5_r - vec0_r - tmp3_r - tmp7_r;
+ vec0_l = tmp5_l - vec0_l - tmp3_l - tmp7_l;
+
+ vec1_r = tmp3_r >> 1;
+ vec1_l = tmp3_l >> 1;
+ vec1_r = tmp1_r - vec1_r + tmp7_r - tmp3_r;
+ vec1_l = tmp1_l - vec1_l + tmp7_l - tmp3_l;
+ vec2_r = tmp5_r >> 1;
+ vec2_l = tmp5_l >> 1;
+ vec2_r = vec2_r - tmp1_r + tmp7_r + tmp5_r;
+ vec2_l = vec2_l - tmp1_l + tmp7_l + tmp5_l;
+ vec3_r = tmp1_r >> 1;
+ vec3_l = tmp1_l >> 1;
+ vec3_r = vec3_r + tmp3_r + tmp5_r + tmp1_r;
+ vec3_l = vec3_l + tmp3_l + tmp5_l + tmp1_l;
+
+ tmp1_r = vec3_r >> 2;
+ tmp1_l = vec3_l >> 2;
+ tmp1_r += vec0_r;
+ tmp1_l += vec0_l;
+ tmp3_r = vec2_r >> 2;
+ tmp3_l = vec2_l >> 2;
+ tmp3_r += vec1_r;
+ tmp3_l += vec1_l;
+ tmp5_r = vec1_r >> 2;
+ tmp5_l = vec1_l >> 2;
+ tmp5_r -= vec2_r;
+ tmp5_l -= vec2_l;
+ tmp7_r = vec0_r >> 2;
+ tmp7_l = vec0_l >> 2;
+ tmp7_r = vec3_r - tmp7_r;
+ tmp7_l = vec3_l - tmp7_l;
+
+ res0_r = tmp0_r + tmp7_r;
+ res0_l = tmp0_l + tmp7_l;
+ res1_r = tmp2_r + tmp5_r;
+ res1_l = tmp2_l + tmp5_l;
+ res2_r = tmp4_r + tmp3_r;
+ res2_l = tmp4_l + tmp3_l;
+ res3_r = tmp6_r + tmp1_r;
+ res3_l = tmp6_l + tmp1_l;
+ res4_r = tmp6_r - tmp1_r;
+ res4_l = tmp6_l - tmp1_l;
+ res5_r = tmp4_r - tmp3_r;
+ res5_l = tmp4_l - tmp3_l;
+ res6_r = tmp2_r - tmp5_r;
+ res6_l = tmp2_l - tmp5_l;
+ res7_r = tmp0_r - tmp7_r;
+ res7_l = tmp0_l - tmp7_l;
+
+ res0_r >>= 6;
+ res0_l >>= 6;
+ res1_r >>= 6;
+ res1_l >>= 6;
+ res2_r >>= 6;
+ res2_l >>= 6;
+ res3_r >>= 6;
+ res3_l >>= 6;
+ res4_r >>= 6;
+ res4_l >>= 6;
+ res5_r >>= 6;
+ res5_l >>= 6;
+ res6_r >>= 6;
+ res6_l >>= 6;
+ res7_r >>= 6;
+ res7_l >>= 6;
+
+ res0 = __msa_pckev_h( ( v8i16 ) res0_l, ( v8i16 ) res0_r );
+ res1 = __msa_pckev_h( ( v8i16 ) res1_l, ( v8i16 ) res1_r );
+ res2 = __msa_pckev_h( ( v8i16 ) res2_l, ( v8i16 ) res2_r );
+ res3 = __msa_pckev_h( ( v8i16 ) res3_l, ( v8i16 ) res3_r );
+ res4 = __msa_pckev_h( ( v8i16 ) res4_l, ( v8i16 ) res4_r );
+ res5 = __msa_pckev_h( ( v8i16 ) res5_l, ( v8i16 ) res5_r );
+ res6 = __msa_pckev_h( ( v8i16 ) res6_l, ( v8i16 ) res6_r );
+ res7 = __msa_pckev_h( ( v8i16 ) res7_l, ( v8i16 ) res7_r );
+
+ LOAD_8VECS_SB( p_dst, i_dst_stride,
+ dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
+
+ tmp0 = ( v8i16 ) __msa_ilvr_b( zeros, dst0 );
+ tmp1 = ( v8i16 ) __msa_ilvr_b( zeros, dst1 );
+ tmp2 = ( v8i16 ) __msa_ilvr_b( zeros, dst2 );
+ tmp3 = ( v8i16 ) __msa_ilvr_b( zeros, dst3 );
+ tmp4 = ( v8i16 ) __msa_ilvr_b( zeros, dst4 );
+ tmp5 = ( v8i16 ) __msa_ilvr_b( zeros, dst5 );
+ tmp6 = ( v8i16 ) __msa_ilvr_b( zeros, dst6 );
+ tmp7 = ( v8i16 ) __msa_ilvr_b( zeros, dst7 );
+
+ res0 += tmp0;
+ res1 += tmp1;
+ res2 += tmp2;
+ res3 += tmp3;
+ res4 += tmp4;
+ res5 += tmp5;
+ res6 += tmp6;
+ res7 += tmp7;
+
+ res0 = CLIP_UNSIGNED_CHAR_H( res0 );
+ res1 = CLIP_UNSIGNED_CHAR_H( res1 );
+ res2 = CLIP_UNSIGNED_CHAR_H( res2 );
+ res3 = CLIP_UNSIGNED_CHAR_H( res3 );
+ res4 = CLIP_UNSIGNED_CHAR_H( res4 );
+ res5 = CLIP_UNSIGNED_CHAR_H( res5 );
+ res6 = CLIP_UNSIGNED_CHAR_H( res6 );
+ res7 = CLIP_UNSIGNED_CHAR_H( res7 );
+
+ out0 = ( v2i64 ) __msa_pckev_b( ( v16i8 ) res1, ( v16i8 ) res0 );
+ out1 = ( v2i64 ) __msa_pckev_b( ( v16i8 ) res3, ( v16i8 ) res2 );
+ out2 = ( v2i64 ) __msa_pckev_b( ( v16i8 ) res5, ( v16i8 ) res4 );
+ out3 = ( v2i64 ) __msa_pckev_b( ( v16i8 ) res7, ( v16i8 ) res6 );
+
+ u_out0 = __msa_copy_s_d( out0, 0 );
+ u_out1 = __msa_copy_s_d( out0, 1 );
+ u_out2 = __msa_copy_s_d( out1, 0 );
+ u_out3 = __msa_copy_s_d( out1, 1 );
+ STORE_DWORD( ( p_dst + 0 * i_dst_stride ), u_out0 );
+ STORE_DWORD( ( p_dst + 1 * i_dst_stride ), u_out1 );
+ STORE_DWORD( ( p_dst + 2 * i_dst_stride ), u_out2 );
+ STORE_DWORD( ( p_dst + 3 * i_dst_stride ), u_out3 );
+ u_out0 = __msa_copy_s_d( out2, 0 );
+ u_out1 = __msa_copy_s_d( out2, 1 );
+ u_out2 = __msa_copy_s_d( out3, 0 );
+ u_out3 = __msa_copy_s_d( out3, 1 );
+ STORE_DWORD( ( p_dst + 4 * i_dst_stride ), u_out0 );
+ STORE_DWORD( ( p_dst + 5 * i_dst_stride ), u_out1 );
+ STORE_DWORD( ( p_dst + 6 * i_dst_stride ), u_out2 );
+ STORE_DWORD( ( p_dst + 7 * i_dst_stride ), u_out3 );
+}
+
+static void avc_idct4x4dc_msa( int16_t *p_src, int32_t i_src_stride,
+ int16_t *p_dst, int32_t i_dst_stride )
+{
+ uint64_t u_out0, u_out1, u_out2, u_out3;
+ v8i16 src0, src1, src2, src3;
+ v8i16 sign0, sign1, sign2, sign3;
+ v4i32 src0_r, src1_r, src2_r, src3_r;
+ v4i32 hor_res0, hor_res1, hor_res2, hor_res3;
+ v8i16 ver_res0, ver_res1, ver_res2, ver_res3;
+ v4i32 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+ v2i64 dst0, dst1;
+
+ LOAD_4VECS_SH( p_src, i_src_stride, src0, src1, src2, src3 );
+
+ sign0 = __msa_clti_s_h( src0, 0 );
+ sign1 = __msa_clti_s_h( src1, 0 );
+ sign2 = __msa_clti_s_h( src2, 0 );
+ sign3 = __msa_clti_s_h( src3, 0 );
+
+ src0_r = ( v4i32 ) __msa_ilvr_h( sign0, src0 );
+ src1_r = ( v4i32 ) __msa_ilvr_h( sign1, src1 );
+ src2_r = ( v4i32 ) __msa_ilvr_h( sign2, src2 );
+ src3_r = ( v4i32 ) __msa_ilvr_h( sign3, src3 );
+
+ BUTTERFLY_4( src0_r, src2_r, src3_r, src1_r, vec0, vec3, vec2, vec1 );
+
+ BUTTERFLY_4( vec0, vec1, vec2, vec3,
+ hor_res0, hor_res3, hor_res2, hor_res1 );
+
+ TRANSPOSE4x4_W( hor_res0, hor_res1, hor_res2, hor_res3,
+ hor_res0, hor_res1, hor_res2, hor_res3 );
+
+ BUTTERFLY_4( hor_res0, hor_res2, hor_res3, hor_res1,
+ vec0, vec3, vec2, vec1 );
+
+ BUTTERFLY_4( vec0, vec1, vec2, vec3, vec4, vec7, vec6, vec5 );
+
+ ver_res0 = __msa_pckev_h( ( v8i16 ) vec4, ( v8i16 ) vec4 );
+ ver_res1 = __msa_pckev_h( ( v8i16 ) vec5, ( v8i16 ) vec5 );
+ ver_res2 = __msa_pckev_h( ( v8i16 ) vec6, ( v8i16 ) vec6 );
+ ver_res3 = __msa_pckev_h( ( v8i16 ) vec7, ( v8i16 ) vec7 );
+
+ dst0 = __msa_pckod_d( ( v2i64 ) ver_res1, ( v2i64 ) ver_res0 );
+ dst1 = __msa_pckod_d( ( v2i64 ) ver_res3, ( v2i64 ) ver_res2 );
+
+ u_out0 = __msa_copy_u_d( dst0, 0 );
+ u_out1 = __msa_copy_u_d( dst0, 1 );
+ u_out2 = __msa_copy_u_d( dst1, 0 );
+ u_out3 = __msa_copy_u_d( dst1, 1 );
+
+ STORE_DWORD( ( p_dst ), u_out0 );
+ STORE_DWORD( ( p_dst + i_dst_stride ), u_out1 );
+ STORE_DWORD( ( p_dst + 2 * i_dst_stride ), u_out2 );
+ STORE_DWORD( ( p_dst + 3 * i_dst_stride ), u_out3 );
+}
+
+/* sum of differences */
+static int32_t subtract_sum4x4_msa( uint8_t * __restrict p_src,
+ int32_t i_src_stride,
+ uint8_t * __restrict p_pred,
+ int32_t i_pred_stride )
+{
+ int16_t i_sum;
+ uint32_t src0, src1, src2, src3;
+ uint32_t pred0, pred1, pred2, pred3;
+ v16i8 src = { 0 };
+ v16i8 pred = { 0 };
+ v16u8 src_l0, src_l1;
+ v8i16 diff0, diff1;
+
+ LOAD_4WORDS_WITH_STRIDE( p_src, i_src_stride, src0, src1, src2, src3 );
+ LOAD_4WORDS_WITH_STRIDE( p_pred, i_pred_stride,
+ pred0, pred1, pred2, pred3 );
+
+ VEC_INSERT_4W_SB( src, src0, src1, src2, src3 );
+ VEC_INSERT_4W_SB( pred, pred0, pred1, pred2, pred3 );
+
+ src_l0 = ( v16u8 ) __msa_ilvr_b( src, pred );
+ src_l1 = ( v16u8 ) __msa_ilvl_b( src, pred );
+
+ diff0 = __msa_hsub_u_h( src_l0, src_l0 );
+ diff1 = __msa_hsub_u_h( src_l1, src_l1 );
+
+ i_sum = CALC_ADDITIVE_SUM_H( diff0 + diff1 );
+
+ return i_sum;
+}
+
+void x264_dct4x4dc_msa( int16_t pi_d[16] )
+{
+ avc_dct4x4dc_msa( pi_d, pi_d, 4 );
+}
+
+void x264_idct4x4dc_msa( int16_t pi_d[16] )
+{
+ avc_idct4x4dc_msa( pi_d, 4, pi_d, 4 );
+}
+
+void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] )
+{
+ avc_idct4x4_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
+}
+
+void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] )
+{
+ avc_idct4x4_addblk_msa( &p_dst[0], &pi_dct[0][0],
+ FDEC_STRIDE );
+ avc_idct4x4_addblk_msa( &p_dst[4], &pi_dct[1][0],
+ FDEC_STRIDE );
+ avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE], &pi_dct[2][0],
+ FDEC_STRIDE );
+ avc_idct4x4_addblk_msa( &p_dst[4 * FDEC_STRIDE + 4], &pi_dct[3][0],
+ FDEC_STRIDE );
+}
+
+void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] )
+{
+ x264_add8x8_idct_msa( &p_dst[0], &pi_dct[0] );
+ x264_add8x8_idct_msa( &p_dst[8], &pi_dct[4] );
+ x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE], &pi_dct[8] );
+ x264_add8x8_idct_msa( &p_dst[8 * FDEC_STRIDE + 8], &pi_dct[12] );
+}
+
+void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] )
+{
+ avc_idct8_addblk_msa( p_dst, pi_dct, FDEC_STRIDE );
+}
+
+void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] )
+{
+ avc_idct8_addblk_msa( &p_dst[0], &pi_dct[0][0], FDEC_STRIDE );
+ avc_idct8_addblk_msa( &p_dst[8], &pi_dct[1][0], FDEC_STRIDE );
+ avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE], &pi_dct[2][0], FDEC_STRIDE );
+ avc_idct8_addblk_msa( &p_dst[8 * FDEC_STRIDE + 8],
+ &pi_dct[3][0], FDEC_STRIDE );
+}
+
+void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] )
+{
+ avc_idct4x4_addblk_dc_msa( &p_dst[0], &pi_dct[0],
+ FDEC_STRIDE );
+ avc_idct4x4_addblk_dc_msa( &p_dst[4], &pi_dct[1],
+ FDEC_STRIDE );
+ avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE], &pi_dct[2],
+ FDEC_STRIDE );
+ avc_idct4x4_addblk_dc_msa( &p_dst[4 * FDEC_STRIDE + 4], &pi_dct[3],
+ FDEC_STRIDE );
+}
+
+void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] )
+{
+ int32_t i_loop_cnt;
+ for( i_loop_cnt = 0; i_loop_cnt < 4; i_loop_cnt++, pi_dct += 4, p_dst += 4 * FDEC_STRIDE )
+ {
+ avc_idct4x4_addblk_dc_msa( &p_dst[ 0], &pi_dct[0], FDEC_STRIDE );
+ avc_idct4x4_addblk_dc_msa( &p_dst[ 4], &pi_dct[1], FDEC_STRIDE );
+ avc_idct4x4_addblk_dc_msa( &p_dst[ 8], &pi_dct[2], FDEC_STRIDE );
+ avc_idct4x4_addblk_dc_msa( &p_dst[12], &pi_dct[3], FDEC_STRIDE );
+ }
+}
+
+void x264_sub4x4_dct_msa( int16_t pi_dst[16], uint8_t * __restrict p_src,
+ uint8_t * __restrict p_ref )
+{
+ avc_sub4x4_dct_msa( p_src, FENC_STRIDE, p_ref, FDEC_STRIDE, pi_dst );
+}
+
+void x264_sub8x8_dct_msa( int16_t pi_dst[4][16], uint8_t * __restrict p_src,
+ uint8_t * __restrict p_ref )
+{
+ avc_sub4x4_dct_msa( &p_src[0], FENC_STRIDE,
+ &p_ref[0], FDEC_STRIDE, pi_dst[0] );
+ avc_sub4x4_dct_msa( &p_src[4], FENC_STRIDE,
+ &p_ref[4], FDEC_STRIDE, pi_dst[1] );
+ avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE], FENC_STRIDE,
+ &p_ref[4 * FDEC_STRIDE], FDEC_STRIDE, pi_dst[2] );
+ avc_sub4x4_dct_msa( &p_src[4 * FENC_STRIDE + 4], FENC_STRIDE,
+ &p_ref[4 * FDEC_STRIDE + 4], FDEC_STRIDE, pi_dst[3] );
+}
+
+void x264_sub16x16_dct_msa( int16_t pi_dst[16][16], uint8_t * __restrict p_src,
+ uint8_t * __restrict p_ref )
+{
+ x264_sub8x8_dct_msa( &pi_dst[ 0], &p_src[0], &p_ref[0] );
+ x264_sub8x8_dct_msa( &pi_dst[ 4], &p_src[8], &p_ref[8] );
+ x264_sub8x8_dct_msa( &pi_dst[ 8], &p_src[8 * FENC_STRIDE],
+ &p_ref[8 * FDEC_STRIDE] );
+ x264_sub8x8_dct_msa( &pi_dst[12], &p_src[8 * FENC_STRIDE + 8],
+ &p_ref[8 * FDEC_STRIDE + 8] );
+}
+
+void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4],
+ uint8_t *p_pix1, uint8_t *p_pix2 )
+{
+ pi_dct[0] = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
+ &p_pix2[0], FDEC_STRIDE );
+ pi_dct[1] = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
+ &p_pix2[4], FDEC_STRIDE );
+ pi_dct[2] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE], FENC_STRIDE,
+ &p_pix2[4 * FDEC_STRIDE], FDEC_STRIDE );
+ pi_dct[3] = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4], FENC_STRIDE,
+ &p_pix2[4 * FDEC_STRIDE + 4],
+ FDEC_STRIDE );
+
+ /* 2x2 DC transform */
+ int32_t i_d0 = pi_dct[0] + pi_dct[1];
+ int32_t i_d1 = pi_dct[2] + pi_dct[3];
+ int32_t i_d2 = pi_dct[0] - pi_dct[1];
+ int32_t i_d3 = pi_dct[2] - pi_dct[3];
+ pi_dct[0] = i_d0 + i_d1;
+ pi_dct[2] = i_d2 + i_d3;
+ pi_dct[3] = i_d2 - i_d3;
+ pi_dct[1] = i_d0 - i_d1;
+}
+
+void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8],
+ uint8_t *p_pix1, uint8_t *p_pix2 )
+{
+ int32_t i_a0 = subtract_sum4x4_msa( &p_pix1[0], FENC_STRIDE,
+ &p_pix2[0], FDEC_STRIDE );
+ int32_t i_a1 = subtract_sum4x4_msa( &p_pix1[4], FENC_STRIDE,
+ &p_pix2[4], FDEC_STRIDE );
+ int32_t i_a2 = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE], FENC_STRIDE,
+ &p_pix2[4 * FDEC_STRIDE], FDEC_STRIDE );
+ int32_t i_a3 = subtract_sum4x4_msa( &p_pix1[4 * FENC_STRIDE + 4],
+ FENC_STRIDE,
+ &p_pix2[4 * FDEC_STRIDE + 4],
+ FDEC_STRIDE );
+ int32_t i_a4 = subtract_sum4x4_msa( &p_pix1[8 * FENC_STRIDE], FENC_STRIDE,
+ &p_pix2[8 * FDEC_STRIDE], FDEC_STRIDE );
+ int32_t i_a5 = subtract_sum4x4_msa( &p_pix1[8 * FENC_STRIDE + 4],
+ FENC_STRIDE,
+ &p_pix2[8 * FDEC_STRIDE + 4],
+ FDEC_STRIDE );
+ int32_t i_a6 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE], FENC_STRIDE,
+ &p_pix2[12 * FDEC_STRIDE],
+ FDEC_STRIDE );
+ int32_t i_a7 = subtract_sum4x4_msa( &p_pix1[12 * FENC_STRIDE + 4],
+ FENC_STRIDE,
+ &p_pix2[12 * FDEC_STRIDE + 4],
+ FDEC_STRIDE );
+
+ /* 2x4 DC transform */
+ int32_t i_b0 = i_a0 + i_a1;
+ int32_t i_b1 = i_a2 + i_a3;
+ int32_t i_b2 = i_a4 + i_a5;
+ int32_t i_b3 = i_a6 + i_a7;
+ int32_t i_b4 = i_a0 - i_a1;
+ int32_t i_b5 = i_a2 - i_a3;
+ int32_t i_b6 = i_a4 - i_a5;
+ int32_t i_b7 = i_a6 - i_a7;
+ i_a0 = i_b0 + i_b1;
+ i_a1 = i_b2 + i_b3;
+ i_a2 = i_b4 + i_b5;
+ i_a3 = i_b6 + i_b7;
+ i_a4 = i_b0 - i_b1;
+ i_a5 = i_b2 - i_b3;
+ i_a6 = i_b4 - i_b5;
+ i_a7 = i_b6 - i_b7;
+ pi_dct[0] = i_a0 + i_a1;
+ pi_dct[1] = i_a2 + i_a3;
+ pi_dct[2] = i_a0 - i_a1;
+ pi_dct[3] = i_a2 - i_a3;
+ pi_dct[4] = i_a4 - i_a5;
+ pi_dct[5] = i_a6 - i_a7;
+ pi_dct[6] = i_a4 + i_a5;
+ pi_dct[7] = i_a6 + i_a7;
+}
+
+void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] )
+{
+ avc_zigzag_scan_4x4_frame_msa( pi_dct, pi_level );
+}
+#endif
diff --git a/common/mips/dct.h b/common/mips/dct.h
new file mode 100644
index 0000000..7a33412
--- /dev/null
+++ b/common/mips/dct.h
@@ -0,0 +1,47 @@
+/*****************************************************************************
+ * dct.h: mips transform and zigzag
+ *****************************************************************************
+ * Copyright (C) 2009-2015 x264 project
+ *
+ * Authors: Parag Salasakar <parag.salasakar at imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_DCT_H
+#define X264_MIPS_DCT_H
+
+void x264_dct4x4dc_msa( int16_t d[16] );
+void x264_sub4x4_dct_msa( int16_t pi_dct[16], uint8_t *p_pix1, uint8_t *p_pix2 );
+void x264_sub8x8_dct_msa( int16_t pi_dct[4][16], uint8_t *p_pix1, uint8_t *p_pix2 );
+void x264_sub16x16_dct_msa( int16_t pi_dct[16][16], uint8_t *p_pix1, uint8_t *p_pix2 );
+void x264_sub8x8_dct_dc_msa( int16_t pi_dct[4], uint8_t *p_pix1, uint8_t *p_pix2 );
+void x264_sub8x16_dct_dc_msa( int16_t pi_dct[8], uint8_t *p_pix1, uint8_t *p_pix2 );
+
+void x264_idct4x4dc_msa( int16_t d[16] );
+void x264_add4x4_idct_msa( uint8_t *p_dst, int16_t pi_dct[16] );
+void x264_add8x8_idct_msa( uint8_t *p_dst, int16_t pi_dct[4][16] );
+void x264_add16x16_idct_msa( uint8_t *p_dst, int16_t pi_dct[16][16] );
+void x264_add8x8_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[4] );
+void x264_add16x16_idct_dc_msa( uint8_t *p_dst, int16_t pi_dct[16] );
+void x264_add8x8_idct8_msa( uint8_t *p_dst, int16_t pi_dct[64] );
+void x264_add16x16_idct8_msa( uint8_t *p_dst, int16_t pi_dct[4][64] );
+
+void x264_zigzag_scan_4x4_frame_msa( int16_t pi_level[16], int16_t pi_dct[16] );
+
+#endif
diff --git a/x264.h b/x264.h
index 7cc5fcc..8194246 100644
--- a/x264.h
+++ b/x264.h
@@ -158,6 +158,9 @@ typedef struct
#define X264_CPU_FAST_NEON_MRC 0x0000004 /* Transfer from NEON to ARM register is fast (Cortex-A9) */
#define X264_CPU_ARMV8 0x0000008
+/* MIPS */
+#define X264_CPU_MSA 0x0000001 /* MIPS MSA */
+
/* Analyse flags */
#define X264_ANALYSE_I4x4 0x0001 /* Analyse i4x4 */
#define X264_ANALYSE_I8x8 0x0002 /* Analyse i8x8 (requires 8x8 transform) */
--
2.3.2