[x264-devel] [PATCH 3/9] MIPS MSA MC 1/2 module optimization
Rishikesh More
rishikesh.more at imgtec.com
Thu Jun 18 14:18:40 CEST 2015
This patch adds MSA (MIPS SIMD Architecture) optimizations for the pixel_avg and mc_weight motion compensation functions. A plain-C sketch of the per-pixel arithmetic being vectorized is included after the diffstat below.
Signed-off-by: Rishikesh More <rishikesh.more at imgtec.com>
---
Makefile | 7 +
common/mc.c | 7 +
common/mips/mc-c.c | 1993 ++++++++++++++++++++++++++++++++++++++++++++++++++++
common/mips/mc.h | 31 +
4 files changed, 2038 insertions(+)
create mode 100644 common/mips/mc-c.c
create mode 100644 common/mips/mc.h
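For reference (not part of the patch), here is a plain-C sketch of the per-pixel arithmetic that the MSA routines below vectorize; the helper names are illustrative only:

    #include <stdint.h>

    static inline uint8_t clip_u8( int v )
    {
        return v < 0 ? 0 : ( v > 255 ? 255 : v );
    }

    /* mc_weight: explicit weighted prediction of one source pixel.
     * The MSA code folds the offset and the rounding term together
     * before the shift, which is equivalent to this form. */
    static uint8_t weight_pixel( uint8_t src, int scale, int denom, int offset )
    {
        int round = denom ? 1 << ( denom - 1 ) : 0;
        return clip_u8( ( src * scale + ( offset << denom ) + round ) >> denom );
    }

    /* pixel_avg: bi-prediction of two references with weights w and 64-w;
     * i_weight == 32 degenerates to the rounded average ( p1 + p2 + 1 ) >> 1,
     * which is what the avg_src_width*_msa fast paths implement. */
    static uint8_t avg_pixel( uint8_t p1, uint8_t p2, int w )
    {
        return clip_u8( ( p1 * w + p2 * ( 64 - w ) + 32 ) >> 6 );
    }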
diff --git a/Makefile b/Makefile
index 9804e5f..249ba2b 100644
--- a/Makefile
+++ b/Makefile
@@ -143,6 +143,13 @@ OBJASM = $(ASMSRC:%.S=%.o)
endif
endif
+# MSA optims
+ifeq ($(SYS_ARCH),MIPS)
+ifneq ($(findstring HAVE_MSA 1, $(CONFIG)),)
+SRCS += common/mips/mc-c.c
+endif
+endif
+
ifneq ($(HAVE_GETOPT_LONG),1)
SRCCLI += extras/getopt.c
endif
diff --git a/common/mc.c b/common/mc.c
index 4bb0bf3..18f1a8c 100644
--- a/common/mc.c
+++ b/common/mc.c
@@ -38,6 +38,9 @@
#if ARCH_AARCH64
#include "aarch64/mc.h"
#endif
+#if ARCH_MIPS
+#include "mips/mc.h"
+#endif
static inline void pixel_avg( pixel *dst, intptr_t i_dst_stride,
@@ -647,6 +650,10 @@ void x264_mc_init( int cpu, x264_mc_functions_t *pf, int cpu_independent )
#if ARCH_AARCH64
x264_mc_init_aarch64( cpu, pf );
#endif
+#if HAVE_MSA
+ if( cpu&X264_CPU_MSA )
+ x264_mc_init_mips( cpu, pf );
+#endif
if( cpu_independent )
{
diff --git a/common/mips/mc-c.c b/common/mips/mc-c.c
new file mode 100644
index 0000000..6348d12
--- /dev/null
+++ b/common/mips/mc-c.c
@@ -0,0 +1,1993 @@
+/*****************************************************************************
+ * mc-c.c: msa motion compensation
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Neha Rana <neha.rana at imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#include "common/common.h"
+#include "macros.h"
+#include "mc.h"
+
+#if !HIGH_BIT_DEPTH
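+/* Half-pel plane selection tables indexed by the quarter-pel index
+ * ( ( mvy & 3 ) << 2 ) + ( mvx & 3 ), mirroring hpel_ref0/hpel_ref1
+ * used by the C mc_luma/get_ref implementations. */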
+static const uint8_t pu_hpel_ref0[16] = {
+ 0, 1, 1, 1, 0, 1, 1, 1, 2, 3, 3, 3, 0, 1, 1, 1
+};
+
+static const uint8_t pu_hpel_ref1[16] = {
+ 0, 0, 1, 0, 2, 2, 3, 2, 2, 2, 3, 2, 2, 2, 3, 2
+};
+
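+/* VSHF_B shuffle-control masks used by the luma and chroma interpolation
+ * kernels in this file. */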
+static const uint8_t pu_luma_mask_arr[16 * 8] =
+{
+ /* 8 width cases */
+ 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
+ 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
+ 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
+ /* 4 width cases */
+ 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
+ 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
+ 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
+};
+
+static const uint8_t pu_chroma_mask_arr[16 * 5] =
+{
+ 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+ 0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
+ 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
+ 0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
+};
+
+void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ int32_t i_height );
+void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ int32_t i_height );
+void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
+ intptr_t i_src_stride, int32_t i_height );
+void x264_memzero_aligned_msa( void *p_dst, size_t n );
+
+void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
+ uint8_t *p_pix2, intptr_t i_pix2_stride,
+ uint8_t *p_pix3, intptr_t i_pix3_stride,
+ int32_t i_weight );
+
+void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height );
+void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height );
+void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height );
+void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height );
+
+weight_fn_t x264_mc_weight_wtab_msa[6] =
+{
+ x264_mc_weight_w4_msa,
+ x264_mc_weight_w4_msa,
+ x264_mc_weight_w8_msa,
+ x264_mc_weight_w16_msa,
+ x264_mc_weight_w16_msa,
+ x264_mc_weight_w20_msa,
+};
+
+void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src[4], intptr_t i_src_stride,
+ int32_t m_vx, int32_t m_vy,
+ int32_t i_width, int32_t i_height,
+ const x264_weight_t *pWeight );
+uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
+ uint8_t *p_src[4], intptr_t i_src_stride,
+ int32_t m_vx, int32_t m_vy,
+ int32_t i_width, int32_t i_height,
+ const x264_weight_t *pWeight );
+void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
+ intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ int32_t m_vx, int32_t m_vy,
+ int32_t i_width, int32_t i_height );
+
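+/* Chroma MC helpers operating on x264's interleaved U/V source plane: the
+ * horizontal 2-tap filter is done with a shuffle plus unsigned dot-product,
+ * the vertical 2-tap filter with per-row multiplies that are summed, rounded
+ * and shifted by 6 and saturated, before the U and V results are stored to
+ * their separate destination planes. */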
+static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1 )
+{
+ uint16_t u_out0, u_out1, u_out2, u_out3;
+ v16u8 src0, src1, src2, src3, src4;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
+ v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
+ v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
+ v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
+ v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
+ v8i16 res0, res1;
+
+ mask = LD_SB( &pu_chroma_mask_arr[16] );
+
+ LD_UB3( p_src, i_src_stride, src0, src1, src2 );
+ VSHF_B2_UB( src0, src1, src1, src2,
+ ( mask + 1 ), ( mask + 1 ), src3, src4 );
+ VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
+ DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3 );
+ MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+ ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
+ SRARI_H2_UH( res_vt0, res_vt2, 6 );
+ SAT_UH2_UH( res_vt0, res_vt2, 7 );
+ PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
+
+ u_out0 = __msa_copy_u_h( res0, 0 );
+ u_out1 = __msa_copy_u_h( res0, 2 );
+ u_out2 = __msa_copy_u_h( res1, 0 );
+ u_out3 = __msa_copy_u_h( res1, 2 );
+
+ SH( u_out0, p_dst_u );
+ p_dst_u += i_dst_stride;
+ SH( u_out1, p_dst_u );
+
+ SH( u_out2, p_dst_v );
+ p_dst_v += i_dst_stride;
+ SH( u_out3, p_dst_v );
+}
+
+static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1 )
+{
+ uint16_t u_out0, u_out1, u_out2, u_out3;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v8i16 res0, res1;
+ v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
+ v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
+ v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
+ v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
+ v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
+
+ mask = LD_SB( &pu_chroma_mask_arr[16] );
+
+ LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
+
+ VSHF_B2_UB( src0, src1, src1, src2,
+ ( mask + 1 ), ( mask + 1 ), src5, src6 );
+ VSHF_B2_UB( src2, src3, src3, src4,
+ ( mask + 1 ), ( mask + 1 ), src7, src8 );
+ VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
+ VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
+ DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0,
+ res_hz1, res_hz2, res_hz3 );
+ MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+ ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
+ SRARI_H2_UH( res_vt0, res_vt1, 6 );
+ SAT_UH2_UH( res_vt0, res_vt1, 7 );
+ PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
+
+ u_out0 = __msa_copy_u_h( res0, 0 );
+ u_out1 = __msa_copy_u_h( res0, 2 );
+ u_out2 = __msa_copy_u_h( res1, 0 );
+ u_out3 = __msa_copy_u_h( res1, 2 );
+
+ SH( u_out0, p_dst_u );
+ p_dst_u += i_dst_stride;
+ SH( u_out1, p_dst_u );
+ p_dst_u += i_dst_stride;
+ SH( u_out2, p_dst_u );
+ p_dst_u += i_dst_stride;
+ SH( u_out3, p_dst_u );
+
+ DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3 );
+ MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+ ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
+ SRARI_H2_UH( res_vt0, res_vt1, 6 );
+ SAT_UH2_UH( res_vt0, res_vt1, 7 );
+ PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
+
+ u_out0 = __msa_copy_u_h( res0, 0 );
+ u_out1 = __msa_copy_u_h( res0, 2 );
+ u_out2 = __msa_copy_u_h( res1, 0 );
+ u_out3 = __msa_copy_u_h( res1, 2 );
+
+ SH( u_out0, p_dst_v );
+ p_dst_v += i_dst_stride;
+ SH( u_out1, p_dst_v );
+ p_dst_v += i_dst_stride;
+ SH( u_out2, p_dst_v );
+ p_dst_v += i_dst_stride;
+ SH( u_out3, p_dst_v );
+}
+
+static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1,
+ int32_t i_height )
+{
+ if( 2 == i_height )
+ {
+ avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ u_coef_hor0, u_coef_hor1,
+ u_coef_ver0, u_coef_ver1 );
+ }
+ else if( 4 == i_height )
+ {
+ avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ u_coef_hor0, u_coef_hor1,
+ u_coef_ver0, u_coef_ver1 );
+ }
+}
+
+static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1 )
+{
+ uint32_t u_out0, u_out1, u_out2, u_out3;
+ v16u8 src0, src1, src2, src3, src4;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
+ v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
+ v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
+ v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
+ v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
+ v4i32 res0, res1;
+
+ mask = LD_SB( &pu_chroma_mask_arr[16] );
+
+ LD_UB3( p_src, i_src_stride, src0, src1, src2 );
+ VSHF_B2_UB( src0, src1, src1, src2,
+ ( mask + 1 ), ( mask + 1 ), src3, src4 );
+ VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
+ DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3 );
+ MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+ ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
+ SRARI_H2_UH( res_vt0, res_vt2, 6 );
+ SAT_UH2_UH( res_vt0, res_vt2, 7 );
+ PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
+
+ u_out0 = __msa_copy_u_w( res0, 0 );
+ u_out1 = __msa_copy_u_w( res0, 1 );
+ u_out2 = __msa_copy_u_w( res1, 0 );
+ u_out3 = __msa_copy_u_w( res1, 1 );
+ SW( u_out0, p_dst_u );
+ p_dst_u += i_dst_stride;
+ SW( u_out1, p_dst_u );
+ SW( u_out2, p_dst_v );
+ p_dst_v += i_dst_stride;
+ SW( u_out3, p_dst_v );
+}
+
+static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1,
+ int32_t i_height )
+{
+ uint32_t u_row;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask;
+ v4i32 res0, res1;
+ v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
+ v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
+ v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
+ v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
+ v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
+
+ mask = LD_SB( &pu_chroma_mask_arr[16] );
+
+ src0 = LD_UB( p_src );
+ p_src += i_src_stride;
+
+ for( u_row = ( i_height >> 2 ); u_row--; )
+ {
+ LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
+ p_src += ( 4 * i_src_stride );
+
+ VSHF_B2_UB( src0, src1, src1, src2,
+ ( mask + 1 ), ( mask + 1 ), src5, src6 );
+ VSHF_B2_UB( src2, src3, src3, src4,
+ ( mask + 1 ), ( mask + 1 ), src7, src8 );
+ VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
+ VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
+ DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3 );
+ MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+ ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
+ SRARI_H2_UH( res_vt0, res_vt1, 6 );
+ SAT_UH2_UH( res_vt0, res_vt1, 7 );
+ PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
+
+ ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
+ p_dst_u += ( 4 * i_dst_stride );
+
+ DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
+ res_hz3 );
+ MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
+ coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+ ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
+ SRARI_H2_UH( res_vt0, res_vt1, 6 );
+ SAT_UH2_UH( res_vt0, res_vt1, 7 );
+ PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
+
+ ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
+ p_dst_v += ( 4 * i_dst_stride );
+ src0 = src4;
+ }
+}
+
+static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1,
+ int32_t i_height )
+{
+ if( 2 == i_height )
+ {
+ avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ u_coef_hor0, u_coef_hor1,
+ u_coef_ver0, u_coef_ver1 );
+ }
+ else
+ {
+ avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ u_coef_hor0, u_coef_hor1,
+ u_coef_ver0, u_coef_ver1,
+ i_height );
+ }
+}
+
+static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst_u,
+ uint8_t *p_dst_v,
+ int32_t i_dst_stride,
+ uint32_t u_coef_hor0,
+ uint32_t u_coef_hor1,
+ uint32_t u_coef_ver0,
+ uint32_t u_coef_ver1,
+ int32_t i_height )
+{
+ uint32_t u_row;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+ v16u8 src10, src11, src12, src13, src14;
+ v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
+ v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
+ v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
+ v16i8 coeff_hz_vec0, coeff_hz_vec1;
+ v16i8 tmp0, tmp1;
+ v16u8 coeff_hz_vec;
+ v8u16 coeff_vt_vec0, coeff_vt_vec1;
+
+ coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
+ coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
+ coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
+ coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
+ coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
+
+ LD_UB2( p_src, 16, src0, src13 );
+ p_src += i_src_stride;
+
+ VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
+ DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );
+
+ for( u_row = ( i_height >> 2 ); u_row--; )
+ {
+ LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
+ LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
+ p_src += ( 4 * i_src_stride );
+
+ VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
+ VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
+ DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+ res_hz4 );
+ MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+ coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+
+ res_vt0 += ( res_hz0 * coeff_vt_vec1 );
+ res_vt1 += ( res_hz1 * coeff_vt_vec1 );
+ res_vt2 += ( res_hz2 * coeff_vt_vec1 );
+ res_vt3 += ( res_hz3 * coeff_vt_vec1 );
+
+ SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
+ SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
+ PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
+ ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
+ p_dst_u += ( 4 * i_dst_stride );
+ res_hz0 = res_hz4;
+
+ VSHF_B2_UB( src1, src5, src2, src6,
+ ( mask + 1 ), ( mask + 1 ), src5, src6 );
+ VSHF_B2_UB( src3, src7, src4, src8,
+ ( mask + 1 ), ( mask + 1 ), src7, src8 );
+ DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
+ coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
+ res_hz4 );
+ MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
+ coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
+ res_vt3 );
+
+ res_vt0 += ( res_hz5 * coeff_vt_vec1 );
+ res_vt1 += ( res_hz1 * coeff_vt_vec1 );
+ res_vt2 += ( res_hz2 * coeff_vt_vec1 );
+ res_vt3 += ( res_hz3 * coeff_vt_vec1 );
+
+ SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
+ SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
+ PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
+ ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
+ p_dst_v += ( 4 * i_dst_stride );
+ res_hz5 = res_hz4;
+ }
+}
+
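+/* Explicit weighted prediction (mc_weight) helpers: per pixel,
+ * dst = clip( ( src * i_weight + ( i_offset << denom ) + round ) >> denom ),
+ * with the offset pre-scaled by the denominator so the add, clamp and shift
+ * can be fused. */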
+static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_log2_denom, int32_t i_weight,
+ int32_t i_offset_in )
+{
+ uint32_t u_load0, u_load1, u_out0, u_out1;
+ v16u8 zero = { 0 };
+ v16u8 src0, src1;
+ v4i32 dst0, dst1;
+ v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
+ v8i16 vec0, vec1;
+
+ i_offset_in <<= ( i_log2_denom );
+
+ if( i_log2_denom )
+ {
+ i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
+ }
+
+ wgt = ( v8u16 ) __msa_fill_h( i_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
+
+ u_load0 = LW( p_src );
+ p_src += i_src_stride;
+ u_load1 = LW( p_src );
+
+ src0 = ( v16u8 ) __msa_fill_w( u_load0 );
+ src1 = ( v16u8 ) __msa_fill_w( u_load1 );
+
+ ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
+ MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
+ ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
+ MAXI_SH2_SH( vec0, vec1, 0 );
+
+ tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
+ tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );
+
+ SAT_UH2_UH( tp0, tp1, 7 );
+ PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );
+
+ u_out0 = __msa_copy_u_w( dst0, 0 );
+ u_out1 = __msa_copy_u_w( dst1, 0 );
+ SW( u_out0, p_dst );
+ p_dst += i_dst_stride;
+ SW( u_out1, p_dst );
+}
+
+static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
+ int32_t i_src_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ uint32_t u_load0, u_load1, u_load2, u_load3;
+ v16u8 zero = { 0 };
+ v16u8 src0, src1, src2, src3;
+ v8u16 temp0, temp1, temp2, temp3;
+ v8u16 wgt, denom, offset;
+
+ i_offset_in <<= ( i_log2_denom );
+
+ if( i_log2_denom )
+ {
+ i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
+ }
+
+ wgt = ( v8u16 ) __msa_fill_h( i_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
+ p_src += 4 * i_src_stride;
+
+ src0 = ( v16u8 ) __msa_fill_w( u_load0 );
+ src1 = ( v16u8 ) __msa_fill_w( u_load1 );
+ src2 = ( v16u8 ) __msa_fill_w( u_load2 );
+ src3 = ( v16u8 ) __msa_fill_w( u_load3 );
+
+ ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
+ temp0, temp1, temp2, temp3 );
+ MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
+ temp0, temp1, temp2, temp3 );
+ ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+ temp0, temp1, temp2, temp3 );
+ MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
+ SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
+ SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
+ PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+}
+
+static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height, int32_t i_log2_denom,
+ int32_t i_weight, int32_t i_offset_in )
+{
+ if( 2 == i_height )
+ {
+ avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
+ i_log2_denom, i_weight, i_offset_in );
+ }
+ else
+ {
+ avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
+ p_dst, i_dst_stride,
+ i_height, i_log2_denom,
+ i_weight, i_offset_in );
+ }
+}
+
+static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height, int32_t i_log2_denom,
+ int32_t i_weight, int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ v16u8 zero = { 0 };
+ v16u8 src0, src1, src2, src3;
+ v8u16 temp0, temp1, temp2, temp3;
+ v8u16 wgt, denom, offset;
+ v16i8 out0, out1;
+
+ i_offset_in <<= ( i_log2_denom );
+
+ if( i_log2_denom )
+ {
+ i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
+ }
+
+ wgt = ( v8u16 ) __msa_fill_h( i_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
+ p_src += 4 * i_src_stride;
+
+ ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
+ temp0, temp1, temp2, temp3 );
+ MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
+ temp0, temp1, temp2, temp3 );
+ ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+ temp0, temp1, temp2, temp3 );
+ MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
+ SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
+ SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
+ PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
+ ST8x4_UB( out0, out1, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+}
+
+static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height, int32_t i_log2_denom,
+ int32_t i_weight, int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ v16i8 zero = { 0 };
+ v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
+ v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ v8u16 wgt, denom, offset;
+
+ i_offset_in <<= ( i_log2_denom );
+
+ if( i_log2_denom )
+ {
+ i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
+ }
+
+ wgt = ( v8u16 ) __msa_fill_h( i_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
+ p_src += 4 * i_src_stride;
+
+ ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
+ temp0, temp2, temp4, temp6 );
+ ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
+ temp1, temp3, temp5, temp7 );
+ MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
+ temp0, temp1, temp2, temp3 );
+ MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
+ temp4, temp5, temp6, temp7 );
+ ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
+ temp0, temp1, temp2, temp3 );
+ ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
+ temp4, temp5, temp6, temp7 );
+ MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
+ MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
+ SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
+ SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
+ SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
+ SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
+ PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
+ dst0, dst1, dst2, dst3 );
+
+ ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
+ p_dst += 4 * i_dst_stride;
+ }
+}
+
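+/* Bi-weighted prediction, "nw" variants: used when a weight falls outside
+ * [0,63] and therefore does not fit the unsigned-byte dot-product path.
+ * The blend is done in signed 16-bit arithmetic:
+ * dst = clip( ( p1 * w1 + p2 * w2 + round ) >> ( i_log2_denom + 1 ) );
+ * the i_offset_in argument is accepted but not used by these variants. */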
+static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint32_t u_load0, u_load1, u_out0, u_out1;
+ v8i16 src1_wgt, src2_wgt;
+ v16u8 in0, in1, in2, in3;
+ v8i16 temp0, temp1, temp2, temp3;
+ v16i8 zero = { 0 };
+ v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
+
+ src1_wgt = __msa_fill_h( i_src1_weight );
+ src2_wgt = __msa_fill_h( i_src2_weight );
+ u_load0 = LW( p_src1_in );
+ u_load1 = LW( p_src1_in + i_src1_stride );
+ in0 = ( v16u8 ) __msa_fill_w( u_load0 );
+ in1 = ( v16u8 ) __msa_fill_w( u_load1 );
+ u_load0 = LW( p_src2_in );
+ u_load1 = LW( p_src2_in + i_src2_stride );
+ in2 = ( v16u8 ) __msa_fill_w( u_load0 );
+ in3 = ( v16u8 ) __msa_fill_w( u_load1 );
+ ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
+ temp0, temp1, temp2, temp3 );
+ temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
+ temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
+ SRAR_H2_SH( temp0, temp1, denom );
+ CLIP_SH2_0_255( temp0, temp1 );
+ PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
+ u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
+ u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
+ SW( u_out0, p_dst );
+ p_dst += i_dst_stride;
+ SW( u_out1, p_dst );
+}
+
+static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ uint32_t u_load0, u_load1, u_load2, u_load3;
+ v8i16 src1_wgt, src2_wgt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ v16i8 zero = { 0 };
+ v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
+
+ src1_wgt = __msa_fill_h( i_src1_weight );
+ src2_wgt = __msa_fill_h( i_src2_weight );
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
+ p_src1_in += ( 4 * i_src1_stride );
+ src0 = ( v16u8 ) __msa_fill_w( u_load0 );
+ src1 = ( v16u8 ) __msa_fill_w( u_load1 );
+ src2 = ( v16u8 ) __msa_fill_w( u_load2 );
+ src3 = ( v16u8 ) __msa_fill_w( u_load3 );
+ LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
+ p_src2_in += ( 4 * i_src2_stride );
+ src4 = ( v16u8 ) __msa_fill_w( u_load0 );
+ src5 = ( v16u8 ) __msa_fill_w( u_load1 );
+ src6 = ( v16u8 ) __msa_fill_w( u_load2 );
+ src7 = ( v16u8 ) __msa_fill_w( u_load3 );
+ ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
+ temp0, temp1, temp2, temp3 );
+ ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
+ temp4, temp5, temp6, temp7 );
+ temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
+ temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
+ temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
+ temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
+ SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
+ CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
+ PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+}
+
+static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ if( 2 == i_height )
+ {
+ avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
+ p_src2_in, i_src2_stride,
+ p_dst, i_dst_stride,
+ i_log2_denom, i_src1_weight,
+ i_src2_weight, i_offset_in );
+ }
+ else
+ {
+ avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
+ p_src2_in, i_src2_stride,
+ p_dst, i_dst_stride,
+ i_height, i_log2_denom,
+ i_src1_weight, i_src2_weight,
+ i_offset_in );
+ }
+}
+
+static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ v8i16 src1_wgt, src2_wgt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8i16 temp0, temp1, temp2, temp3;
+ v8i16 res0, res1, res2, res3;
+ v16i8 zero = { 0 };
+ v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
+
+ src1_wgt = __msa_fill_h( i_src1_weight );
+ src2_wgt = __msa_fill_h( i_src2_weight );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
+ p_src1_in += ( 4 * i_src1_stride );
+ LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
+ p_src2_in += ( 4 * i_src2_stride );
+ ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
+ temp0, temp1, temp2, temp3 );
+ ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
+ res0, res1, res2, res3 );
+ res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
+ res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
+ res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
+ res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
+ SRAR_H4_SH( res0, res1, res2, res3, denom );
+ CLIP_SH4_0_255( res0, res1, res2, res3 );
+ PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
+ dst0, dst1, dst2, dst3 );
+ ST8x1_UB( dst0, p_dst );
+ p_dst += i_dst_stride;
+ ST8x1_UB( dst1, p_dst );
+ p_dst += i_dst_stride;
+ ST8x1_UB( dst2, p_dst );
+ p_dst += i_dst_stride;
+ ST8x1_UB( dst3, p_dst );
+ p_dst += i_dst_stride;
+ }
+}
+
+static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ v8i16 src1_wgt, src2_wgt;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1, dst2, dst3;
+ v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v16i8 zero = { 0 };
+ v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
+
+ src1_wgt = __msa_fill_h( i_src1_weight );
+ src2_wgt = __msa_fill_h( i_src2_weight );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
+ p_src1_in += ( 4 * i_src1_stride );
+ LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
+ p_src2_in += ( 4 * i_src2_stride );
+ ILVRL_B2_SH( zero, src0, temp1, temp0 );
+ ILVRL_B2_SH( zero, src1, temp3, temp2 );
+ ILVRL_B2_SH( zero, src2, temp5, temp4 );
+ ILVRL_B2_SH( zero, src3, temp7, temp6 );
+ ILVRL_B2_SH( zero, dst0, res1, res0 );
+ ILVRL_B2_SH( zero, dst1, res3, res2 );
+ ILVRL_B2_SH( zero, dst2, res5, res4 );
+ ILVRL_B2_SH( zero, dst3, res7, res6 );
+ res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
+ res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
+ res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
+ res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
+ res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
+ res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
+ res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
+ res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
+ SRAR_H4_SH( res0, res1, res2, res3, denom );
+ SRAR_H4_SH( res4, res5, res6, res7, denom );
+ CLIP_SH4_0_255( res0, res1, res2, res3 );
+ CLIP_SH4_0_255( res4, res5, res6, res7 );
+ PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
+ dst0, dst1, dst2, dst3 );
+ ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
+ p_dst += 4 * i_dst_stride;
+ }
+}
+
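+/* Bi-weighted prediction for weights within [0,63]: the two weights are
+ * packed into alternating bytes so a single unsigned dot-product computes
+ * p1 * w1 + p2 * w2 per pixel, with the rounding/offset term
+ * ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom pre-loaded into the
+ * accumulator, followed by a shift by ( i_log2_denom + 1 ) and a clamp
+ * to [0,255]. */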
+static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint32_t u_load0, u_load1, u_out0, u_out1;
+ v16u8 src1_wgt, src2_wgt, wgt;
+ v16i8 in0, in1, in2, in3;
+ v8u16 temp0, temp1, denom, offset;
+
+ i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
+
+ src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
+ src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
+
+ wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
+
+ u_load0 = LW( p_src1_in );
+ u_load1 = LW( p_src1_in + i_src1_stride );
+ in0 = ( v16i8 ) __msa_fill_w( u_load0 );
+ in1 = ( v16i8 ) __msa_fill_w( u_load1 );
+
+ u_load0 = LW( p_src2_in );
+ u_load1 = LW( p_src2_in + i_src2_stride );
+ in2 = ( v16i8 ) __msa_fill_w( u_load0 );
+ in3 = ( v16i8 ) __msa_fill_w( u_load1 );
+
+ ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );
+
+ temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
+ temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );
+ temp0 >>= denom;
+ temp1 >>= denom;
+ MAXI_SH2_UH( temp0, temp1, 0 );
+ SAT_UH2_UH( temp0, temp1, 7 );
+ PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );
+
+ u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
+ u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
+ SW( u_out0, p_dst );
+ p_dst += i_dst_stride;
+ SW( u_out1, p_dst );
+}
+
+static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ uint32_t u_load0, u_load1, u_load2, u_load3;
+ v16u8 src1_wgt, src2_wgt, wgt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 temp0, temp1, temp2, temp3;
+ v8u16 res0, res1, res2, res3;
+ v8u16 denom, offset;
+
+ i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
+
+ src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
+ src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
+
+ wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
+ p_src1_in += ( 4 * i_src1_stride );
+
+ src0 = ( v16u8 ) __msa_fill_w( u_load0 );
+ src1 = ( v16u8 ) __msa_fill_w( u_load1 );
+ src2 = ( v16u8 ) __msa_fill_w( u_load2 );
+ src3 = ( v16u8 ) __msa_fill_w( u_load3 );
+
+ LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
+ p_src2_in += ( 4 * i_src2_stride );
+
+ src4 = ( v16u8 ) __msa_fill_w( u_load0 );
+ src5 = ( v16u8 ) __msa_fill_w( u_load1 );
+ src6 = ( v16u8 ) __msa_fill_w( u_load2 );
+ src7 = ( v16u8 ) __msa_fill_w( u_load3 );
+
+ ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
+ temp0, temp1, temp2, temp3 );
+ DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
+ res0, res1, res2, res3 );
+ ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
+ res0, res1, res2, res3 );
+ SRA_4V( res0, res1, res2, res3, denom );
+ MAXI_SH4_UH( res0, res1, res2, res3, 0 );
+ SAT_UH4_UH( res0, res1, res2, res3, 7 );
+ PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+}
+
+static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ if( 2 == i_height )
+ {
+ avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
+ p_src2_in, i_src2_stride,
+ p_dst, i_dst_stride,
+ i_log2_denom, i_src1_weight,
+ i_src2_weight, i_offset_in );
+ }
+ else
+ {
+ avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
+ p_src2_in, i_src2_stride,
+ p_dst, i_dst_stride,
+ i_height, i_log2_denom,
+ i_src1_weight,
+ i_src2_weight, i_offset_in );
+ }
+}
+
+static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ v16u8 src1_wgt, src2_wgt, wgt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 temp0, temp1, temp2, temp3;
+ v8u16 res0, res1, res2, res3;
+ v8u16 denom, offset;
+ v16i8 out0, out1;
+
+ i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
+
+ src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
+ src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
+
+ wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
+ p_src1_in += ( 4 * i_src1_stride );
+
+ LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
+ p_src2_in += ( 4 * i_src2_stride );
+
+ ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
+ temp0, temp1, temp2, temp3 );
+ DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
+ res0, res1, res2, res3 );
+ ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
+ res0, res1, res2, res3 );
+ SRA_4V( res0, res1, res2, res3, denom );
+ MAXI_SH4_UH( res0, res1, res2, res3, 0 );
+ SAT_UH4_UH( res0, res1, res2, res3, 7 );
+ PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 );
+ ST8x4_UB( out0, out1, p_dst, i_dst_stride );
+ p_dst += 4 * i_dst_stride;
+ }
+}
+
+static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
+ int32_t i_src1_stride,
+ uint8_t *p_src2_in,
+ int32_t i_src2_stride,
+ uint8_t *p_dst,
+ int32_t i_dst_stride,
+ int32_t i_height,
+ int32_t i_log2_denom,
+ int32_t i_src1_weight,
+ int32_t i_src2_weight,
+ int32_t i_offset_in )
+{
+ uint8_t u_cnt;
+ v16u8 src1_wgt, src2_wgt, wgt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
+ v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
+ v8u16 denom, offset;
+
+ i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
+
+ src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
+ src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
+ offset = ( v8u16 ) __msa_fill_h( i_offset_in );
+ denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
+
+ wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
+
+ for( u_cnt = i_height / 4; u_cnt--; )
+ {
+ LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
+ p_src1_in += ( 4 * i_src1_stride );
+
+ LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
+ p_src2_in += ( 4 * i_src2_stride );
+
+ ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
+ temp0, temp2, temp4, temp6 );
+ ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
+ temp1, temp3, temp5, temp7 );
+ DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
+ res0, res1, res2, res3 );
+ ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
+ res0, res1, res2, res3 );
+ DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
+ res4, res5, res6, res7 );
+ ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
+ res4, res5, res6, res7 );
+ SRA_4V( res0, res1, res2, res3, denom );
+ SRA_4V( res4, res5, res6, res7, denom );
+ MAXI_SH4_UH( res0, res1, res2, res3, 0 );
+ MAXI_SH4_UH( res4, res5, res6, res7, 0 );
+ SAT_UH4_UH( res0, res1, res2, res3, 7 );
+ SAT_UH4_UH( res4, res5, res6, res7, 7 );
+ PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
+ temp0, temp1, temp2, temp3 );
+ ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
+ p_dst += 4 * i_dst_stride;
+ }
+}
+
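+/* Straight block copies backing x264_mc_copy_w4/w8/w16 below; the wider
+ * variants pick an unrolling factor based on whether the height is a
+ * multiple of 12, 8 or 4. */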
+static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height )
+{
+ int32_t i_cnt;
+ uint32_t u_src0, u_src1;
+
+ for( i_cnt = ( i_height / 2 ); i_cnt--; )
+ {
+ u_src0 = LW( p_src );
+ p_src += i_src_stride;
+ u_src1 = LW( p_src );
+ p_src += i_src_stride;
+
+ SW( u_src0, p_dst );
+ p_dst += i_dst_stride;
+ SW( u_src1, p_dst );
+ p_dst += i_dst_stride;
+ }
+}
+
+static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height )
+{
+ int32_t i_cnt;
+ uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if( 0 == i_height % 12 )
+ {
+ for( i_cnt = ( i_height / 12 ); i_cnt--; )
+ {
+ LD_UB8( p_src, i_src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7 );
+ p_src += ( 8 * i_src_stride );
+
+ u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
+ u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
+ u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
+ u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
+ u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
+ u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
+ u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
+ u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
+
+ SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+
+ LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
+ p_src += ( 4 * i_src_stride );
+
+ u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
+ u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
+ u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
+ u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
+
+ SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+ }
+ else if( 0 == i_height % 8 )
+ {
+ for( i_cnt = i_height >> 3; i_cnt--; )
+ {
+ LD_UB8( p_src, i_src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7 );
+ p_src += ( 8 * i_src_stride );
+
+ u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
+ u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
+ u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
+ u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
+ u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
+ u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
+ u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
+ u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
+
+ SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+ }
+ else if( 0 == i_height % 4 )
+ {
+ for( i_cnt = ( i_height / 4 ); i_cnt--; )
+ {
+ LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
+ p_src += ( 4 * i_src_stride );
+ u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
+ u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
+ u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
+ u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
+
+ SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+ }
+ else if( 0 == i_height % 2 )
+ {
+ for( i_cnt = ( i_height / 2 ); i_cnt--; )
+ {
+ LD_UB2( p_src, i_src_stride, src0, src1 );
+ p_src += ( 2 * i_src_stride );
+ u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
+ u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
+
+ SD( u_out0, p_dst );
+ p_dst += i_dst_stride;
+ SD( u_out1, p_dst );
+ p_dst += i_dst_stride;
+ }
+ }
+}
+
+static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height, int32_t i_width )
+{
+ int32_t i_cnt, i_loop_cnt;
+ uint8_t *p_src_tmp, *p_dst_tmp;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ for( i_cnt = ( i_width >> 4 ); i_cnt--; )
+ {
+ p_src_tmp = p_src;
+ p_dst_tmp = p_dst;
+
+ for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; )
+ {
+ LD_UB8( p_src_tmp, i_src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7 );
+ p_src_tmp += ( 8 * i_src_stride );
+
+ ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
+ p_dst_tmp, i_dst_stride );
+ p_dst_tmp += ( 8 * i_dst_stride );
+ }
+
+ p_src += 16;
+ p_dst += 16;
+ }
+}
+
+static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height )
+{
+ int32_t i_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+
+ if( 0 == i_height % 12 )
+ {
+ for( i_cnt = ( i_height / 12 ); i_cnt--; )
+ {
+ LD_UB8( p_src, i_src_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7 );
+ p_src += ( 8 * i_src_stride );
+ ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
+ p_dst, i_dst_stride );
+ p_dst += ( 8 * i_dst_stride );
+
+ LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
+ p_src += ( 4 * i_src_stride );
+ ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+ }
+ else if( 0 == i_height % 8 )
+ {
+ copy_16multx8mult_msa( p_src, i_src_stride,
+ p_dst, i_dst_stride, i_height, 16 );
+ }
+ else if( 0 == i_height % 4 )
+ {
+ for( i_cnt = ( i_height >> 2 ); i_cnt--; )
+ {
+ LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
+ p_src += ( 4 * i_src_stride );
+
+ ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+ }
+}
+
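+/* i_weight == 32 fast path: plain rounded average of the two references,
+ * dst = ( p1 + p2 + 1 ) >> 1, using the aver_u.b instruction. */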
+static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
+ uint8_t *p_src2, int32_t i_src2_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height )
+{
+ int32_t i_cnt;
+ uint32_t u_out0, u_out1;
+ v16u8 src0, src1, src2, src3;
+ v16u8 dst0, dst1;
+
+ for( i_cnt = ( i_height / 2 ); i_cnt--; )
+ {
+ LD_UB2( p_src1, i_src1_stride, src0, src1 );
+ p_src1 += ( 2 * i_src1_stride );
+ LD_UB2( p_src2, i_src2_stride, src2, src3 );
+ p_src2 += ( 2 * i_src2_stride );
+
+ AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 );
+
+ u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 );
+ u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 );
+ SW( u_out0, p_dst );
+ p_dst += i_dst_stride;
+ SW( u_out1, p_dst );
+ p_dst += i_dst_stride;
+ }
+}
+
+static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
+ uint8_t *p_src2, int32_t i_src2_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height )
+{
+ int32_t i_cnt;
+ uint64_t u_out0, u_out1, u_out2, u_out3;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3;
+
+ for( i_cnt = ( i_height / 4 ); i_cnt--; )
+ {
+ LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 );
+ p_src1 += ( 4 * i_src1_stride );
+ LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 );
+ p_src2 += ( 4 * i_src2_stride );
+
+ AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7,
+ dst0, dst1, dst2, dst3 );
+
+ u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 );
+ u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 );
+ u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 );
+ u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 );
+ SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
+ p_dst += ( 4 * i_dst_stride );
+ }
+}
+
+static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
+ uint8_t *p_src2, int32_t i_src2_stride,
+ uint8_t *p_dst, int32_t i_dst_stride,
+ int32_t i_height )
+{
+ int32_t i_cnt;
+ v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
+ v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+ for( i_cnt = ( i_height / 8 ); i_cnt--; )
+ {
+ LD_UB8( p_src1, i_src1_stride,
+ src0, src1, src2, src3, src4, src5, src6, src7 );
+ p_src1 += ( 8 * i_src1_stride );
+ LD_UB8( p_src2, i_src2_stride,
+ dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
+ p_src2 += ( 8 * i_src2_stride );
+
+ AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+ dst0, dst1, dst2, dst3 );
+ AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+ dst4, dst5, dst6, dst7 );
+
+ ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
+ p_dst, i_dst_stride );
+ p_dst += ( 8 * i_dst_stride );
+ }
+}
+
+void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ int32_t i_height )
+{
+ copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
+}
+
+void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
+ intptr_t i_src_stride, int32_t i_height )
+{
+ copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
+}
+
+void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
+ intptr_t i_src_stride, int32_t i_height )
+{
+ copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
+}
+
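+/* pixel_avg dispatchers: i_weight == 32 takes the plain-average path,
+ * weights outside [0,63] take the signed "nw" path, and everything else
+ * the dot-product bi-weight path, always with ( 64 - i_weight ) as the
+ * second weight. */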
+void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride,
+ 16, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride,
+ 16, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride,
+ 8, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride,
+ 8, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 4 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 4, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 4, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 16, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 8, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 4 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 4, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 4, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
+ uint8_t *p_pix2, intptr_t pix2_stride,
+ uint8_t *p_pix3, intptr_t pix3_stride,
+ int32_t i_weight )
+{
+ if( 32 == i_weight )
+ {
+ avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 2 );
+ }
+ else if( i_weight < 0 || i_weight > 63 )
+ {
+ avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+ else
+ {
+ avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
+ p_pix3, pix3_stride,
+ p_pix1, pix1_stride, 5, i_weight,
+ ( 64 - i_weight ), 0 );
+ }
+}
+
+void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height )
+{
+ int32_t i_log2_denom = pWeight->i_denom;
+ int32_t i_offset = pWeight->i_offset;
+ int32_t i_weight = pWeight->i_scale;
+
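+    /* Explicit weighted prediction; for i_denom > 0 the C reference is
+     * dst = clip( ( ( src * i_scale + ( 1 << ( i_denom - 1 ) ) ) >> i_denom )
+     *             + i_offset ), which the opscale helpers are assumed to follow. */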
+ avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
+ i_height, i_log2_denom, i_weight, i_offset );
+}
+
+void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height )
+{
+ int32_t i_log2_denom = pWeight->i_denom;
+ int32_t i_offset = pWeight->i_offset;
+ int32_t i_weight = pWeight->i_scale;
+
+ avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
+ i_height, i_log2_denom, i_weight, i_offset );
+}
+
+void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height )
+{
+ int32_t i_log2_denom = pWeight->i_denom;
+ int32_t i_offset = pWeight->i_offset;
+ int32_t i_weight = pWeight->i_scale;
+
+ avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
+ i_height, i_log2_denom, i_weight, i_offset );
+}
+
+void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ const x264_weight_t *pWeight, int32_t i_height )
+{
+ x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
+ pWeight, i_height );
+ x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
+ pWeight, i_height );
+}
+
+void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
+ uint8_t *p_src[4], intptr_t i_src_stride,
+ int32_t m_vx, int32_t m_vy,
+ int32_t i_width, int32_t i_height,
+ const x264_weight_t *pWeight )
+{
+ int32_t i_qpel_idx;
+ int32_t i_offset;
+ uint8_t *p_src1;
+
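+    /* The low two bits of each mv component pick one of the 16 quarter-pel
+     * positions; pu_hpel_ref0/pu_hpel_ref1 map that index to the source plane(s)
+     * (full-pel or half-pel) to read, and the remaining bits give the full-pel
+     * offset within the plane. */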
+ i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
+ i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
+ p_src1 = p_src[pu_hpel_ref0[i_qpel_idx]] + i_offset +
+ ( 3 == ( m_vy & 3 ) ) * i_src_stride;
+
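+    /* An odd horizontal or vertical quarter-pel offset needs an average of two
+     * planes; otherwise a single plane is copied or weighted directly. */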
+ if( i_qpel_idx & 5 )
+ {
+ uint8_t *p_src2 = p_src[pu_hpel_ref1[i_qpel_idx]] +
+                          i_offset + ( 3 == ( m_vx & 3 ) );
+
+ if( 16 == i_width )
+ {
+ avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
+ p_dst, i_dst_stride, i_height );
+ }
+ else if( 8 == i_width )
+ {
+ avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
+ p_dst, i_dst_stride, i_height );
+ }
+ else if( 4 == i_width )
+ {
+ avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
+ p_dst, i_dst_stride, i_height );
+ }
+
+ if( pWeight->weightfn )
+ {
+ if( 16 == i_width )
+ {
+ x264_mc_weight_w16_msa( p_dst, i_dst_stride,
+ p_dst, i_dst_stride,
+ pWeight, i_height );
+ }
+ else if( 8 == i_width )
+ {
+ x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
+ pWeight, i_height );
+ }
+ else if( 4 == i_width )
+ {
+ x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
+ pWeight, i_height );
+ }
+ }
+ }
+ else if( pWeight->weightfn )
+ {
+ if( 16 == i_width )
+ {
+ x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
+ pWeight, i_height );
+ }
+ else if( 8 == i_width )
+ {
+ x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
+ pWeight, i_height );
+ }
+ else if( 4 == i_width )
+ {
+ x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
+ pWeight, i_height );
+ }
+ }
+ else
+ {
+ if( 16 == i_width )
+ {
+ copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
+ i_height );
+ }
+ else if( 8 == i_width )
+ {
+ copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
+ i_height );
+ }
+ else if( 4 == i_width )
+ {
+ copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
+ i_height );
+ }
+ }
+}
+
+void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
+ intptr_t i_dst_stride,
+ uint8_t *p_src, intptr_t i_src_stride,
+ int32_t m_vx, int32_t m_vy,
+ int32_t i_width, int32_t i_height )
+{
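+    /* H.264 chroma MC is an eighth-pel bilinear filter on the interleaved (NV12)
+     * chroma plane; each output sample is presumably formed by the helpers as
+     * ( h1*v1*A + h0*v1*B + h1*v0*C + h0*v0*D + 32 ) >> 6 from the four
+     * neighbouring samples A..D. */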
+ int32_t i_d8x = m_vx & 0x07;
+ int32_t i_d8y = m_vy & 0x07;
+ int32_t i_coeff_horiz1 = ( 8 - i_d8x );
+ int32_t i_coeff_vert1 = ( 8 - i_d8y );
+ int32_t i_coeff_horiz0 = i_d8x;
+ int32_t i_coeff_vert0 = i_d8y;
+
+ p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;
+
+ if( 2 == i_width )
+ {
+ avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ i_coeff_horiz0, i_coeff_horiz1,
+ i_coeff_vert0, i_coeff_vert1,
+ i_height );
+ }
+ else if( 4 == i_width )
+ {
+ avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ i_coeff_horiz0, i_coeff_horiz1,
+ i_coeff_vert0, i_coeff_vert1,
+ i_height );
+ }
+ else if( 8 == i_width )
+ {
+ avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
+ p_dst_u, p_dst_v, i_dst_stride,
+ i_coeff_horiz0, i_coeff_horiz1,
+ i_coeff_vert0, i_coeff_vert1,
+ i_height );
+ }
+}
+
+void x264_mc_init_mips( int cpu, x264_mc_functions_t *pf )
+{
+ if( cpu & X264_CPU_MSA )
+ {
+ pf->mc_luma = x264_mc_luma_msa;
+ pf->mc_chroma = x264_mc_chroma_msa;
+
+ pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_msa;
+ pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa;
+ pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa;
+ pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa;
+ pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa;
+ pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa;
+ pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa;
+ pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa;
+ pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa;
+
+ pf->weight = x264_mc_weight_wtab_msa;
+ pf->offsetadd = x264_mc_weight_wtab_msa;
+ pf->offsetsub = x264_mc_weight_wtab_msa;
+
+ pf->copy_16x16_unaligned = x264_mc_copy_w16_msa;
+ pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa;
+ pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa;
+ pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa;
+
+ pf->memcpy_aligned = memcpy;
+ pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
+ }
+}
+#endif
diff --git a/common/mips/mc.h b/common/mips/mc.h
new file mode 100644
index 0000000..d4f098a
--- /dev/null
+++ b/common/mips/mc.h
@@ -0,0 +1,31 @@
+/*****************************************************************************
+ * mc.h: msa motion compensation
+ *****************************************************************************
+ * Copyright (C) 2015 x264 project
+ *
+ * Authors: Neha Rana <neha.rana at imgtec.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
+ *
+ * This program is also available under a commercial proprietary license.
+ * For more information, contact us at licensing at x264.com.
+ *****************************************************************************/
+
+#ifndef X264_MIPS_MC_H
+#define X264_MIPS_MC_H
+
+void x264_mc_init_mips( int cpu, x264_mc_functions_t *pf );
+
+#endif
--
2.3.7