diff --git a/common/ppc/deblock.c b/common/ppc/deblock.c
index f127cec..86816ca 100644
--- a/common/ppc/deblock.c
+++ b/common/ppc/deblock.c
@@ -72,28 +72,26 @@ static inline void write16x4(uint8_t *dst, int dst_stride,
     *(dst_int+15*int_dst_stride) = *(src_int + 15);
 }
-/** \brief performs a 6x16 transpose of data in src, and stores it to dst
-    \todo FIXME: see if we can't spare some vec_lvsl() by them factorizing
-    out of unaligned_load() */
+/** \brief performs a 6x16 transpose of data in src, and stores it to dst */
 #define readAndTranspose16x6(src, src_stride, r8, r9, r10, r11, r12, r13) {\
     register vec_u8_t r0, r1, r2, r3, r4, r5, r6, r7, r14, r15;\
-    VEC_LOAD(src,                  r0, 16, vec_u8_t); \
-    VEC_LOAD(src +    src_stride,  r1, 16, vec_u8_t); \
-    VEC_LOAD(src +  2*src_stride,  r2, 16, vec_u8_t); \
-    VEC_LOAD(src +  3*src_stride,  r3, 16, vec_u8_t); \
-    VEC_LOAD(src +  4*src_stride,  r4, 16, vec_u8_t); \
-    VEC_LOAD(src +  5*src_stride,  r5, 16, vec_u8_t); \
-    VEC_LOAD(src +  6*src_stride,  r6, 16, vec_u8_t); \
-    VEC_LOAD(src +  7*src_stride,  r7, 16, vec_u8_t); \
-    VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t); \
-    VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t); \
+    VEC_LOAD(src,                  r0, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +    src_stride,  r1, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  2*src_stride,  r2, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  3*src_stride,  r3, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  4*src_stride,  r4, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  5*src_stride,  r5, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  6*src_stride,  r6, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  7*src_stride,  r7, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src + 14*src_stride, r14, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src + 15*src_stride, r15, 16, vec_u8_t, pix ); \
     \
-    VEC_LOAD(src +  8*src_stride,  r8, 16, vec_u8_t); \
-    VEC_LOAD(src +  9*src_stride,  r9, 16, vec_u8_t); \
-    VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t); \
-    VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t); \
-    VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t); \
-    VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t); \
+    VEC_LOAD(src +  8*src_stride,  r8, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src +  9*src_stride,  r9, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src + 10*src_stride, r10, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src + 11*src_stride, r11, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src + 12*src_stride, r12, 16, vec_u8_t, pix ); \
+    VEC_LOAD(src + 13*src_stride, r13, 16, vec_u8_t, pix ); \
     \
     /*Merge first pairs*/ \
     r0 = vec_mergeh(r0, r8);    /*0, 8*/ \
@@ -294,6 +292,7 @@ void x264_deblock_h_luma_altivec(uint8_t *pix, int stride, int alpha, int beta,
     if((tc0[0] & tc0[1] & tc0[2] & tc0[3]) < 0)
         return;
     PREP_LOAD;
+    vec_u8_t _pix_ = vec_lvsl(0, pix-3);
     readAndTranspose16x6(pix-3, stride, line0, line1, line2, line3, line4, line5);
     h264_loop_filter_luma_altivec(line0, line1, line2, line3, line4, line5, alpha, beta, tc0);
     transpose4x16(line1, line2, line3, line4);
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index b9a7e4e..7f3509d 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -40,6 +40,11 @@
 typedef void (*pf_mc_t)( uint8_t *src, int i_src,
                          uint8_t *dst, int i_dst,
                          int i_height );
+
+static const int hpel_ref0[16] = {0,1,1,1,0,1,1,1,2,3,3,3,0,1,1,1};
+static const int hpel_ref1[16] = {0,0,0,0,2,2,3,2,2,2,3,2,2,2,3,2};
+
+
 static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
 {
     return pix[-2*i_pix_next] - 5*pix[-1*i_pix_next] + 20*(pix[0] +
@@ -52,11 +57,10 @@ static inline int x264_tapfilter1( uint8_t *pix )
            pix[ 3];
 }
-/* pixel_avg */
-static inline void pixel_avg_w4( uint8_t *dst, int i_dst,
+
+static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, int i_dst,
                                 uint8_t *src1, int i_src1,
-                                uint8_t *src2, int i_src2,
-                                int i_height )
+                                uint8_t *src2, int i_height )
 {
     int x, y;
     for( y = 0; y < i_height; y++ )
     {
@@ -67,57 +71,70 @@ static inline void pixel_avg_w4( uint8_t *dst, int i_dst,
         }
         dst  += i_dst;
         src1 += i_src1;
-        src2 += i_src2;
+        src2 += i_src1;
     }
 }
-static inline void pixel_avg_w8( uint8_t *dst, int i_dst,
+
+static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, int i_dst,
                                 uint8_t *src1, int i_src1,
-                                uint8_t *src2, int i_src2,
-                                int i_height )
+                                uint8_t *src2, int i_height )
 {
     int y;
     vec_u8_t src1v, src2v;
-    LOAD_ZERO;
     PREP_LOAD;
     PREP_STORE8;
+    PREP_LOAD_SRC( src1 );
+    PREP_LOAD_SRC( src2 );
+
    for( y = 0; y < i_height; y++ )
     {
-        VEC_LOAD( src1, src1v, 8, vec_u8_t );
-        VEC_LOAD( src2, src2v, 8, vec_u8_t );
+        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
+        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
         src1v = vec_avg( src1v, src2v );
         VEC_STORE8( src1v, dst );
         dst  += i_dst;
         src1 += i_src1;
-        src2 += i_src2;
+        src2 += i_src1;
     }
 }
-static inline void pixel_avg_w16( uint8_t *dst, int i_dst,
+
+static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, int i_dst,
                                  uint8_t *src1, int i_src1,
-                                 uint8_t *src2, int i_src2,
-                                 int i_height )
+                                 uint8_t *src2, int i_height )
 {
     int y;
     vec_u8_t src1v, src2v;
     PREP_LOAD;
-    PREP_STORE16;
+    PREP_LOAD_SRC( src1 );
+    PREP_LOAD_SRC( src2 );
+
    for( y = 0; y < i_height; y++ )
     {
-        VEC_LOAD( src1, src1v, 16, vec_u8_t );
-        VEC_LOAD( src2, src2v, 16, vec_u8_t );
+        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
+        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
         src1v = vec_avg( src1v, src2v );
-        VEC_STORE16( src1v, dst );
+        vec_st(src1v, 0, dst);
         dst  += i_dst;
         src1 += i_src1;
-        src2 += i_src2;
+        src2 += i_src1;
     }
 }
+static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, int i_dst,
+                                 uint8_t *src1, int i_src1,
+                                 uint8_t *src2, int i_height )
+{
+    x264_pixel_avg2_w16_altivec(dst, i_dst, src1, i_src1, src2, i_height);
+    x264_pixel_avg2_w4_altivec(dst+16, i_dst, src1+16, i_src1, src2+16, i_height);
+}
+
 /* mc_copy: plain c */
+
 #define MC_COPY( name, a ) \
-static void name( uint8_t *src, int i_src, \
-                  uint8_t *dst, int i_dst, int i_height ) \
+static void name( uint8_t *dst, int i_dst, \
+                  uint8_t *src, int i_src, int i_height ) \
 { \
     int y; \
     for( y = 0; y < i_height; y++ ) \
@@ -127,118 +144,99 @@ static void name( uint8_t *src, int i_src, \
         dst += i_dst; \
     } \
 }
-MC_COPY( mc_copy_w4, 4 )
-MC_COPY( mc_copy_w8, 8 )
-MC_COPY( mc_copy_w16, 16 )
+MC_COPY( x264_mc_copy_w4_altivec, 4 )
+MC_COPY( x264_mc_copy_w8_altivec, 8 )
-void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
+static void x264_mc_copy_w16_altivec( uint8_t *dst, int i_dst,
+                                      uint8_t *src, int i_src, int i_height )
+{
+    int y;
+    vec_u8_t cpyV;
+    PREP_LOAD;
+    PREP_LOAD_SRC( src );
+
+    for( y = 0; y < i_height; y++)
+    {
+        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
+        vec_st(cpyV, 0, dst);
+
+        src += i_src;
+        dst += i_dst;
+    }
+}
+
+
+static void mc_luma_altivec( uint8_t *dst, int i_dst_stride,
                       uint8_t *src[4], int i_src_stride,
                       int mvx, int mvy,
                       int i_width, int i_height )
 {
-    uint8_t *src1, *src2;
-
-    /* todo : fixme... */
-    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
-
-    int hpel1x = mvx>>1;
-    int hpel1y = (mvy+1-correction)>>1;
-    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
-
-
-    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
-
-    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
     {
-        int hpel2x = (mvx+1)>>1;
-        int hpel2y = (mvy+correction)>>1;
-        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
-
-        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
         switch(i_width) {
         case 4:
-            pixel_avg_w4( dst, i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
             break;
         case 8:
-            pixel_avg_w8( dst, i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
            break;
         case 16:
        default:
-            pixel_avg_w16( dst, i_dst_stride, src1, i_src_stride,
-                           src2, i_src_stride, i_height );
+            x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
         }
-
+    }
     else
     {
         switch(i_width) {
         case 4:
-            mc_copy_w4( src1, i_src_stride, dst, i_dst_stride, i_height );
+            x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
            break;
         case 8:
-            mc_copy_w8( src1, i_src_stride, dst, i_dst_stride, i_height );
+            x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
            break;
         case 16:
-            mc_copy_w16( src1, i_src_stride, dst, i_dst_stride, i_height );
+            x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
            break;
         }
-
     }
 }
-uint8_t *get_ref_altivec( uint8_t *dst, int * i_dst_stride,
+
+
+static uint8_t *get_ref_altivec( uint8_t *dst, int *i_dst_stride,
                           uint8_t *src[4], int i_src_stride,
                           int mvx, int mvy,
                           int i_width, int i_height )
 {
-    uint8_t *src1, *src2;
-
-    /* todo : fixme... */
-    int correction = (((mvx&3) == 3 && (mvy&3) == 1) || ((mvx&3) == 1 && (mvy&3) == 3)) ? 1:0;
-
-    int hpel1x = mvx>>1;
-    int hpel1y = (mvy+1-correction)>>1;
-    int filter1 = (hpel1x & 1) + ( (hpel1y & 1) << 1 );
-
-
-    src1 = src[filter1] + (hpel1y >> 1) * i_src_stride + (hpel1x >> 1);
-
-    if ( (mvx|mvy) & 1 ) /* qpel interpolation needed */
+    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
+    int offset = (mvy>>2)*i_src_stride + (mvx>>2);
+    uint8_t *src1 = src[hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
+    if( qpel_idx & 5 ) /* qpel interpolation needed */
     {
-        int hpel2x = (mvx+1)>>1;
-        int hpel2y = (mvy+correction)>>1;
-        int filter2 = (hpel2x & 1) + ( (hpel2y & 1) <<1 );
-
-        src2 = src[filter2] + (hpel2y >> 1) * i_src_stride + (hpel2x >> 1);
-
+        uint8_t *src2 = src[hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
         switch(i_width) {
         case 4:
-            pixel_avg_w4( dst, *i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
            break;
         case 8:
-            pixel_avg_w8( dst, *i_dst_stride, src1, i_src_stride,
-                          src2, i_src_stride, i_height );
+            x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
            break;
        case 12:
         case 16:
        default:
-            pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
-                           src2, i_src_stride, i_height );
+            x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
            break;
         case 20:
-            //FIXME suboptimal
-            pixel_avg_w16( dst, *i_dst_stride, src1, i_src_stride,
-                           src2, i_src_stride, i_height );
-            pixel_avg_w4( dst+16, *i_dst_stride, src1+16, i_src_stride,
-                          src2+16, i_src_stride, i_height );
+            x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
            break;
         }
         return dst;
-
     }
     else
     {
@@ -273,6 +271,8 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
     LOAD_ZERO;
     PREP_LOAD;
+    PREP_LOAD_SRC( src );
+    PREP_LOAD_SRC( srcp );
     PREP_STORE4;
     vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
     vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
@@ -292,14 +292,14 @@ static void mc_chroma_altivec_4xh( uint8_t *dst, int i_dst_stride,
     permv  = vec_lvsl( 0, (uint8_t *) 1 );
     shiftv = vec_splat_u16( 6 );
-    VEC_LOAD( src, src2v_8, 5, vec_u8_t );
+    VEC_LOAD( src, src2v_8, 5, vec_u8_t, src );
     src3v_8 = vec_perm( src2v_8, src2v_8, permv );
     for( y = 0; y < i_height; y++ )
     {
         src0v_8 = src2v_8;
         src1v_8 = src3v_8;
-        VEC_LOAD( srcp, src2v_8, 5, vec_u8_t );
+        VEC_LOAD( srcp, src2v_8, 5, vec_u8_t, srcp );
         src3v_8 = vec_perm( src2v_8, src2v_8, permv );
         dstv_16 = k32v;
@@ -339,6 +339,8 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
     LOAD_ZERO;
     PREP_LOAD;
+    PREP_LOAD_SRC( src );
+    PREP_LOAD_SRC( srcp );
     PREP_STORE8;
     vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
     vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
@@ -358,14 +360,14 @@ static void mc_chroma_altivec_8xh( uint8_t *dst, int i_dst_stride,
     permv  = vec_lvsl( 0, (uint8_t *) 1 );
     shiftv = vec_splat_u16( 6 );
-    VEC_LOAD( src, src2v_8, 9, vec_u8_t );
+    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src);
     src3v_8 = vec_perm( src2v_8, src2v_8, permv );
     for( y = 0; y < i_height; y++ )
     {
         src0v_8 = src2v_8;
         src1v_8 = src3v_8;
-        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t );
+        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, srcp );
         src3v_8 = vec_perm( src2v_8, src2v_8, permv );
         dstv_16 = k32v;
@@ -431,8 +433,8 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
 #define HPEL_FILTER_HORIZONTAL() \
 { \
-    VEC_LOAD( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+14+i_stride*y], src6v, 16, vec_u8_t ); \
+    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t); \
+    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t); \
     \
     src2v = vec_sld( src1v, src6v, 1 ); \
     src3v = vec_sld( src1v, src6v, 2 ); \
@@ -468,17 +470,17 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
     \
     destv = vec_packsu( dest1v, dest2v ); \
     \
-    VEC_STORE16( destv, &dsth[x+i_stride*y] ); \
+    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth ); \
 }
 #define HPEL_FILTER_VERTICAL() \
 { \
-    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t ); \
-    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t ); \
+    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
+    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
     \
     temp1v = vec_u8_to_s16_h( src1v ); \
     temp2v = vec_u8_to_s16_h( src2v ); \
@@ -508,7 +510,7 @@ static void mc_chroma_altivec( uint8_t *dst, int i_dst_stride,
     \
     destv = vec_packsu( dest1v, dest2v ); \
     \
-    VEC_STORE16( destv, &dstv[x+i_stride*y] ); \
+    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth ); \
 }
 #define HPEL_FILTER_CENTRAL() \
@@ -541,7 +543,7 @@
     \
     destv = vec_packsu( dest1v, dest2v ); \
     \
-    VEC_STORE16( destv, &dstc[x-16+i_stride*y] ); \
+    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
 }
 void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
@@ -556,7 +558,9 @@ void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint
     vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;
     PREP_LOAD;
+    PREP_LOAD_SRC( src);
     PREP_STORE16;
+    PREP_STORE16_DST( dsth );
     LOAD_ZERO;
     vec_u16_t twov, fourv, fivev, sixv;
@@ -612,12 +616,12 @@
         }
         /* Partial vertical filter */
-        VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t );
-        VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
+        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );
         temp1v = vec_u8_to_s16_h( src1v );
         temp2v = vec_u8_to_s16_h( src2v );
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index adf728f..6a7218c 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -45,8 +45,8 @@ static int name( uint8_t *pix1, int i_pix1, \
     vec_s32_t sumv = zero_s32v; \
     for( y = 0; y < ly; y++ ) \
     { \
-        VEC_LOAD( pix1, pix1v, lx, vec_u8_t ); \
-        VEC_LOAD( pix2, pix2v, lx, vec_u8_t ); \
+        VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t ); \
+        VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t ); \
         sumv = (vec_s32_t) vec_sum4s( \
                    vec_sub( vec_max( pix1v, pix2v ), \
                             vec_min( pix1v, pix2v ) ), \
@@ -123,14 +123,20 @@ static int pixel_satd_4x4_altivec( uint8_t *pix1, int i_pix1,
     DECLARE_ALIGNED_16( int i_satd );
     PREP_DIFF;
+    PREP_LOAD_SRC( pix1 );
     vec_s16_t diff0v, diff1v, diff2v, diff3v;
     vec_s16_t temp0v, temp1v, temp2v, temp3v;
     vec_s32_t satdv;
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
     /* Hadamar H */
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
@@ -167,10 +173,14 @@ static int pixel_satd_4x8_altivec( uint8_t *pix1, int i_pix1,
     vec_s16_t temp0v, temp1v, temp2v, temp3v;
     vec_s32_t satdv;
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
     VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
@@ -182,10 +192,10 @@
     VEC_ADD_ABS( temp2v, satdv, satdv );
     VEC_ADD_ABS( temp3v, satdv, satdv );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
     VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
@@ -219,10 +229,16 @@ static int pixel_satd_8x4_altivec( uint8_t *pix1, int i_pix1,
                temp4v, temp5v, temp6v, temp7v;
     vec_s32_t satdv;
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
@@ -268,14 +284,19 @@ static int pixel_satd_8x8_altivec( uint8_t *pix1, int i_pix1,
                temp4v, temp5v, temp6v, temp7v;
     vec_s32_t satdv;
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
@@ -323,14 +344,18 @@ static int pixel_satd_8x16_altivec( uint8_t *pix1, int i_pix1,
                temp4v, temp5v, temp6v, temp7v;
     vec_s32_t satdv;
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    PREP_LOAD_SRC( pix1 );
+    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
+    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
+
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v , offset1v);
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
     VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
@@ -352,14 +377,14 @@
     VEC_ADD_ABS( temp6v, satdv, satdv );
     VEC_ADD_ABS( temp7v, satdv, satdv );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
     VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                  temp0v, temp1v, temp2v, temp3v );
     VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
@@ -398,6 +423,7 @@ static int pixel_satd_16x8_altivec( uint8_t *pix1, int i_pix1,
     LOAD_ZERO;
     PREP_LOAD;
+    PREP_LOAD_SRC( pix2 );
     vec_s32_t satdv;
     vec_s16_t pix1v, pix2v;
     vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
@@ -489,6 +515,8 @@ static int pixel_satd_16x16_altivec( uint8_t *pix1, int i_pix1,
                diffl4v, diffl5v, diffl6v, diffl7v;
     vec_s16_t temp0v, temp1v, temp2v, temp3v,
               temp4v, temp5v, temp6v, temp7v;
+    PREP_LOAD_SRC( pix2 );
+
     VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
     VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
@@ -1715,18 +1743,20 @@ static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, int i_pix1, uint8_t *pix2
     int32_t i_satd=0;
     PREP_DIFF;
+    PREP_LOAD_SRC( pix1 );
+    PREP_LOAD_SRC( pix2 );
     vec_s16_t diff0v, diff1v, diff2v, diff3v,
               diff4v, diff5v, diff6v, diff7v;
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v );
-    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
+    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );
     vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v,
               sa8d4v, sa8d5v, sa8d6v, sa8d7v;
@@ -1806,14 +1836,16 @@ static void ssim_4x4x2_core_altivec( const uint8_t *pix1, int stride1,
     vec_u8_t pix1v, pix2v;
     vec_u32_t s1v, s2v, ssv, s12v;
     PREP_LOAD;
+    PREP_LOAD_SRC (pix1);
+    PREP_LOAD_SRC (pix2);
     LOAD_ZERO;
     s1v = s2v = ssv = s12v = zero_u32v;
     for(y=0; y<4; y++)
     {
-        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t );
-        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t );
+        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
+        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );
         s1v = vec_sum4s( pix1v, s1v );
         s2v = vec_sum4s( pix2v, s2v );
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index 2756e38..9ad97c6 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -80,16 +80,36 @@ typedef union {
 /***********************************************************************
  * PREP_LOAD: declares two vectors required to perform unaligned loads
- * VEC_LOAD: loads n bytes from u8 * p into vector v of type t
+ * VEC_LOAD: loads n bytes from u8 * p into vector v of type t, where g is the offset vector of the original src
+ * VEC_LOAD_G: loads n bytes from u8 * p into vector v of type t - use when the offset is not known
+ * VEC_LOAD_OFFSET: as above, but with the offset vector known in advance
 **********************************************************************/
 #define PREP_LOAD \
     vec_u8_t _hv, _lv
-#define VEC_LOAD( p, v, n, t ) \
+#define PREP_LOAD_SRC( src ) \
+    vec_u8_t _##src##_ = vec_lvsl(0, src)
+
+#define VEC_LOAD_G( p, v, n, t ) \
     _hv = vec_ld( 0, p ); \
     v = (t) vec_lvsl( 0, p ); \
     _lv = vec_ld( n - 1, p ); \
-    v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+    v = (t) vec_perm( _hv, _lv, (vec_u8_t) v )
+
+#define VEC_LOAD( p, v, n, t, g ) \
+    _hv = vec_ld( 0, p ); \
+    _lv = vec_ld( n - 1, p ); \
+    v = (t) vec_perm( _hv, _lv, (vec_u8_t) _##g##_ )
+
+#define VEC_LOAD_OFFSET( p, v, n, t, o ) \
+    _hv = vec_ld( 0, p); \
+    _lv = vec_ld( n - 1, p ); \
+    v = (t) vec_perm( _hv, _lv, (vec_u8_t) o )
+
+#define VEC_LOAD_PARTIAL( p, v, n, t, g) \
+    _hv = vec_ld( 0, p); \
+    v = (t) vec_perm( _hv, _hv, (vec_u8_t) _##g##_ )
+
 /***********************************************************************
  * PREP_STORE##n: declares required vectors to store n bytes to a
@@ -97,59 +117,35 @@ typedef union {
  * VEC_STORE##n: stores n bytes from vector v to address p
 **********************************************************************/
 #define PREP_STORE16 \
-    vec_u8_t _tmp1v, _tmp2v \
+    vec_u8_t _tmp1v\
-#define VEC_STORE16( v, p ) \
+#define PREP_STORE16_DST( dst ) \
+    vec_u8_t _##dst##l_ = vec_lvsl(0, dst); \
+    vec_u8_t _##dst##r_ = vec_lvsr(0, dst);
+
+#define VEC_STORE16( v, p, o ) \
     _hv = vec_ld( 0, p ); \
-    _tmp2v = vec_lvsl( 0, p ); \
     _lv = vec_ld( 15, p ); \
-    _tmp1v = vec_perm( _lv, _hv, _tmp2v ); \
-    _tmp2v = vec_lvsr( 0, p ); \
-    _lv = vec_perm( (vec_u8_t) v, _tmp1v, _tmp2v ); \
+    _tmp1v = vec_perm( _lv, _hv, _##o##l_ ); \
+    _lv = vec_perm( (vec_u8_t) v, _tmp1v, _##o##r_ ); \
     vec_st( _lv, 15, (uint8_t *) p ); \
-    _hv = vec_perm( _tmp1v, (vec_u8_t) v, _tmp2v ); \
-    vec_st( _hv, 0, (uint8_t *) p )
+    _hv = vec_perm( _tmp1v, (vec_u8_t) v, _##o##r_ ); \
+    vec_st( _hv, 0, (uint8_t *) p )
+
 #define PREP_STORE8 \
-    PREP_STORE16; \
-    vec_u8_t _tmp3v, _tmp4v; \
-    const vec_u8_t sel_h = \
-        (vec_u8_t) CV(-1,-1,-1,-1,-1,-1,-1,-1,0,0,0,0,0,0,0,0)
-
-#define PREP_STORE8_HL \
-    PREP_STORE8; \
-    const vec_u8_t sel_l = \
-        (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1)
-
-#define VEC_STORE8 VEC_STORE8_H
-
-#define VEC_STORE8_H( v, p ) \
-    _tmp3v = vec_lvsr( 0, (uint8_t *) p ); \
-    _tmp4v = vec_perm( (vec_u8_t) v, (vec_u8_t) v, _tmp3v ); \
-    _lv = vec_ld( 7, (uint8_t *) p ); \
-    _tmp1v = vec_perm( sel_h, zero_u8v, _tmp3v ); \
-    _lv = vec_sel( _lv, _tmp4v, _tmp1v ); \
-    vec_st( _lv, 7, (uint8_t *) p ); \
-    _hv = vec_ld( 0, (uint8_t *) p ); \
-    _tmp2v = vec_perm( zero_u8v, sel_h, _tmp3v ); \
-    _hv = vec_sel( _hv, _tmp4v, _tmp2v ); \
-    vec_st( _hv, 0, (uint8_t *) p )
-
-#define VEC_STORE8_L( v, p ) \
-    _tmp3v = vec_lvsr( 8, (uint8_t *) p ); \
-    _tmp4v = vec_perm( (vec_u8_t) v, (vec_u8_t) v, _tmp3v ); \
-    _lv = vec_ld( 7, (uint8_t *) p ); \
-    _tmp1v = vec_perm( sel_l, zero_u8v, _tmp3v ); \
-    _lv = vec_sel( _lv, _tmp4v, _tmp1v ); \
-    vec_st( _lv, 7, (uint8_t *) p ); \
-    _hv = vec_ld( 0, (uint8_t *) p ); \
-    _tmp2v = vec_perm( zero_u8v, sel_l, _tmp3v ); \
-    _hv = vec_sel( _hv, _tmp4v, _tmp2v ); \
-    vec_st( _hv, 0, (uint8_t *) p )
+    vec_u8_t _tmp3v \
+
+#define VEC_STORE8( v, p ) \
+    _tmp3v = vec_lvsl(0, p); \
+    v = vec_perm(v, v, _tmp3v); \
+    vec_ste((vec_u32_t)v,0,(uint32_t*)p); \
+    vec_ste((vec_u32_t)v,4,(uint32_t*)p)
+
 #define PREP_STORE4 \
     PREP_STORE16; \
-    vec_u8_t _tmp3v; \
+    vec_u8_t _tmp2v, _tmp3v; \
     const vec_u8_t sel = \
         (vec_u8_t) CV(-1,-1,-1,-1,0,0,0,0,0,0,0,0,0,0,0,0)
@@ -226,17 +222,18 @@ typedef union {
  * d: s16v
  *
  * Loads n bytes from p1 and p2, do the diff of the high elements into
- * d, increments p1 and p2 by i1 and i2
+ * d, increments p1 and p2 by i1 and i2, using the known offset vector g for p2
 **********************************************************************/
 #define PREP_DIFF \
     LOAD_ZERO; \
     PREP_LOAD; \
     vec_s16_t pix1v, pix2v;
-#define VEC_DIFF_H(p1,i1,p2,i2,n,d) \
-    VEC_LOAD( p1, pix1v, n, vec_s16_t ); \
+
+#define VEC_DIFF_H(p1,i1,p2,i2,n,d,g) \
+    VEC_LOAD_PARTIAL( p1, pix1v, n, vec_s16_t, p1); \
     pix1v = vec_u8_to_s16( pix1v ); \
-    VEC_LOAD( p2, pix2v, n, vec_s16_t ); \
+    VEC_LOAD( p2, pix2v, n, vec_s16_t, g); \
     pix2v = vec_u8_to_s16( pix2v ); \
     d = vec_sub( pix1v, pix2v ); \
     p1 += i1; \
@@ -254,10 +251,10 @@ typedef union {
  * and i2
 **********************************************************************/
 #define VEC_DIFF_HL(p1,i1,p2,i2,dh,dl) \
-    VEC_LOAD( p1, pix1v, 16, vec_s16_t ); \
+    pix1v = vec_ld(0, p1); \
     temp0v = vec_u8_to_s16_h( pix1v ); \
     temp1v = vec_u8_to_s16_l( pix1v ); \
-    VEC_LOAD( p2, pix2v, 16, vec_s16_t ); \
+    VEC_LOAD( p2, pix2v, 16, vec_s16_t, p2); \
     temp2v = vec_u8_to_s16_h( pix2v ); \
     temp3v = vec_u8_to_s16_l( pix2v ); \
     dh = vec_sub( temp0v, temp2v ); \