[x264-devel] 8x8 and 16x16 Altivec implementation of variance
Guillaume POIRIER
gpoirier at mplayerhq.hu
Thu Jan 22 22:13:42 CET 2009
Hello folks,
The attached patch adds 8x8 and 16x16 Altivec implementations of
variance computation.
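
For reference, the value returned is the one the last line of each routine
computes: sum of squares minus the squared sum scaled by the block area
(">> 6" for 8x8, ">> 8" for 16x16). In scalar form it is roughly the
following (an illustrative sketch, not the actual C reference):

#include <stdint.h>

static int var_16x16_scalar( uint8_t *pix, int i_stride )
{
    uint32_t sum = 0, sqr = 0;
    int x, y;
    for( y = 0; y < 16; y++ )
    {
        for( x = 0; x < 16; x++ )
        {
            sum += pix[x];
            sqr += pix[x] * pix[x];
        }
        pix += i_stride;
    }
    return sqr - (sum * sum >> 8); /* sum(x^2) - sum(x)^2/256 */
}
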
Here are the bench figures on a PPC7450:
var_8x8:   C:  71, Altivec: 22, speed-up: 3.2
var_16x16: C: 254, Altivec: 59, speed-up: 4.3
The 8x8 version doesn't get as big a speed-up because the data is only
8-byte aligned, not 16-byte aligned, so it has to be permuted before it
can be used.
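
The permute is the usual Altivec realignment dance: vec_ld ignores the
low 4 bits of the address, so each 8-byte row is fetched with an aligned
load and rotated into place via vec_lvsl/vec_perm, and two rows are then
merged into one 16-byte vector with vec_mergeh. Roughly (a sketch with
made-up names; the patch hoists the permute vectors out of the loop):

#include <altivec.h>
#include <stdint.h>

/* Load one 8-byte row whose address is 8-byte (but not necessarily
 * 16-byte) aligned. The row cannot straddle a 16-byte boundary, so a
 * single aligned load plus a rotate is enough; the wanted bytes end up
 * in elements 0..7. */
static inline vector unsigned char load_row8( uint8_t *p )
{
    vector unsigned char perm = vec_lvsl( 0, p ); /* rotate amount: 0 or 8 */
    vector unsigned char v    = vec_ld( 0, p );   /* aligned 16-byte load */
    return vec_perm( v, v, perm );
}
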
I believe I can improve things with some unrolling, and by avoiding the
horizontal add done with vec_sum4s (though it's quite cool, since it
saves 3 unpacks and an add).
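
For comparison, here is the accumulation step with vec_sum4s next to a
hand-widened version (the latter is only an illustration of what the
intrinsic saves, not code from the patch); the partial sums land in
different lanes, but the final horizontal reduction gives the same total:

#include <altivec.h>

/* vec_sum4s: each group of 4 bytes is summed into one 32-bit lane and
 * accumulated, all in a single instruction. */
static inline vector unsigned int acc_sum4s( vector unsigned char pix_v,
                                             vector unsigned int  sum_v )
{
    return vec_sum4s( pix_v, sum_v );
}

/* Without it: widen u8 -> u16 -> u32 by merging against zero, adding as
 * we go. Same total, just more merges and adds. */
static inline vector unsigned int acc_widened( vector unsigned char pix_v,
                                               vector unsigned int  sum_v )
{
    vector unsigned char  zero8  = vec_splat_u8( 0 );
    vector unsigned short zero16 = vec_splat_u16( 0 );
    vector unsigned short hi = (vector unsigned short)vec_mergeh( zero8, pix_v );
    vector unsigned short lo = (vector unsigned short)vec_mergel( zero8, pix_v );
    vector unsigned short s  = vec_add( hi, lo );  /* 8 partial 16-bit sums */
    sum_v = vec_add( sum_v, (vector unsigned int)vec_mergeh( zero16, s ) );
    sum_v = vec_add( sum_v, (vector unsigned int)vec_mergel( zero16, s ) );
    return sum_v;
}
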
I'm essentially submitting it for ideas, and also for archiving
purposes, since my hard drive is one of those infamous Seagate
ST3750330AS drives with the flaky firmware ;-).
Guillaume
--
Only a very small fraction of our DNA does anything; the rest is all
comments and ifdefs.
Katharine Hepburn - "Death will be a great relief. No more interviews."
-------------- next part --------------
diff --git a/common/ppc/pixel.c b/common/ppc/pixel.c
index 921c01e..37a4ffd 100644
--- a/common/ppc/pixel.c
+++ b/common/ppc/pixel.c
@@ -1630,6 +1630,108 @@ static int pixel_ssd_8x8_altivec ( uint8_t *pix1, int i_stride_pix1,
return sum;
}
+
+/****************************************************************************
+ * variance
+ ****************************************************************************/
+static int x264_pixel_var_16x16_altivec( uint8_t *pix, int i_stride )
+{
+ DECLARE_ALIGNED_16(uint32_t sum);
+ DECLARE_ALIGNED_16(uint32_t sqr);
+
+ LOAD_ZERO;
+ vec_u32_t sqr_v = zero_u32v;
+ vec_u32_t sum_v = zero_u32v;
+
+ int y;
+ for( y = 0; y < 16; ++y )
+ {
+ vec_u8_t pix_v = vec_ld(0, pix);
+
+ sum_v = vec_sum4s(pix_v, sum_v);
+
+ vec_u16_t mule = vec_mule(pix_v, pix_v);
+ vec_u16_t mulo = vec_mulo(pix_v, pix_v);
+
+ vec_u32_t mule_h = vec_u16_to_u32_h(mule);
+ vec_u32_t mule_l = vec_u16_to_u32_l(mule);
+
+ vec_u32_t mulo_h = vec_u16_to_u32_h(mulo);
+ vec_u32_t mulo_l = vec_u16_to_u32_l(mulo);
+
+ vec_u32_t mule_sqr = vec_add(mule_h, mule_l);
+ vec_u32_t mulo_sqr = vec_add(mulo_h, mulo_l);
+ vec_u32_t mul_sqr = vec_add(mule_sqr, mulo_sqr);
+ sqr_v = vec_add(sqr_v, mul_sqr);
+
+ pix += i_stride;
+ }
+ sum_v = vec_add( sum_v, vec_sld( sum_v, sum_v, 8 ) );
+ sum_v = vec_add( sum_v, vec_sld( sum_v, sum_v, 4 ) );
+ vec_ste(sum_v, 0, &sum);
+
+ sqr_v = vec_add( sqr_v, vec_sld( sqr_v, sqr_v, 8 ) );
+ sqr_v = vec_add( sqr_v, vec_sld( sqr_v, sqr_v, 4 ) );
+ vec_ste(sqr_v, 0, &sqr);
+
+ uint32_t var = sqr - (sum * sum >> 8);
+ return var;
+}
+
+static int x264_pixel_var_8x8_altivec( uint8_t *pix, int i_stride )
+{
+ DECLARE_ALIGNED_16(uint32_t sum);
+ DECLARE_ALIGNED_16(uint32_t sqr);
+
+ LOAD_ZERO;
+ vec_u32_t sqr_v = zero_u32v;
+ vec_u32_t sum_v = zero_u32v;
+
+ vec_u8_t perm0 = vec_lvsl( 0, pix );
+ vec_u8_t perm1 = vec_lvsl( 0, pix+i_stride );
+
+ int y;
+ for( y = 0; y < 8; y+=2 )
+ {
+ vec_u8_t pix0_v = vec_ld(0, pix);
+ vec_u8_t pix1_v = vec_ld(i_stride, pix);
+
+ pix0_v = vec_perm(pix0_v, pix0_v, perm0);
+ pix1_v = vec_perm(pix1_v, pix1_v, perm1);
+
+ vec_u8_t pix_v = vec_mergeh(pix0_v, pix1_v);
+
+ sum_v = vec_sum4s(pix_v, sum_v);
+
+ vec_u16_t mule = vec_mule(pix_v, pix_v);
+ vec_u16_t mulo = vec_mulo(pix_v, pix_v);
+
+ vec_u32_t mule_h = vec_u16_to_u32_h(mule);
+ vec_u32_t mule_l = vec_u16_to_u32_l(mule);
+
+ vec_u32_t mulo_h = vec_u16_to_u32_h(mulo);
+ vec_u32_t mulo_l = vec_u16_to_u32_l(mulo);
+
+ vec_u32_t mule_sqr = vec_add(mule_h, mule_l);
+ vec_u32_t mulo_sqr = vec_add(mulo_h, mulo_l);
+ vec_u32_t mul_sqr = vec_add(mule_sqr, mulo_sqr);
+ sqr_v = vec_add(sqr_v, mul_sqr);
+
+ pix += i_stride<<1;
+ }
+ sum_v = vec_add( sum_v, vec_sld( sum_v, sum_v, 8 ) );
+ sum_v = vec_add( sum_v, vec_sld( sum_v, sum_v, 4 ) );
+ vec_ste(sum_v, 0, &sum);
+
+ sqr_v = vec_add( sqr_v, vec_sld( sqr_v, sqr_v, 8 ) );
+ sqr_v = vec_add( sqr_v, vec_sld( sqr_v, sqr_v, 4 ) );
+ vec_ste(sqr_v, 0, &sqr);
+
+ uint32_t var = sqr - (sum * sum >> 6);
+ return var;
+}
+
+
/**********************************************************************
* SA8D routines: sum of 8x8 Hadamard transformed differences
**********************************************************************/
@@ -1856,5 +1960,8 @@ void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8_altivec;
+ pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_altivec;
+ pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_altivec;
+
pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
}
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index 7c87885..510ab26 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -87,6 +87,22 @@ typedef union {
#define vec_u16_to_u8(v) vec_pack( v, zero_u16v )
#define vec_s16_to_u8(v) vec_packsu( v, zero_s16v )
+
+/***********************************************************************
+ * 16 <-> 32 bits conversions
+ **********************************************************************/
+#define vec_u16_to_u32_h(v) (vec_u32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_u32_l(v) (vec_u32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_s32_h(v) (vec_s32_t) vec_mergeh( zero_u16v, (vec_u16_t) v )
+#define vec_u16_to_s32_l(v) (vec_s32_t) vec_mergel( zero_u16v, (vec_u16_t) v )
+
+#define vec_u16_to_u32(v) vec_u16_to_u32_h(v)
+#define vec_u16_to_s32(v) vec_u16_to_s32_h(v)
+
+#define vec_u32_to_u16(v) vec_pack( v, zero_u32v )
+#define vec_s32_to_u16(v) vec_packsu( v, zero_s32v )
+
+
/***********************************************************************
* PREP_LOAD: declares two vectors required to perform unaligned loads
* VEC_LOAD: loads n bytes from u8 * p into vector v of type t where o is from original src offset