[x264-devel] ppc: AltiVec plane_copy_deinterleave_rgb
Alexandra Hájková
git at videolan.org
Tue Jan 24 21:14:13 CET 2017
x264 | branch: master | Alexandra Hájková <alexandra at khirnov.net> | Wed Dec 7 19:48:02 2016 +0000| [00f1670087db1b025a8088289de8938bf88a0d8b] | committer: Henrik Gramner
ppc: AltiVec plane_copy_deinterleave_rgb
Also add some missing vector types in ppccommon.h
> http://git.videolan.org/gitweb.cgi/x264.git/?a=commit;h=00f1670087db1b025a8088289de8938bf88a0d8b
---
common/ppc/mc.c | 67 ++++++++++++++++++++++++++++++++++++++++++++++++++
common/ppc/ppccommon.h | 24 ++++++++++++++++++
2 files changed, 91 insertions(+)
diff --git a/common/ppc/mc.c b/common/ppc/mc.c
index 08998f2..5737ec6 100644
--- a/common/ppc/mc.c
+++ b/common/ppc/mc.c
@@ -198,6 +198,70 @@ void x264_store_interleave_chroma_altivec( uint8_t *dst, intptr_t i_dst,
}
}
+#if HAVE_VSX // the routine below relies on vec_vsx_ld and the 64-bit vec_u64_t type
+void x264_plane_copy_deinterleave_rgb_altivec( uint8_t *dsta, intptr_t i_dsta, // Splits packed pw-byte pixels from src into planes dsta/dstb/dstc (bytes 0/1/2 of each pixel).
+ uint8_t *dstb, intptr_t i_dstb, // i_dsta/i_dstb/i_dstc: per-row pointer increments of the three outputs
+ uint8_t *dstc, intptr_t i_dstc,
+ uint8_t *src, intptr_t i_src, // i_src: per-row pointer increment of the packed input
+ int pw, int w, int h ) // pw: bytes per source pixel (3, anything else is handled as 4); w: pixels per row; h: rows
+{
+ if( pw == 3 ) // 3 bytes per pixel: 48 source bytes yield 16 bytes for each plane
+ {
+ const vec_u8_t mask[4] = { // vec_perm selectors: 0x00-0x0F index the first operand, 0x10-0x1F the second
+ { 0x00, 0x03, 0x06, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x01, 0x04, 0x07, 0x0A, 0x0D, 0x10, 0x13, 0x16 }, // mask[0]: a0-a7, b0-b7 out of source bytes 0-31
+ { 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0x09, 0x0C, 0x0F, 0x12, 0x15, 0x18, 0x1B, 0x1E }, // mask[1]: a8-a15, b8-b15 out of source bytes 16-47
+ { 0x02, 0x05, 0x08, 0x0B, 0x0E, 0x11, 0x14, 0x17, 0x1A, 0x1D, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }, // mask[2]: c0-c9 out of source bytes 0-31; the 0xFF lanes are replaced by the mask[3] permute
+ { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x10, 0x13, 0x16, 0x19, 0x1C, 0x1F } // mask[3]: keep lanes 0-9, take c10-c15 from source bytes 32-47
+ };
+
+ for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src ) // one row per pass; every pointer advances by its own stride
+ {
+ for( int x = 0; x < w; x += 16 ) // 16 pixels per pass; the last pass still covers a full 16 when w is not a multiple of 16
+ {
+ vec_u8_t srcv1 = vec_vsx_ld( 3 * x, src ); // source bytes 0-15 of this 48-byte group
+ vec_u8_t srcv2 = vec_vsx_ld( 3 * x + 16, src ); // source bytes 16-31
+ vec_u8_t srcv3 = vec_vsx_ld( 3 * x + 32, src ); // source bytes 32-47
+ vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
+ vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv2, srcv3, mask[1] ); // a8 a9 a10 a11 a12 a13 a14 a15 b8 b9 b10 b11 b12 b13 b14 b15
+ vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta ); // first 64-bit halves -> a0-a15; NOTE(review): vec_st is the aligned AltiVec store, dst rows presumably 16-byte aligned - confirm
+ vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb ); // second 64-bit halves -> b0-b15
+
+ srcv1 = vec_perm( srcv1, srcv2, mask[2] ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9
+ srcv1 = vec_perm( srcv1, srcv3, mask[3] ); // c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
+ vec_st( srcv1, x, dstc );
+ }
+ }
+ }
+ else // any other pw is read as 4 bytes per pixel; byte 3 of each pixel is never stored
+ {
+ const vec_u8_t mask[2] = {
+ { 0x00, 0x04, 0x08, 0x0C, 0x10, 0x14, 0x18, 0x1C, 0x01, 0x05, 0x09, 0x0D, 0x11, 0x15, 0x19, 0x1D }, // mask[0]: bytes 0 and 1 of eight pixels (a0-a7, b0-b7)
+ { 0x02, 0x06, 0x0A, 0x0E, 0x12, 0x16, 0x1A, 0x1E, 0x03, 0x07, 0x0B, 0x0F, 0x13, 0x17, 0x1B, 0x1F } // mask[1]: bytes 2 and 3 of eight pixels (c0-c7, then the unused fourth bytes)
+ };
+
+ for( int y = 0; y < h; y++, dsta += i_dsta, dstb += i_dstb, dstc += i_dstc, src += i_src ) // one row per pass; every pointer advances by its own stride
+ {
+ for( int x = 0; x < w; x += 16 ) // 16 pixels = 64 source bytes per pass
+ {
+ vec_u8_t srcv1 = vec_vsx_ld( 4 * x, src ); // pixels 0-3
+ vec_u8_t srcv2 = vec_vsx_ld( 4 * x + 16, src ); // pixels 4-7
+ vec_u8_t srcv3 = vec_vsx_ld( 4 * x + 32, src ); // pixels 8-11
+ vec_u8_t srcv4 = vec_vsx_ld( 4 * x + 48, src ); // pixels 12-15
+
+ vec_u64_t tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[0] ); // a0 a1 a2 a3 a4 a5 a6 a7 b0 b1 b2 b3 b4 b5 b6 b7
+ vec_u64_t tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[0] ); // a8 a9 a10 a11 a12 a13 a14 a15 b8 b9 b10 b11 b12 b13 b14 b15
+ vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dsta ); // first 64-bit halves -> a0-a15
+ vec_st( (vec_u8_t)vec_mergel( tmp1, tmp2 ), x, dstb ); // second 64-bit halves -> b0-b15
+
+ tmp1 = (vec_u64_t)vec_perm( srcv1, srcv2, mask[1] ); // c0 c1 c2 c3 c4 c5 c6 c7, then the fourth bytes of pixels 0-7
+ tmp2 = (vec_u64_t)vec_perm( srcv3, srcv4, mask[1] ); // c8 c9 c10 c11 c12 c13 c14 c15, then the fourth bytes of pixels 8-15
+ vec_st( (vec_u8_t)vec_mergeh( tmp1, tmp2 ), x, dstc ); // only the c halves are stored; the fourth bytes are dropped
+ }
+ }
+ }
+}
+#endif // HAVE_VSX
+
static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride,
uint8_t *src[4], intptr_t i_src_stride,
int mvx, int mvy,
@@ -1232,5 +1296,8 @@ void x264_mc_init_altivec( x264_mc_functions_t *pf )
pf->plane_copy_swap = x264_plane_copy_swap_altivec;
pf->plane_copy_interleave = x264_plane_copy_interleave_altivec;
pf->store_interleave_chroma = x264_store_interleave_chroma_altivec;
+#if HAVE_VSX
+ pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_altivec;
+#endif // HAVE_VSX
#endif // !HIGH_BIT_DEPTH
}
diff --git a/common/ppc/ppccommon.h b/common/ppc/ppccommon.h
index bfc3c7d..855298b 100644
--- a/common/ppc/ppccommon.h
+++ b/common/ppc/ppccommon.h
@@ -45,6 +45,20 @@
#define vec_s16_t vector signed short
#define vec_u32_t vector unsigned int
#define vec_s32_t vector signed int
+#if HAVE_VSX // the 64-bit element vector types are only defined for VSX builds
+#define vec_u64_t vector unsigned long long // vector of unsigned 64-bit elements
+#define vec_s64_t vector signed long long // vector of signed 64-bit elements
+
+typedef union { // overlays a vec_u64_t with a scalar array for per-element access
+ uint64_t s[2];
+ vec_u64_t v;
+} vec_u64_u;
+
+typedef union { // overlays a vec_s64_t with a scalar array for per-element access
+ int64_t s[2];
+ vec_s64_t v;
+} vec_s64_u;
+#endif // HAVE_VSX
typedef union {
uint32_t s[4];
@@ -52,6 +66,11 @@ typedef union {
} vec_u32_u;
typedef union {
+ int32_t s[4]; // scalar view of the four signed 32-bit elements
+ vec_s32_t v; // the same storage as one vector
+} vec_s32_u;
+
+typedef union { // overlays a vec_u16_t with eight unsigned 16-bit scalars
uint16_t s[8];
vec_u16_t v;
} vec_u16_u;
@@ -66,6 +85,11 @@ typedef union {
vec_u8_t v;
} vec_u8_u;
+typedef union { // overlays a vec_s8_t with a scalar array for per-element access
+ int8_t s[16]; // scalar view of the sixteen signed 8-bit elements
+ vec_s8_t v; // the same storage as one vector
+} vec_s8_u;
+
/***********************************************************************
* Null vector
**********************************************************************/
More information about the x264-devel mailing list