[x264-devel] Re: [Griffith Rees <griff.rees at gmail.com>] Re: Re: Problems compiling xubuntu ppc

Guillaume POIRIER poirierg at gmail.com
Tue Jan 23 14:13:47 CET 2007


Hi,

(quotation reordered)
> On 1/22/07, Guillaume POIRIER <poirierg at gmail.com> wrote:

> > The attached patch should let you get further through the compilation
> > process, but since I haven't addressed all the issues yet, the build
> > still won't complete.
> >
> > If all goes "well", compilation should stop with this error (which is
> > expected):
> >
> > gcc-4.1.1 -O4 -ffast-math  -Wall -I. -D__X264__ -DHAVE_MALLOC_H
> > -maltivec -mabi=altivec -DARCH_PPC -DSYS_LINUX -DHAVE_PTHREAD -s
> > -fomit-frame-pointer   -c -o common/ppc/dct.o common/ppc/dct.c
> > common/ppc/dct.c: In function 'x264_add8x8_idct8_altivec':
> > common/ppc/dct.c:331: error: invalid parameter combination for AltiVec intrinsic
> > common/ppc/dct.c:331: error: invalid parameter combination for AltiVec intrinsic

[..]

On 1/23/07, Griffith Rees <griff.rees at gmail.com> wrote:
> Cool. I'll apply it as soon as I get back to my powerbook.


The attached patch lets the build get further still, so you should now
get this message instead:

gcc -O4 -ffast-math  -Wall -I. -D__X264__ -DHAVE_MALLOC_H -maltivec
-mabi=altivec -DARCH_PPC -DSYS_LINUX -DHAVE_PTHREAD -s
-fomit-frame-pointer   -c -o common/ppc/quant.o common/ppc/quant.c
common/ppc/quant.c: In function 'x264_quant_8x8_altivec':
common/ppc/quant.c:180: error: invalid parameter combination for AltiVec intrinsic
common/ppc/quant.c:180: error: invalid parameter combination for AltiVec intrinsic
common/ppc/quant.c:180: error: invalid parameter combination for AltiVec intrinsic
common/ppc/quant.c:180: error: invalid parameter combination for AltiVec intrinsic
common/ppc/quant.c:180: error: invalid parameter combination for AltiVec intrinsic
common/ppc/quant.c:180: error: invalid parameter combination for AltiVec intrinsic
make: *** [common/ppc/quant.o] Error 1
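
For the record, that diagnostic means GCC found no overload of the
intrinsic matching the argument types: the overloaded AltiVec operations
(vec_add, vec_sr, ...) only accept certain combinations of element types
and widths, and vec_sr() in particular wants a shift-count vector of the
same element width as the value vector. A minimal sketch of the failure
mode (illustration only, not x264 code; assumes gcc -maltivec
-mabi=altivec):

#include <altivec.h>

vector signed int shift_ok( vector signed int v )
{
    /* 32-bit values shifted by 32-bit unsigned counts: a valid overload */
    vector unsigned int counts = vec_splat_u32( 2 );
    return vec_sr( v, counts );
}

vector signed int shift_bad( vector signed int v )
{
    /* 16-bit counts for a 32-bit value vector match no overload, so GCC
       reports "invalid parameter combination for AltiVec intrinsic" */
    vector unsigned short counts = vec_splat_u16( 2 );
    return vec_sr( v, counts );
}

The casts in the attached patch, e.g. (vec_u32_t)i_qbitsv, are there to
get the operands back onto one of the defined overloads.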


I'll try to smash the remaining issues tonight.
I'm not even sure whether the changes I made broke the computed results,
so that will have to be checked and fixed too.
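
If someone wants to help with the checking part, a throwaway harness
along these lines would do. This is only a sketch: x264_quant_4x4_c is a
hypothetical name for the plain-C reference (the static quant_4x4() in
common/quant.c), so it would need to be exposed under some such name
first.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* hypothetical wrapper around the plain-C reference in common/quant.c */
void x264_quant_4x4_c( int16_t dct[4][4], int quant_mf[4][4], int i_qbits, int f );
/* the AltiVec version touched by the attached patch */
void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );

int main( void )
{
    /* vec_st() needs 16-byte alignment, so align what we hand to AltiVec */
    int16_t ref[4][4] __attribute__((aligned(16)));
    int16_t alt[4][4] __attribute__((aligned(16)));
    int mf[4][4];
    int i, j;

    for( i = 0; i < 4; i++ )
        for( j = 0; j < 4; j++ )
        {
            /* arbitrary but deterministic test input */
            ref[i][j] = alt[i][j] = (int16_t)( i * 131 + j * 17 - 200 );
            mf[i][j] = 16 + i + j;
        }

    x264_quant_4x4_c( ref, mf, 16, 1 << 15 );
    x264_quant_4x4_altivec( alt, mf, 16, 1 << 15 );

    if( memcmp( ref, alt, sizeof( ref ) ) )
    {
        printf( "quant_4x4: AltiVec output differs from C reference\n" );
        return 1;
    }
    printf( "quant_4x4: OK\n" );
    return 0;
}

Looping that over a few random inputs (and doing the same for the dc and
8x8 variants) should catch any results the casts may have changed.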


> > BTW, this bug report clearly shows that x264's user base on Linux is
> > very, very thin, as this code has been in x264's svn for several weeks.
>
> One thing though: I think this speaks more to the penetration of x264
> among Linux PPC users than to the Linux community as a whole.

Yep, that's what I implied. Since AltiVec is a PPC feature (and since
the offending code is AltiVec code), it seemed obvious that I was
talking about Linux on PPC ;-)


> Case in point:
> Cinelerra, a project I expect x264 developers have at least heard of
> (cvs.cinelerra.org), has been looking for an AltiVec maintainer for at
> least a year now (I think). I finally got enough time to offer my
> services, and I hope to make some contribution there.

Well, if you wanna contribute AltiVec code, you're certainly welcome
to contribute to x264 too, you know :-)

Guillaume
-------------- next part --------------
Index: common/ppc/quant.c
===================================================================
--- common/ppc/quant.c	(revision 620)
+++ common/ppc/quant.c	(working copy)
@@ -18,10 +18,6 @@
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/
 
-#ifdef HAVE_ALTIVEC_H
-#include <altivec.h>
-#endif
-
 #include "common/common.h"
 #include "ppccommon.h"
 #include "quant.h"            
@@ -40,36 +36,36 @@
 multOddvA = vec_mulo(coefvA, mfvA);                                          \
 multEvenvB = vec_mule(coefvB, mfvB);                                         \
 multOddvB = vec_mulo(coefvB, mfvB);                                          \
-multEvenvA = vec_adds(multEvenvA, fV);                                        \
-multOddvA = vec_adds(multOddvA, fV);                                          \
-multEvenvB = vec_adds(multEvenvB, fV);                                        \
-multOddvB = vec_adds(multOddvB, fV);                                          \
-multEvenvA = vec_sr(multEvenvA, i_qbitsv);                                   \
-multOddvA = vec_sr(multOddvA, i_qbitsv);                                     \
-multEvenvB = vec_sr(multEvenvB, i_qbitsv);                                   \
-multOddvB = vec_sr(multOddvB, i_qbitsv);                                     \
+multEvenvA = vec_add(multEvenvA, (vec_u32_t)fV);                             \
+multOddvA  = vec_add(multOddvA,  (vec_u32_t)fV);                             \
+multEvenvB = vec_add(multEvenvB, (vec_u32_t)fV);                             \
+multOddvB  = vec_add(multOddvB,  (vec_u32_t)fV);                             \
+multEvenvA = vec_sr(multEvenvA, (vec_u32_t)i_qbitsv);                        \
+multOddvA = vec_sr(multOddvA, (vec_u32_t)i_qbitsv);                          \
+multEvenvB = vec_sr(multEvenvB, (vec_u32_t)i_qbitsv);                        \
+multOddvB = vec_sr(multOddvB, (vec_u32_t)i_qbitsv);                          \
 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
 temp1v = vec_xor(temp1v, mskA);                                              \
 temp2v = vec_xor(temp2v, mskB);                                              \
-temp1v = vec_adds(temp1v, vec_and(mskA, one));                                \
-vec_st(temp1v, (dct0), dct);                                                 \
-temp2v = vec_adds(temp2v, vec_and(mskB, one));                                \
-vec_st(temp2v, (dct1), dct);
+temp1v = vec_adds(temp1v, vec_and(mskA, one));                               \
+vec_st(temp1v, (dct0), (int16_t*)dct);                                       \
+temp2v = vec_adds(temp2v, vec_and(mskB, one));                               \
+vec_st(temp2v, (dct1), (int16_t*)dct);
                 
 void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
     vector bool short mskA;
     vec_s32_t i_qbitsv;
     vec_u16_t coefvA;
     vec_u32_t multEvenvA, multOddvA;
-    vec_u32_t mfvA;
+    vec_u16_t mfvA;
     vec_s16_t zerov, one;
     vec_s32_t fV;
 
     vector bool short mskB;
     vec_u16_t coefvB;
     vec_u32_t multEvenvB, multOddvB;
-    vec_u32_t mfvB;
+    vec_u16_t mfvB;
 
     vec_s16_t temp1v, temp2v;
 
@@ -100,22 +96,22 @@
 multOddvA = vec_mulo(coefvA, mfv);                              \
 multEvenvB = vec_mule(coefvB, mfv);                             \
 multOddvB = vec_mulo(coefvB, mfv);                              \
-multEvenvA = vec_add(multEvenvA, fV);                           \
-multOddvA = vec_add(multOddvA, fV);                             \
-multEvenvB = vec_add(multEvenvB, fV);                           \
-multOddvB = vec_add(multOddvB, fV);                             \
-multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
-multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
-multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
-multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
+multEvenvA = vec_add(multEvenvA, (vec_u32_t)fV);                \
+multOddvA  = vec_add(multOddvA,  (vec_u32_t)fV);                \
+multEvenvB = vec_add(multEvenvB, (vec_u32_t)fV);                \
+multOddvB  = vec_add(multOddvB,  (vec_u32_t)fV);                \
+multEvenvA = vec_sr(multEvenvA, (vec_u32_t)i_qbitsv);           \
+multOddvA = vec_sr(multOddvA, (vec_u32_t)i_qbitsv);             \
+multEvenvB = vec_sr(multEvenvB, (vec_u32_t)i_qbitsv);           \
+multOddvB = vec_sr(multOddvB, (vec_u32_t)i_qbitsv);             \
 temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
 temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
 temp1v = vec_xor(temp1v, mskA);                                 \
 temp2v = vec_xor(temp2v, mskB);                                 \
 temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
-vec_st(temp1v, (dct0), dct);                                    \
+vec_st(temp1v, (dct0), (int16_t*)dct);                          \
 temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
-vec_st(temp2v, (dct1), dct);
+vec_st(temp2v, (dct1), (int16_t*)dct);
 
 
 void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
@@ -132,7 +128,7 @@
 
     vec_s16_t temp1v, temp2v;
 
-    vec_u32_t mfv;
+    vec_u16_t mfv;
     vect_int_u mf_u;
     mf_u.s[0]=i_quant_mf;
     mfv = vec_splat( mf_u.v, 0 );
Index: common/ppc/dct.c
===================================================================
--- common/ppc/dct.c	(revision 620)
+++ common/ppc/dct.c	(working copy)
@@ -60,8 +60,8 @@
     permHighv = (vec_u8_t) CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17);
     VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
 
-    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,  (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, (int16_t*)dct);
 }
 
 void x264_sub8x8_dct_altivec( int16_t dct[4][4][4],
@@ -94,14 +94,14 @@
     VEC_DCT( dct0v, dct1v, dct2v, dct3v, tmp0v, tmp1v, tmp2v, tmp3v );
     VEC_DCT( dct4v, dct5v, dct6v, dct7v, tmp4v, tmp5v, tmp6v, tmp7v );
 
-    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0, dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16, dct);
-    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32, dct);
-    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48, dct);
-    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64, dct);
-    vec_st(vec_perm(tmp2v, tmp3v, permLowv), 80, dct);
-    vec_st(vec_perm(tmp4v, tmp5v, permLowv), 96, dct);
-    vec_st(vec_perm(tmp6v, tmp7v, permLowv), 112, dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permHighv), 0,   (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permHighv), 16,  (int16_t*)dct);
+    vec_st(vec_perm(tmp4v, tmp5v, permHighv), 32,  (int16_t*)dct);
+    vec_st(vec_perm(tmp6v, tmp7v, permHighv), 48,  (int16_t*)dct);
+    vec_st(vec_perm(tmp0v, tmp1v, permLowv),  64,  (int16_t*)dct);
+    vec_st(vec_perm(tmp2v, tmp3v, permLowv),  80,  (int16_t*)dct);
+    vec_st(vec_perm(tmp4v, tmp5v, permLowv),  96,  (int16_t*)dct);
+    vec_st(vec_perm(tmp6v, tmp7v, permLowv),  112, (int16_t*)dct);
 }
 
 void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
@@ -311,8 +311,8 @@
 
 void x264_add8x8_idct8_altivec( uint8_t *dst, int16_t dct[8][8] )
 {
-    vec_s16_t onev = vec_splat_s16(1);
-    vec_s16_t twov = vec_splat_s16(2);
+    vec_u16_t onev = vec_splat_s16(1);
+    vec_u16_t twov = vec_splat_s16(2);
 
     dct[0][0] += 32; // rounding for the >>6 at the end
 
@@ -341,7 +341,7 @@
 
     vec_u8_t perm_ldv = vec_lvsl(0, dst);
     vec_u8_t perm_stv = vec_lvsr(8, dst);
-    vec_s16_t sixv = vec_splat_s16(6);
+    vec_u16_t sixv = vec_splat_s16(6);
     const vec_u8_t sel = (vec_u8_t) CV(0,0,0,0,0,0,0,0,-1,-1,-1,-1,-1,-1,-1,-1);
     LOAD_ZERO;
 
Index: common/ppc/quant.h
===================================================================
--- common/ppc/quant.h	(revision 620)
+++ common/ppc/quant.h	(working copy)
@@ -18,6 +18,10 @@
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
 *****************************************************************************/
 
+#ifdef SYS_LINUX
+#include <altivec.h>
+#endif
+
 #ifndef _PPC_QUANT_H
 #define _PPC_QUANT_H 1
 

