[x264-devel] Re: [PATCH] Altivec optimizations for quant4x4 and quant4x4dc

Thu Aug 24 18:41:12 CEST 2006

Hi,

this post is a resend from another e-mail address; it looks like the
other mail got lost.

Guillaume POIRIER a écrit :

> Hi all,
> Please find in attachment the altivec-optimized
> version of quant 4x4(+dc) and quant 8x8.
> All now pass regression tests, so they are all
> activated.
> I've also found out that even though my 4x4 DC
> also passes regression tests, it doesn't seem to
> be called too often.
> It's likely because I had to restrict its use
> to the cases where maxQdc < (1<<15).
> That can hopefully be improved if I manage to
> understand why I can't use my 4x4dc with
> maxQdc < (1<<16) or more.


I still haven't found what the problem is. In practice, it doesn't
matter too much because with the default flat matrices, the optimized
version is used.


> I've also cleaned up the patch to put the unions as
> typedefs, and also fixed some indentation problems.

The attached patch now also features sub8x8_dct8 in Altivec in
addition to the previously optimized routines. This new routine still
needs to be cleaned up a bit, and the code needs to be factored into a
macro, but it works.

I've introduced another transpose8x8 routine, shamelessly taken from
FFmpeg: the one in ppccommon.h didn't do what I wanted, but maybe
it's just because I didn't know how to use it.

Last but not least, when I checked that my patch applied cleanly
to svn, I had to remove this hunk:

Index: common/macroblock.c
===================================================================

--- common/macroblock.c (revision 540)
+++ common/macroblock.c (working copy)
@@ -26,7 +26,7 @@

 #include "common.h"

-static const int dequant_mf[6][4][4] =
+static const int dequant_mf[6][4][4] __attribute__((__aligned__(16))) =
 {
     { {10, 13, 10, 13}, {13, 16, 13, 16}, {10, 13, 10, 13}, {13, 16, 13, 16} },
     { {11, 14, 11, 14}, {14, 18, 14, 18}, {11, 14, 11, 14}, {14, 18, 14, 18} },


Now x264 in svn doesn't have this line since r552:
https://trac.videolan.org/x264/changeset/552

I don't know what happened to this variable, but I need to ensure
that it's aligned (as my code assumes it is).
The diff indicates that the declaration of dequant_mf[6][4][4] was
just removed, and I can't find where it's declared, though grep does
show that it's still used, so I guess there's a declaration somewhere;
it's just that _I_ can't locate it.

Cheers,

Guillaume
-------------- next part --------------
Index: encoder/macroblock.c
===================================================================
--- encoder/macroblock.c	(revision 540)
+++ encoder/macroblock.c	(working copy)
@@ -31,7 +31,7 @@
 /* def_quant4_mf only for probe_skip; actual encoding uses matrices from set.c */
 /* FIXME this seems to make better decisions with cqm=jvt, but could screw up
  * with general custom matrices. */
-static const int def_quant4_mf[6][4][4] =
+static const int def_quant4_mf[6][4][4] __attribute__((__aligned__(16))) =
 {
     { { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 },
       { 13107, 8066, 13107, 8066 }, { 8066, 5243, 8066, 5243 } },
@@ -195,7 +195,7 @@
     int y = 4 * block_idx_y[idx];
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    int16_t dct4x4[4][4];
+    DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
 
     if( h->mb.b_lossless )
     {
@@ -223,7 +223,7 @@
     int y = 8 * (idx>>1);
     uint8_t *p_src = &h->mb.pic.p_fenc[0][x+y*FENC_STRIDE];
     uint8_t *p_dst = &h->mb.pic.p_fdec[0][x+y*FDEC_STRIDE];
-    int16_t dct8x8[8][8];
+    DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
 
     h->dctf.sub8x8_dct8( dct8x8, p_src, p_dst );
 
@@ -242,7 +242,7 @@
     uint8_t  *p_src = h->mb.pic.p_fenc[0];
     uint8_t  *p_dst = h->mb.pic.p_fdec[0];
 
-    int16_t dct4x4[16+1][4][4];
+    DECLARE_ALIGNED( int16_t, dct4x4[16+1][4][4], 16 );
 
     int i;
 
@@ -305,8 +305,8 @@
         uint8_t  *p_dst = h->mb.pic.p_fdec[1+ch];
         int i_decimate_score = 0;
 
-        int16_t dct2x2[2][2];
-        int16_t dct4x4[4][4][4];
+        DECLARE_ALIGNED( int16_t, dct2x2[2][2] , 16 );
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
 
         if( h->mb.b_lossless )
         {
@@ -494,7 +494,7 @@
         }
         else if( h->mb.b_transform_8x8 )
         {
-            int16_t dct8x8[4][8][8];
+            DECLARE_ALIGNED( int16_t, dct8x8[4][8][8], 16 );
             int nnz8x8[4] = {1,1,1,1};
             b_decimate &= !h->mb.b_trellis; // 8x8 trellis is inherently optimal decimation
             h->dctf.sub16x16_dct8( dct8x8, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
@@ -539,7 +539,7 @@
         }
         else
         {
-            int16_t dct4x4[16][4][4];
+            DECLARE_ALIGNED( int16_t, dct4x4[16][4][4], 16 );
             int nnz8x8[4] = {1,1,1,1};
             h->dctf.sub16x16_dct( dct4x4, h->mb.pic.p_fenc[0], h->mb.pic.p_fdec[0] );
 
@@ -869,7 +869,7 @@
 
     if( h->mb.b_transform_8x8 )
     {
-        int16_t dct8x8[8][8];
+        DECLARE_ALIGNED( int16_t, dct8x8[8][8], 16 );
         h->dctf.sub8x8_dct8( dct8x8, p_fenc, p_fdec );
         quant_8x8( h, dct8x8, h->quant8_mf[CQM_8PY], i_qp, 0 );
         scan_zigzag_8x8full( h->dct.luma8x8[i8], dct8x8 );
@@ -888,7 +888,7 @@
     else
     {
         int i4;
-        int16_t dct4x4[4][4][4];
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4][4], 16 );
         h->dctf.sub8x8_dct( dct4x4, p_fenc, p_fdec );
         quant_4x4( h, dct4x4[0], h->quant4_mf[CQM_4PY], i_qp, 0 );
         quant_4x4( h, dct4x4[1], h->quant4_mf[CQM_4PY], i_qp, 0 );
@@ -919,7 +919,7 @@
 
     for( ch = 0; ch < 2; ch++ )
     {
-        int16_t dct4x4[4][4];
+        DECLARE_ALIGNED( int16_t, dct4x4[4][4], 16 );
         p_fenc = h->mb.pic.p_fenc[1+ch] + (i8&1)*4 + (i8>>1)*4*FENC_STRIDE;
         p_fdec = h->mb.pic.p_fdec[1+ch] + (i8&1)*4 + (i8>>1)*4*FDEC_STRIDE;
 
Index: common/ppc/dct.h
===================================================================
--- common/ppc/dct.h	(revision 540)
+++ common/ppc/dct.h	(working copy)
@@ -30,5 +30,7 @@
         uint8_t *pix1, uint8_t *pix2 );
 void x264_sub16x16_dct_altivec( int16_t dct[16][4][4],
         uint8_t *pix1, uint8_t *pix2 );
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8],
+        uint8_t *pix1, uint8_t *pix2 );
 
 #endif
Index: common/ppc/ppccommon.h
===================================================================
--- common/ppc/ppccommon.h	(revision 540)
+++ common/ppc/ppccommon.h	(working copy)
@@ -184,6 +184,40 @@
     b6 = vec_mergeh( a3, a7 ); \
     b7 = vec_mergel( a3, a7 )
 
+/* Same as above, but transposes the 8x8 matrix held in a..h in place: the results are written back to the arguments */
+#define TRANSPOSE8(a,b,c,d,e,f,g,h) \
+do { \
+    __typeof__(a)  _A1, _B1, _C1, _D1, _E1, _F1, _G1, _H1; \
+    __typeof__(a)  _A2, _B2, _C2, _D2, _E2, _F2, _G2, _H2; \
+ \
+    _A1 = vec_mergeh (a, e); \
+    _B1 = vec_mergel (a, e); \
+    _C1 = vec_mergeh (b, f); \
+    _D1 = vec_mergel (b, f); \
+    _E1 = vec_mergeh (c, g); \
+    _F1 = vec_mergel (c, g); \
+    _G1 = vec_mergeh (d, h); \
+    _H1 = vec_mergel (d, h); \
+ \
+    _A2 = vec_mergeh (_A1, _E1); \
+    _B2 = vec_mergel (_A1, _E1); \
+    _C2 = vec_mergeh (_B1, _F1); \
+    _D2 = vec_mergel (_B1, _F1); \
+    _E2 = vec_mergeh (_C1, _G1); \
+    _F2 = vec_mergel (_C1, _G1); \
+    _G2 = vec_mergeh (_D1, _H1); \
+    _H2 = vec_mergel (_D1, _H1); \
+ \
+    a = vec_mergeh (_A2, _E2); \
+    b = vec_mergel (_A2, _E2); \
+    c = vec_mergeh (_B2, _F2); \
+    d = vec_mergel (_B2, _F2); \
+    e = vec_mergeh (_C2, _G2); \
+    f = vec_mergel (_C2, _G2); \
+    g = vec_mergeh (_D2, _H2); \
+    h = vec_mergel (_D2, _H2); \
+} while (0)
+
 /***********************************************************************
  * VEC_TRANSPOSE_4
  ***********************************************************************
Index: common/ppc/quant.c
===================================================================
--- common/ppc/quant.c	(revision 0)
+++ common/ppc/quant.c	(revision 0)
@@ -0,0 +1,184 @@
+/*****************************************************************************
+* quant.c: h264 encoder
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg at gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+*****************************************************************************/
+
+#ifdef HAVE_ALTIVEC_H
+#include <altivec.h>
+#endif
+
+#include "common/common.h"
+#include "ppccommon.h"
+#include "quant.h"            
+
+// quant of 16 coefs in place (2x unrolled, "pre-scheduled"): a,d = byte offsets into dct; b,c and e,f = byte offsets into quant_mf; |x| is computed as max(0-x, x)
+#define QUANT_16_U( a, b, c, d, e, f )                              \
+temp1v = vec_ld((a), *dct);                                         \
+temp2v = vec_ld((d), *dct);                                         \
+mfvA = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((b), *quant_mf), (vec_u32_t)vec_ld((c), *quant_mf));    \
+mfvB = (vec_u16_t) vec_packs((vec_u32_t)vec_ld((e), *quant_mf), (vec_u32_t)vec_ld((f), *quant_mf));    \
+mskA = vec_cmplt(temp1v, zerov);                                    \
+mskB = vec_cmplt(temp2v, zerov);                                    \
+coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);       \
+coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v);       \
+multEvenvA = vec_mule(coefvA, mfvA);                                \
+multOddvA = vec_mulo(coefvA, mfvA);                                 \
+multEvenvB = vec_mule(coefvB, mfvB);                                \
+multOddvB = vec_mulo(coefvB, mfvB);                                 \
+multEvenvA = vec_add(multEvenvA, fV);                               \
+multOddvA = vec_add(multOddvA, fV);                                 \
+multEvenvB = vec_add(multEvenvB, fV);                               \
+multOddvB = vec_add(multOddvB, fV);                                 \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
+multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
+multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA);                                     \
+temp2v = vec_xor(temp2v, mskB);                                     \
+temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
+vec_st(temp1v, (a), dct);                                           \
+temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
+vec_st(temp2v, (d), dct);
+                
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA, mfvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+    /* B-side temporaries used by the second half of the 2x-unrolled QUANT_16_U */
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB, mfvB;
+
+    vec_s16_t temp1v, temp2v;
+    /* copy the scalar shift count i_qbits into every 32-bit lane */
+    vect_int_u qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+    /* likewise for f, the term added to each product before the shift */
+    vect_sint_u f_u;
+    f_u.s[0]=f;
+
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    /* quantize dct in place: dct at byte offsets 0 and 16, quant_mf at byte offsets 0, 16, 32, 48 */
+    QUANT_16_U( 0, 0, 16, 16, 32, 48 );
+}
+
+// DC quant of 16 coefs in place (2x unrolled, "pre-scheduled"): a,d = byte offsets into dct; one multiplier vector mfv for all lanes; |x| is computed as max(0-x, x)
+#define QUANT_16_U_DC( a, d )                                       \
+temp1v = vec_ld((a), *dct);                                         \
+temp2v = vec_ld((d), *dct);                                         \
+mskA = vec_cmplt(temp1v, zerov);                                    \
+mskB = vec_cmplt(temp2v, zerov);                                    \
+coefvA = (vec_u16_t) vec_max(vec_sub(zerov, temp1v), temp1v);       \
+coefvB = (vec_u16_t) vec_max(vec_sub(zerov, temp2v), temp2v);       \
+multEvenvA = vec_mule(coefvA, mfv);                                 \
+multOddvA = vec_mulo(coefvA, mfv);                                  \
+multEvenvB = vec_mule(coefvB, mfv);                                 \
+multOddvB = vec_mulo(coefvB, mfv);                                  \
+multEvenvA = vec_add(multEvenvA, fV);                               \
+multOddvA = vec_add(multOddvA, fV);                                 \
+multEvenvB = vec_add(multEvenvB, fV);                               \
+multOddvB = vec_add(multOddvB, fV);                                 \
+multEvenvA = vec_sr(multEvenvA, i_qbitsv);                          \
+multOddvA = vec_sr(multOddvA, i_qbitsv);                            \
+multEvenvB = vec_sr(multEvenvB, i_qbitsv);                          \
+multOddvB = vec_sr(multOddvB, i_qbitsv);                            \
+temp1v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
+temp2v = (vec_s16_t) vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
+temp1v = vec_xor(temp1v, mskA);                                     \
+temp2v = vec_xor(temp2v, mskB);                                     \
+temp1v = vec_add(temp1v, vec_and(mskA, one));                       \
+vec_st(temp1v, (a), dct);                                           \
+temp2v = vec_add(temp2v, vec_and(mskB, one));                       \
+vec_st(temp2v, (d), dct);
+
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+    /* B-side temporaries used by the second half of the 2x-unrolled QUANT_16_U_DC */
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB;
+
+    vec_s16_t temp1v, temp2v;
+    /* mfv: the single multiplier i_quant_mf replicated into all eight 16-bit lanes */
+    vec_u16_t mfv;
+    vect_int_u mf_u;
+    mf_u.s[0]=i_quant_mf;
+    /* pack with UNSIGNED saturation: a signed vec_packs clamps at 32767 and corrupts multipliers >= (1<<15) */
+    mfv = (vec_u16_t) vec_packs( (vec_u32_t) vec_splat( mf_u.v, 0 ), (vec_u32_t) vec_splat( mf_u.v, 0 ) );
+
+    vect_int_u qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+
+    vect_sint_u f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    /* quantize dct in place, 8 coefs at byte offset 0 and 8 at byte offset 16; NOTE(review): untested on hardware */
+    QUANT_16_U_DC( 0, 16 );
+}
+
+
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f ) {
+    vector bool short mskA;
+    vec_s32_t i_qbitsv;
+    vec_u16_t coefvA;
+    vec_s32_t multEvenvA, multOddvA, mfvA;
+    vec_s16_t zerov, one;
+    vec_s32_t fV;
+    /* B-side temporaries used by the second half of the 2x-unrolled QUANT_16_U */
+    vector bool short mskB;
+    vec_u16_t coefvB;
+    vec_u32_t multEvenvB, multOddvB, mfvB;
+    
+    vec_s16_t temp1v, temp2v;
+    /* copy the scalar shift count i_qbits into every 32-bit lane */
+    vect_int_u qbits_u;
+    qbits_u.s[0]=i_qbits;
+    i_qbitsv = vec_splat(qbits_u.v, 0);
+    /* likewise for f, the term added to each product before the shift */
+    vect_sint_u f_u;
+    f_u.s[0]=f;
+    fV = vec_splat(f_u.v, 0);
+
+    zerov = vec_splat_s16(0);
+    one = vec_splat_s16(1);
+    /* 4 iterations x 16 coefs: each step advances 32 bytes in dct and 64 bytes in quant_mf */
+    int i;
+    for ( i=0; i<4; i++ ) {
+        QUANT_16_U( i*2*16, i*4*16, i*4*16+16, i*2*16+16, i*4*16+32, i*4*16+48 );
+    }
+}
+
Index: common/ppc/dct.c
===================================================================
--- common/ppc/dct.c	(revision 540)
+++ common/ppc/dct.c	(working copy)
@@ -5,6 +5,7 @@
  * $Id$
  *
  * Authors: Eric Petit <titer at m0k.org>
+ *          Guillaume Poirier <gpoirier at mplayerhq.hu>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -250,3 +251,130 @@
     VEC_STORE8_L( temp6v, dct[15][2] );
     VEC_STORE8_L( temp7v, dct[15][3] );
 }
+
+/****************************************************************************
+* 8x8 transform: two DCT8_1D passes separated by an in-place 8x8 transpose
+****************************************************************************/
+
+void x264_sub8x8_dct8_altivec( int16_t dct[8][8], uint8_t *pix1, uint8_t *pix2 )
+{
+    int i;                                      /* NOTE(review): not referenced in the code visible here */
+    DECLARE_ALIGNED( int16_t, tmp[8][8], 16);   /* NOTE(review): likewise not referenced below */
+    
+    PREP_DIFF;
+
+    vec_s16_t dct0v, dct1v, dct2v, dct3v;
+    vec_s16_t dct4v, dct5v, dct6v, dct7v;
+    /* presumably each call yields one row of pix1 - pix2 widened to int16 -- TODO confirm against VEC_DIFF_H */
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct0v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct1v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct2v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct3v );
+
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct4v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct5v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct6v );
+    VEC_DIFF_H( pix1, FENC_STRIDE, pix2, FDEC_STRIDE, 8, dct7v );
+
+
+    /* First DCT8_1D pass; the trailing comments give the scalar statement each vector op mirrors */
+    vec_s16_t s07v = vec_add( dct0v, dct7v);    // int s07 = SRC(0) + SRC(7);
+    vec_s16_t s16v = vec_add( dct1v, dct6v);    // int s16 = SRC(1) + SRC(6);
+    vec_s16_t s25v = vec_add( dct2v, dct5v);    // int s25 = SRC(2) + SRC(5);
+    vec_s16_t s34v = vec_add( dct3v, dct4v);    // int s34 = SRC(3) + SRC(4);
+
+
+    vec_s16_t a0v = vec_add(s07v, s34v);        // int a0 = s07 + s34;
+    vec_s16_t a1v = vec_add(s16v, s25v);        // int a1 = s16 + s25;
+    vec_s16_t a2v = vec_sub(s07v, s34v);        // int a2 = s07 - s34;
+    vec_s16_t a3v = vec_sub(s16v, s25v);        // int a3 = s16 - s25;
+
+    
+    vec_s16_t d07v = vec_sub( dct0v, dct7v);    // int d07 = SRC(0) - SRC(7);
+    vec_s16_t d16v = vec_sub( dct1v, dct6v);    // int d16 = SRC(1) - SRC(6);
+    vec_s16_t d25v = vec_sub( dct2v, dct5v);    // int d25 = SRC(2) - SRC(5);
+    vec_s16_t d34v = vec_sub( dct3v, dct4v);    // int d34 = SRC(3) - SRC(4);
+
+    vec_s16_t onev = vec_splat_s16(1);          // shift count for the >>1 terms
+
+     // int a4 = d16 + d25 + (d07 + (d07>>1));
+    vec_s16_t a4v = vec_add( vec_add(d16v, d25v),
+                             vec_add(d07v, vec_sra(d07v, onev)) );
+    // int a5 = d07 - d34 - (d25 + (d25>>1));
+    vec_s16_t a5v = vec_sub( vec_sub(d07v, d34v),
+                             vec_add(d25v, vec_sra(d25v, onev)) );
+    // int a6 = d07 + d34 - (d16 + (d16>>1));
+    vec_s16_t a6v = vec_sub( vec_add(d07v, d34v),
+                             vec_add(d16v, vec_sra(d16v, onev)) );
+    // const int a7 = d16 - d25 + (d34 + (d34>>1));
+    vec_s16_t a7v = vec_add( vec_sub(d16v, d25v),
+                             vec_add(d34v, vec_sra(d34v, onev)) );
+
+    vec_s16_t twov = vec_add( onev, onev );     // shift count for the >>2 terms
+    dct0v = vec_add( a0v, a1v );               // DST(0) =  a0 + a1;
+    dct1v = vec_add( a4v, vec_sra(a7v, twov) ); // DST(1) =  a4 + (a7>>2);
+    dct2v = vec_add( a2v, vec_sra(a3v, onev) ); // DST(2) =  a2 + (a3>>1);
+    dct3v = vec_add( a5v, vec_sra(a6v, twov) ); // DST(3) =  a5 + (a6>>2);
+    dct4v = vec_sub( a0v, a1v );               // DST(4) =  a0 - a1;
+    dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); // DST(5) =  a6 - (a5>>2);
+    dct6v = vec_sub( vec_sra(a2v, onev), a3v ); // DST(6) = (a2>>1) - a3 ;
+    dct7v = vec_sub( vec_sra(a4v, twov), a7v ); // DST(7) = (a4>>2) - a7 ;
+
+    /* Transpose in place so the second pass runs along the other axis */
+       
+    TRANSPOSE8(dct0v,dct1v,dct2v,dct3v,
+               dct4v,dct5v,dct6v,dct7v);
+
+    /* Second DCT8_1D pass: same sequence as above, reusing the same temporaries */
+    /* TODO: factor both passes into one macro */
+    s07v = vec_add( dct0v, dct7v);    // int s07 = SRC(0) + SRC(7);
+    s16v = vec_add( dct1v, dct6v);    // int s16 = SRC(1) + SRC(6);
+    s25v = vec_add( dct2v, dct5v);    // int s25 = SRC(2) + SRC(5);
+    s34v = vec_add( dct3v, dct4v);    // int s34 = SRC(3) + SRC(4);
+    
+    
+    a0v = vec_add(s07v, s34v);        // int a0 = s07 + s34;
+    a1v = vec_add(s16v, s25v);        // int a1 = s16 + s25;
+    a2v = vec_sub(s07v, s34v);        // int a2 = s07 - s34;
+    a3v = vec_sub(s16v, s25v);        // int a3 = s16 - s25;
+    
+    
+    d07v = vec_sub( dct0v, dct7v);    // int d07 = SRC(0) - SRC(7);
+    d16v = vec_sub( dct1v, dct6v);    // int d16 = SRC(1) - SRC(6);
+    d25v = vec_sub( dct2v, dct5v);    // int d25 = SRC(2) - SRC(5);
+    d34v = vec_sub( dct3v, dct4v);    // int d34 = SRC(3) - SRC(4);
+    
+    // int a4 = d16 + d25 + (d07 + (d07>>1));
+    a4v = vec_add( vec_add(d16v, d25v),
+                             vec_add(d07v, vec_sra(d07v, onev)) );
+    // int a5 = d07 - d34 - (d25 + (d25>>1));
+    a5v = vec_sub( vec_sub(d07v, d34v),
+                             vec_add(d25v, vec_sra(d25v, onev)) );
+    // int a6 = d07 + d34 - (d16 + (d16>>1));
+    a6v = vec_sub( vec_add(d07v, d34v),
+                             vec_add(d16v, vec_sra(d16v, onev)) );
+    // const int a7 = d16 - d25 + (d34 + (d34>>1));
+    a7v = vec_add( vec_sub(d16v, d25v),
+                             vec_add(d34v, vec_sra(d34v, onev)) );
+    
+    dct0v = vec_add( a0v, a1v );               // DST(0) =  a0 + a1;
+    dct1v = vec_add( a4v, vec_sra(a7v, twov) ); // DST(1) =  a4 + (a7>>2);
+    dct2v = vec_add( a2v, vec_sra(a3v, onev) ); // DST(2) =  a2 + (a3>>1);
+    dct3v = vec_add( a5v, vec_sra(a6v, twov) ); // DST(3) =  a5 + (a6>>2);
+    dct4v = vec_sub( a0v, a1v );               // DST(4) =  a0 - a1;
+    dct5v = vec_sub( a6v, vec_sra(a5v, twov) ); // DST(5) =  a6 - (a5>>2);
+    dct6v = vec_sub( vec_sra(a2v, onev), a3v ); // DST(6) = (a2>>1) - a3 ;
+    dct7v = vec_sub( vec_sra(a4v, twov), a7v ); // DST(7) = (a4>>2) - a7 ;
+    
+/* end of second pass */
+    /* store the eight result vectors back to back; vec_st offsets are in bytes (8 * sizeof(int16_t) = 16 each) */
+    vec_st( dct0v, 0, dct );
+    vec_st( dct1v, 16, dct );
+    vec_st( dct2v, 32, dct );
+    vec_st( dct3v, 48, dct );
+    
+    vec_st( dct4v, 64, dct );
+    vec_st( dct5v, 80, dct );
+    vec_st( dct6v, 96, dct );
+    vec_st( dct7v, 112, dct );
+}
Index: common/ppc/quant.h
===================================================================
--- common/ppc/quant.h	(revision 0)
+++ common/ppc/quant.h	(revision 0)
@@ -0,0 +1,39 @@
+/*****************************************************************************
+* quant.h: h264 encoder library
+*****************************************************************************
+* Authors: Guillaume Poirier <poirierg at gmail.com>
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
+*****************************************************************************/
+
+#ifndef _PPC_QUANT_H
+#define _PPC_QUANT_H 1
+/* Unions overlaying four scalar ints on one vector, so a scalar can be written to s[0] and then splatted from v */
+typedef union {
+  int s[4];
+  vector int v;
+} vect_int_u;
+/* Same layout with explicitly signed members */
+typedef union {
+  signed int s[4];
+  signed vector int v;
+} vect_sint_u;
+
+/* AltiVec quantizers; each takes the coefficient block dct, a multiplier (table or scalar), i_qbits and f */
+void x264_quant_4x4_altivec( int16_t dct[4][4], int quant_mf[4][4], int const i_qbits, int const f );
+void x264_quant_8x8_altivec( int16_t dct[8][8], int quant_mf[8][8], int const i_qbits, int const f );
+
+void x264_quant_4x4_dc_altivec( int16_t dct[4][4], int i_quant_mf, int const i_qbits, int const f );
+#endif
Index: common/quant.c
===================================================================
--- common/quant.c	(revision 540)
+++ common/quant.c	(working copy)
@@ -25,6 +25,9 @@
 #ifdef HAVE_MMXEXT
 #include "i386/quant.h"
 #endif
+#ifdef ARCH_PPC
+#   include "ppc/quant.h"
+#endif
 
 #define QUANT_ONE( coef, mf ) \
 { \
@@ -271,4 +274,39 @@
         pf->dequant_8x8 = x264_dequant_8x8_mmx;
     }
 #endif  /* HAVE_MMXEXT */
+    
+#ifdef ARCH_PPC
+    if( cpu&X264_CPU_ALTIVEC )
+    {
+        /* determine the biggest coefficient in all quant8_mf tables */
+        for( i = 0; i < 2*6*8*8; i++ )
+        {
+            int q = h->quant8_mf[0][0][0][i];
+            if( maxQ8 < q )
+                maxQ8 = q;
+        }
+
+        for( i = 0; i < 4*6*4*4; i++ )
+        {
+            int q = h->quant4_mf[0][0][0][i];
+            if( maxQ4 < q )
+                maxQ4 = q;
+            if( maxQdc < q && i%16 == 0 )
+                maxQdc = q;
+        }
+
+        if( maxQ8 < (1<<16) )
+        {
+            pf->quant_8x8_core = x264_quant_8x8_altivec;
+        }
+        if( maxQ4 < (1<<16) )
+        {
+            pf->quant_4x4_core = x264_quant_4x4_altivec;
+        }
+        if( maxQdc < (1<<15) )
+        {
+           pf->quant_4x4_dc_core = x264_quant_4x4_dc_altivec;
+        }
+    }
+#endif
 }
Index: common/dct.c
===================================================================
--- common/dct.c	(revision 540)
+++ common/dct.c	(working copy)
@@ -437,6 +437,8 @@
         dctf->sub4x4_dct    = x264_sub4x4_dct_altivec;
         dctf->sub8x8_dct    = x264_sub8x8_dct_altivec;
         dctf->sub16x16_dct  = x264_sub16x16_dct_altivec;
+
+        dctf->sub8x8_dct8   = x264_sub8x8_dct8_altivec;
     }
 #endif
 }
Index: tools/checkasm.c
===================================================================
--- tools/checkasm.c	(revision 540)
+++ tools/checkasm.c	(working copy)
@@ -437,8 +437,9 @@
     x264_quant_function_t qf_c;
     x264_quant_function_t qf_ref;
     x264_quant_function_t qf_a;
-    int16_t dct1[64], dct2[64];
-    uint8_t cqm_buf[64];
+    int16_t dct1[64]    __attribute__((__aligned__(16)));
+    int16_t dct2[64]    __attribute__((__aligned__(16)));
+    uint8_t cqm_buf[64] __attribute__((__aligned__(16)));
     int ret = 0, ok, used_asm;
     int oks[2] = {1,1}, used_asms[2] = {0,0};
     int i, i_cqm;
Index: Makefile
===================================================================
--- Makefile	(revision 540)
+++ Makefile	(working copy)
@@ -43,7 +43,8 @@
 
 # AltiVec optims
 ifeq ($(ARCH),PPC)
-SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c
+SRCS += common/ppc/mc.c common/ppc/pixel.c common/ppc/dct.c \
+	common/ppc/quant.c
 endif
 
 # VIS optims